provenant/scanner/
process.rs

1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::compiled_binary::{
3    is_supported_compiled_binary_format, try_parse_compiled_bytes,
4};
5use crate::parsers::try_parse_file;
6use crate::parsers::windows_executable::try_parse_windows_executable_bytes;
7
8use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
9use crate::utils::text::{
10    remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
11};
12use anyhow::Error;
13use rayon::prelude::*;
14use std::collections::HashSet;
15use std::fs::{self, File};
16use std::io::{Read, Write};
17use std::path::Path;
18use std::sync::Arc;
19use std::time::{Duration, Instant};
20
21use crate::copyright::{
22    self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
23};
24use crate::finder::{self, DetectionConfig};
25use crate::license_detection::PositionSet;
26use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
27use crate::license_detection::query::Query;
28use crate::models::{
29    Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
30    LineNumber, Match, OutputEmail, OutputURL, Sha256Digest,
31};
32use crate::parsers::utils::split_name_email;
33use crate::progress::ScanProgress;
34use crate::scanner::collect::CollectedPaths;
35use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
36use crate::utils::file::{
37    ExtractedTextKind, augment_license_detection_text, classify_file_info,
38    extract_text_for_detection_with_diagnostics, get_creation_date,
39};
40use crate::utils::generated::generated_code_hints_from_bytes;
41use tempfile::TempDir;
42
/// How many processed file entries a scan keeps in memory.
///
/// The `Display` form is a single integer: `0` for [`MemoryMode::CollectFirst`],
/// `-1` for [`MemoryMode::StreamUnlimited`], and the wrapped count for
/// [`MemoryMode::Limit`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemoryMode {
    /// Process everything and keep every result in memory.
    CollectFirst,
    /// Process in fixed-size chunks and spill every chunk to disk.
    StreamUnlimited,
    /// Keep at most this many entries in memory; spill the rest to disk.
    Limit(usize),
}

impl std::fmt::Display for MemoryMode {
    /// Writes the integer encoding of the mode; formatter padding is not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match *self {
            Self::Limit(count) => write!(f, "{count}"),
            Self::StreamUnlimited => f.write_str("-1"),
            Self::CollectFirst => f.write_str("0"),
        }
    }
}
59
/// Begin/end marker pairs delimiting PEM-encoded certificate blocks, covering
/// both the plain `CERTIFICATE` and the `TRUSTED CERTIFICATE` labels.
///
/// Each tuple is `(begin_marker, end_marker)`.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
67
/// Byte cap (128 KiB) applied by `cap_non_source_json_license_text` to the text
/// handed to license detection for JSON-like files that are not source code.
const LARGE_NON_SOURCE_JSON_LICENSE_TEXT_BYTES: usize = 128 * 1024;
69
70pub fn process_collected(
71    collected: &CollectedPaths,
72    progress: Arc<ScanProgress>,
73    license_engine: Option<Arc<LicenseDetectionEngine>>,
74    license_options: LicenseScanOptions,
75    text_options: &TextDetectionOptions,
76) -> ProcessResult {
77    let mut all_files: Vec<FileInfo> = collected
78        .files
79        .par_iter()
80        .map(|(path, metadata)| {
81            let file_entry = process_file(
82                path,
83                metadata,
84                progress.as_ref(),
85                license_engine.clone(),
86                license_options,
87                text_options,
88            );
89            progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
90            file_entry
91        })
92        .collect();
93
94    for (path, metadata) in &collected.directories {
95        all_files.push(process_directory(
96            path,
97            metadata,
98            text_options.collect_info,
99            license_engine.is_some(),
100        ));
101    }
102
103    ProcessResult {
104        files: all_files,
105        excluded_count: collected.excluded_count,
106    }
107}
108
109pub fn process_collected_sequential(
110    collected: &CollectedPaths,
111    progress: Arc<ScanProgress>,
112    license_engine: Option<Arc<LicenseDetectionEngine>>,
113    license_options: LicenseScanOptions,
114    text_options: &TextDetectionOptions,
115) -> ProcessResult {
116    let mut all_files: Vec<FileInfo> =
117        Vec::with_capacity(collected.files.len() + collected.directories.len());
118
119    for (path, metadata) in &collected.files {
120        let file_entry = process_file(
121            path,
122            metadata,
123            progress.as_ref(),
124            license_engine.clone(),
125            license_options,
126            text_options,
127        );
128        progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
129        all_files.push(file_entry);
130    }
131
132    for (path, metadata) in &collected.directories {
133        all_files.push(process_directory(
134            path,
135            metadata,
136            text_options.collect_info,
137            license_engine.is_some(),
138        ));
139    }
140
141    ProcessResult {
142        files: all_files,
143        excluded_count: collected.excluded_count,
144    }
145}
146
147pub fn process_collected_with_memory_limit(
148    collected: &CollectedPaths,
149    progress: Arc<ScanProgress>,
150    license_engine: Option<Arc<LicenseDetectionEngine>>,
151    license_options: LicenseScanOptions,
152    text_options: &TextDetectionOptions,
153    max_in_memory: MemoryMode,
154) -> ProcessResult {
155    match max_in_memory {
156        MemoryMode::CollectFirst => {
157            return process_collected(
158                collected,
159                progress,
160                license_engine,
161                license_options,
162                text_options,
163            );
164        }
165        MemoryMode::StreamUnlimited => {}
166        MemoryMode::Limit(_) => {}
167    }
168
169    let (memory_limit, chunk_size) = match max_in_memory {
170        MemoryMode::CollectFirst => unreachable!(),
171        MemoryMode::StreamUnlimited => (0, 256),
172        MemoryMode::Limit(n) => (n, n.max(1)),
173    };
174
175    let mut retained_files = Vec::new();
176    let mut spill_store = None;
177
178    for chunk in collected.files.chunks(chunk_size) {
179        let processed_chunk: Vec<FileInfo> = chunk
180            .par_iter()
181            .map(|(path, metadata)| {
182                let file_entry = process_file(
183                    path,
184                    metadata,
185                    progress.as_ref(),
186                    license_engine.clone(),
187                    license_options,
188                    text_options,
189                );
190                progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
191                file_entry
192            })
193            .collect();
194
195        retain_or_spill_chunk(
196            processed_chunk,
197            &mut retained_files,
198            &mut spill_store,
199            memory_limit,
200        );
201    }
202
203    for (path, metadata) in &collected.directories {
204        let entry = process_directory(
205            path,
206            metadata,
207            text_options.collect_info,
208            license_engine.is_some(),
209        );
210        retain_or_spill_chunk(
211            vec![entry],
212            &mut retained_files,
213            &mut spill_store,
214            memory_limit,
215        );
216    }
217
218    if let Some(spill_store) = spill_store {
219        retained_files.extend(spill_store.load_all());
220    }
221
222    ProcessResult {
223        files: retained_files,
224        excluded_count: collected.excluded_count,
225    }
226}
227
228pub fn process_collected_with_memory_limit_sequential(
229    collected: &CollectedPaths,
230    progress: Arc<ScanProgress>,
231    license_engine: Option<Arc<LicenseDetectionEngine>>,
232    license_options: LicenseScanOptions,
233    text_options: &TextDetectionOptions,
234    max_in_memory: MemoryMode,
235) -> ProcessResult {
236    match max_in_memory {
237        MemoryMode::CollectFirst => {
238            return process_collected_sequential(
239                collected,
240                progress,
241                license_engine,
242                license_options,
243                text_options,
244            );
245        }
246        MemoryMode::StreamUnlimited => {}
247        MemoryMode::Limit(_) => {}
248    }
249
250    let (memory_limit, chunk_size) = match max_in_memory {
251        MemoryMode::CollectFirst => unreachable!(),
252        MemoryMode::StreamUnlimited => (0, 256),
253        MemoryMode::Limit(n) => (n, n.max(1)),
254    };
255
256    let mut retained_files = Vec::new();
257    let mut spill_store = None;
258
259    for chunk in collected.files.chunks(chunk_size) {
260        let mut processed_chunk: Vec<FileInfo> = Vec::with_capacity(chunk.len());
261        for (path, metadata) in chunk {
262            let file_entry = process_file(
263                path,
264                metadata,
265                progress.as_ref(),
266                license_engine.clone(),
267                license_options,
268                text_options,
269            );
270            progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
271            processed_chunk.push(file_entry);
272        }
273
274        retain_or_spill_chunk(
275            processed_chunk,
276            &mut retained_files,
277            &mut spill_store,
278            memory_limit,
279        );
280    }
281
282    for (path, metadata) in &collected.directories {
283        let entry = process_directory(
284            path,
285            metadata,
286            text_options.collect_info,
287            license_engine.is_some(),
288        );
289        retain_or_spill_chunk(
290            vec![entry],
291            &mut retained_files,
292            &mut spill_store,
293            memory_limit,
294        );
295    }
296
297    if let Some(spill_store) = spill_store {
298        retained_files.extend(spill_store.load_all());
299    }
300
301    ProcessResult {
302        files: retained_files,
303        excluded_count: collected.excluded_count,
304    }
305}
306
307fn retain_or_spill_chunk(
308    chunk: Vec<FileInfo>,
309    retained_files: &mut Vec<FileInfo>,
310    spill_store: &mut Option<FileInfoSpillStore>,
311    memory_limit: usize,
312) {
313    if memory_limit == 0 {
314        spill_store
315            .get_or_insert_with(FileInfoSpillStore::new)
316            .spill(chunk);
317        return;
318    }
319
320    let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
321    if remaining_capacity >= chunk.len() && spill_store.is_none() {
322        retained_files.extend(chunk);
323        return;
324    }
325
326    let mut chunk_iter = chunk.into_iter();
327    retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
328    let overflow: Vec<FileInfo> = chunk_iter.collect();
329    if !overflow.is_empty() {
330        spill_store
331            .get_or_insert_with(FileInfoSpillStore::new)
332            .spill(overflow);
333    }
334}
335
336struct FileInfoSpillStore {
337    temp_dir: TempDir,
338    batch_index: usize,
339}
340
341impl FileInfoSpillStore {
342    fn new() -> Self {
343        Self {
344            temp_dir: TempDir::new().expect("create spill dir"),
345            batch_index: 0,
346        }
347    }
348
349    fn spill(&mut self, files: Vec<FileInfo>) {
350        let path = self
351            .temp_dir
352            .path()
353            .join(format!("batch-{:06}.json.zst", self.batch_index));
354        self.batch_index += 1;
355
356        let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
357        let file = File::create(path).expect("create spill batch file");
358        let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
359        encoder
360            .write_all(&payload)
361            .expect("write spilled file batch");
362        encoder.finish().expect("finish spill encoder");
363    }
364
365    fn load_all(self) -> Vec<FileInfo> {
366        let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
367            .expect("read spill dir")
368            .filter_map(Result::ok)
369            .map(|entry| entry.path())
370            .collect();
371        paths.sort();
372
373        let mut files = Vec::new();
374        for path in paths {
375            let file = File::open(path).expect("open spill batch");
376            let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
377            let mut payload = Vec::new();
378            decoder.read_to_end(&mut payload).expect("read spill batch");
379            let mut batch: Vec<FileInfo> =
380                serde_json::from_slice(&payload).expect("decode spilled file batch");
381            files.append(&mut batch);
382        }
383        files
384    }
385}
386
387fn process_file(
388    path: &Path,
389    metadata: &fs::Metadata,
390    progress: &ScanProgress,
391    license_engine: Option<Arc<LicenseDetectionEngine>>,
392    license_options: LicenseScanOptions,
393    text_options: &TextDetectionOptions,
394) -> FileInfo {
395    let mut scan_errors: Vec<String> = vec![];
396    let mut file_info_builder = FileInfoBuilder::default();
397    let license_enabled = license_engine.is_some();
398
399    let started = Instant::now();
400
401    let mut generated_flag = None;
402    let mut is_source_file = false;
403    match extract_information_from_content(
404        &mut file_info_builder,
405        &mut scan_errors,
406        path,
407        progress,
408        license_engine,
409        license_options,
410        text_options,
411    ) {
412        Ok((is_generated, sha256, is_source)) => {
413            generated_flag = is_generated;
414            is_source_file = is_source;
415            let _ = sha256;
416        }
417        Err(e) => scan_errors.push(e.to_string()),
418    };
419
420    maybe_record_processing_timeout(&mut scan_errors, started, text_options.timeout_seconds);
421
422    let mut file_info = file_info_builder
423        .name(path.file_name().unwrap().to_string_lossy().to_string())
424        .base_name(
425            path.file_stem()
426                .unwrap_or_default()
427                .to_string_lossy()
428                .to_string(),
429        )
430        .extension(
431            path.extension()
432                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
433        )
434        .path(path.to_string_lossy().to_string())
435        .file_type(FileType::File)
436        .size(metadata.len())
437        .date(
438            text_options
439                .collect_info
440                .then(|| get_creation_date(metadata))
441                .flatten(),
442        )
443        .scan_errors(scan_errors)
444        .build()
445        .expect("FileInformationBuild not completely initialized");
446
447    if text_options.collect_info {
448        file_info.is_source = Some(is_source_file);
449    }
450
451    if file_info.programming_language.as_deref() == Some("Go")
452        && is_go_non_production_source(path).unwrap_or(false)
453    {
454        file_info.is_source = Some(false);
455    }
456
457    if text_options.detect_generated {
458        file_info.is_generated = Some(generated_flag.unwrap_or(false));
459    }
460
461    if file_info.percentage_of_license_text.is_none() && license_enabled {
462        file_info.percentage_of_license_text = Some(0.0);
463    }
464
465    file_info
466}
467
468fn extract_information_from_content(
469    file_info_builder: &mut FileInfoBuilder,
470    scan_errors: &mut Vec<String>,
471    path: &Path,
472    progress: &ScanProgress,
473    license_engine: Option<Arc<LicenseDetectionEngine>>,
474    license_options: LicenseScanOptions,
475    text_options: &TextDetectionOptions,
476) -> Result<(Option<bool>, Sha256Digest, bool), Error> {
477    let started = Instant::now();
478    let filesystem_path = absolute_filesystem_path(path);
479    let buffer = fs::read(&filesystem_path)?;
480    let license_enabled = license_engine.is_some();
481
482    if is_timeout_exceeded(started, text_options.timeout_seconds) {
483        return Err(Error::msg(format!(
484            "Timeout while reading file content (> {:.2}s)",
485            text_options.timeout_seconds
486        )));
487    }
488
489    let sha256 = calculate_sha256(&buffer);
490    let is_generated = text_options
491        .detect_generated
492        .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
493    let classification = classify_file_info(&filesystem_path, &buffer);
494
495    if text_options.collect_info {
496        file_info_builder
497            .sha1(Some(calculate_sha1(&buffer)))
498            .md5(Some(calculate_md5(&buffer)))
499            .sha256(Some(sha256))
500            .programming_language(classification.programming_language.clone())
501            .mime_type(Some(classification.mime_type.clone()))
502            .file_type_label(Some(classification.file_type.clone()))
503            .sha1_git(Some(calculate_sha1_git(&buffer)))
504            .is_binary(Some(classification.is_binary))
505            .is_text(Some(classification.is_text))
506            .is_archive(Some(classification.is_archive))
507            .is_media(Some(classification.is_media))
508            .is_source(Some(classification.is_source))
509            .is_script(Some(classification.is_script))
510            .files_count(Some(0))
511            .dirs_count(Some(0))
512            .size_count(Some(0));
513    }
514
515    if should_skip_text_detection(&filesystem_path, &buffer) {
516        return Ok((is_generated, sha256, classification.is_source));
517    }
518
519    // Package parsing and text-based detection (copyright, license) are independent.
520    // Python ScanCode runs all enabled plugins on every file, so we do the same.
521    if text_options.detect_packages {
522        let started = Instant::now();
523        let parse_result = try_parse_file(&filesystem_path)
524            .or_else(|| {
525                text_options
526                    .detect_application_packages
527                    .then(|| try_parse_windows_executable_bytes(&filesystem_path, &buffer))
528                    .flatten()
529            })
530            .or_else(|| {
531                text_options
532                    .detect_packages_in_compiled
533                    .then(|| {
534                        (classification.is_binary && is_supported_compiled_binary_format(&buffer))
535                            .then(|| try_parse_compiled_bytes(&buffer))
536                            .flatten()
537                    })
538                    .flatten()
539            });
540
541        if let Some(parse_result) = parse_result {
542            let packages = parse_result
543                .packages
544                .into_iter()
545                .filter(|package| {
546                    let is_compiled_package = package
547                        .datasource_id
548                        .as_ref()
549                        .is_some_and(is_compiled_datasource);
550                    let is_system_package = package
551                        .datasource_id
552                        .as_ref()
553                        .is_some_and(is_system_datasource);
554                    if is_compiled_package {
555                        text_options.detect_packages_in_compiled
556                    } else if is_system_package {
557                        text_options.detect_system_packages
558                    } else {
559                        text_options.detect_application_packages
560                    }
561                })
562                .collect();
563            file_info_builder.package_data(packages);
564            scan_errors.extend(parse_result.scan_errors);
565        }
566        progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
567    }
568
569    if is_timeout_exceeded(started, text_options.timeout_seconds) {
570        return Err(Error::msg(format!(
571            "Timeout while extracting package/text metadata (> {:.2}s)",
572            text_options.timeout_seconds
573        )));
574    }
575
576    let (text_content, text_kind, text_scan_error) =
577        extract_text_for_detection_with_diagnostics(&filesystem_path, &buffer);
578    if let Some(text_scan_error) = text_scan_error {
579        scan_errors.push(text_scan_error);
580    }
581    let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
582
583    if is_timeout_exceeded(started, text_options.timeout_seconds) {
584        return Err(Error::msg(format!(
585            "Timeout while extracting text content (> {:.2}s)",
586            text_options.timeout_seconds
587        )));
588    }
589
590    if text_content.is_empty() {
591        return Ok((is_generated, sha256, classification.is_source));
592    }
593
594    if text_options.detect_copyrights {
595        extract_copyright_information(
596            file_info_builder,
597            path,
598            &text_content,
599            text_options.timeout_seconds,
600            from_binary_strings,
601        );
602    }
603    extract_email_url_information(
604        file_info_builder,
605        &text_content,
606        text_options,
607        from_binary_strings,
608    );
609
610    if is_timeout_exceeded(started, text_options.timeout_seconds) {
611        return Err(Error::msg(format!(
612            "Timeout before license scan (> {:.2}s)",
613            text_options.timeout_seconds
614        )));
615    }
616    // Handle source map files specially
617    let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
618        if let Some(sourcemap_content) =
619            crate::utils::sourcemap::extract_sourcemap_content(&text_content)
620        {
621            sourcemap_content
622        } else {
623            text_content
624        }
625    } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
626        remove_verbatim_escape_sequences(&text_content)
627    } else {
628        text_content
629    };
630    let text_content_for_license_detection =
631        augment_license_detection_text(path, &text_content_for_license_detection);
632    let text_content_for_license_detection = cap_non_source_json_license_text(
633        path,
634        &classification,
635        text_content_for_license_detection.as_ref(),
636    )
637    .into_owned();
638
639    if license_enabled {
640        let started = Instant::now();
641        extract_license_information(
642            file_info_builder,
643            scan_errors,
644            &filesystem_path,
645            text_content_for_license_detection.clone(),
646            license_engine,
647            license_options,
648            from_binary_strings,
649        )?;
650        progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
651    } else {
652        extract_license_information(
653            file_info_builder,
654            scan_errors,
655            &filesystem_path,
656            text_content_for_license_detection,
657            license_engine,
658            license_options,
659            from_binary_strings,
660        )?;
661    }
662
663    if is_timeout_exceeded(started, text_options.timeout_seconds) {
664        return Err(Error::msg(format!(
665            "Timeout during license scan (> {:.2}s)",
666            text_options.timeout_seconds
667        )));
668    }
669
670    Ok((is_generated, sha256, classification.is_source))
671}
672
/// Resolves `path` against the current working directory when it is relative.
///
/// Absolute paths are returned unchanged. If the working directory cannot be
/// determined, the relative path is returned as-is.
fn absolute_filesystem_path(path: &Path) -> std::path::PathBuf {
    if !path.is_absolute() {
        if let Ok(working_dir) = std::env::current_dir() {
            return working_dir.join(path);
        }
    }

    path.to_path_buf()
}
682
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// Non-finite, zero, or negative timeouts disable the check and always yield
/// `false`.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    let timeout_enabled = timeout_seconds.is_finite() && timeout_seconds > 0.0;
    if !timeout_enabled {
        return false;
    }

    let elapsed_seconds = started.elapsed().as_secs_f64();
    elapsed_seconds > timeout_seconds
}
688
689fn maybe_record_processing_timeout(
690    scan_errors: &mut Vec<String>,
691    started: Instant,
692    timeout_seconds: f64,
693) {
694    if is_timeout_exceeded(started, timeout_seconds)
695        && !scan_errors.iter().any(|error| is_timeout_scan_error(error))
696    {
697        scan_errors.push(format!(
698            "Processing interrupted due to timeout after {:.2} seconds",
699            timeout_seconds
700        ));
701    }
702}
703
/// Returns `true` when `error` contains one of the timeout message fragments
/// produced by the scanner (case-sensitive substring match).
fn is_timeout_scan_error(error: &str) -> bool {
    const TIMEOUT_MARKERS: [&str; 4] = [
        "Timeout while ",
        "Timeout before ",
        "Timeout during ",
        "Processing interrupted due to timeout",
    ];

    TIMEOUT_MARKERS
        .iter()
        .any(|&marker| error.contains(marker))
}
710
711fn cap_non_source_json_license_text<'a>(
712    path: &Path,
713    classification: &crate::utils::file::FileInfoClassification,
714    text: &'a str,
715) -> std::borrow::Cow<'a, str> {
716    if classification.is_source
717        || crate::utils::sourcemap::is_sourcemap(path)
718        || !is_json_like_text(classification, path)
719        || text.len() <= LARGE_NON_SOURCE_JSON_LICENSE_TEXT_BYTES
720    {
721        return std::borrow::Cow::Borrowed(text);
722    }
723
724    std::borrow::Cow::Owned(
725        truncate_at_char_boundary(text, LARGE_NON_SOURCE_JSON_LICENSE_TEXT_BYTES).to_string(),
726    )
727}
728
729fn is_json_like_text(
730    classification: &crate::utils::file::FileInfoClassification,
731    path: &Path,
732) -> bool {
733    classification.file_type == "JSON text data"
734        || classification.mime_type == "application/json"
735        || classification.mime_type.ends_with("+json")
736        || path
737            .extension()
738            .and_then(|ext| ext.to_str())
739            .is_some_and(|ext| ext.eq_ignore_ascii_case("json"))
740}
741
/// Returns the longest prefix of `text` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary.
fn truncate_at_char_boundary(text: &str, max_bytes: usize) -> &str {
    if max_bytes >= text.len() {
        return text;
    }

    // Offset 0 is always a boundary, so the backwards search always succeeds.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&offset| text.is_char_boundary(offset))
        .unwrap_or(0);
    &text[..cut]
}
753
/// Returns `true` when `datasource_id` is one of the installed-package
/// database datasources listed below (Alpine, Debian, FreeBSD, RPM).
///
/// `extract_information_from_content` uses this to gate parsed packages on
/// `detect_system_packages`.
fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
    matches!(
        datasource_id,
        DatasourceId::AlpineInstalledDb
            | DatasourceId::DebianDistrolessInstalledDb
            | DatasourceId::DebianInstalledFilesList
            | DatasourceId::DebianInstalledMd5Sums
            | DatasourceId::DebianInstalledStatusDb
            | DatasourceId::FreebsdCompactManifest
            | DatasourceId::RpmInstalledDatabaseBdb
            | DatasourceId::RpmInstalledDatabaseNdb
            | DatasourceId::RpmInstalledDatabaseSqlite
            | DatasourceId::RpmYumdb
    )
}
769
/// Returns `true` when `datasource_id` identifies package data taken from a
/// compiled Go or Rust binary.
///
/// `extract_information_from_content` uses this to gate parsed packages on
/// `detect_packages_in_compiled`.
fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
    matches!(
        datasource_id,
        DatasourceId::GoBinary | DatasourceId::RustBinary
    )
}
776
777fn extract_copyright_information(
778    file_info_builder: &mut FileInfoBuilder,
779    path: &Path,
780    text_content: &str,
781    timeout_seconds: f64,
782    from_binary_strings: bool,
783) {
784    // CREDITS files get special handling (Linux kernel style).
785    if copyright::is_credits_file(path) {
786        let author_detections = copyright::detect_credits_authors(text_content);
787        if !author_detections.is_empty() {
788            file_info_builder.authors(
789                author_detections
790                    .into_iter()
791                    .map(|a| Author {
792                        author: a.author,
793                        start_line: a.start_line,
794                        end_line: a.end_line,
795                    })
796                    .collect(),
797            );
798            return;
799        }
800    }
801
802    let copyright_options = CopyrightDetectionOptions {
803        max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
804            Some(Duration::from_secs_f64(timeout_seconds))
805        } else {
806            None
807        },
808        ..CopyrightDetectionOptions::default()
809    };
810
811    let (copyrights, holders, authors) =
812        copyright::detect_copyrights_with_options(text_content, &copyright_options);
813    let (copyrights, holders, authors) = if from_binary_strings {
814        prune_binary_string_detections(text_content, copyrights, holders, authors)
815    } else {
816        (copyrights, holders, authors)
817    };
818
819    file_info_builder.copyrights(
820        copyrights
821            .into_iter()
822            .map(|c| Copyright {
823                copyright: c.copyright,
824                start_line: c.start_line,
825                end_line: c.end_line,
826            })
827            .collect::<Vec<Copyright>>(),
828    );
829    file_info_builder.holders(
830        holders
831            .into_iter()
832            .map(|h| Holder {
833                holder: h.holder,
834                start_line: h.start_line,
835                end_line: h.end_line,
836            })
837            .collect::<Vec<Holder>>(),
838    );
839    file_info_builder.authors(
840        authors
841            .into_iter()
842            .map(|a| Author {
843                author: a.author,
844                start_line: a.start_line,
845                end_line: a.end_line,
846            })
847            .collect::<Vec<Author>>(),
848    );
849}
850
851fn prune_binary_string_detections(
852    text_content: &str,
853    copyrights: Vec<CopyrightDetection>,
854    holders: Vec<HolderDetection>,
855    authors: Vec<AuthorDetection>,
856) -> (
857    Vec<CopyrightDetection>,
858    Vec<HolderDetection>,
859    Vec<AuthorDetection>,
860) {
861    let kept_copyrights: Vec<CopyrightDetection> = copyrights
862        .into_iter()
863        .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
864        .collect();
865
866    let kept_holders: Vec<HolderDetection> = holders
867        .into_iter()
868        .filter(|holder| {
869            kept_copyrights.iter().any(|copyright| {
870                ranges_overlap(
871                    holder.start_line,
872                    holder.end_line,
873                    copyright.start_line,
874                    copyright.end_line,
875                )
876            })
877        })
878        .collect();
879
880    let kept_authors = authors
881        .into_iter()
882        .filter(|author| is_binary_string_author_candidate(&author.author))
883        .chain(extract_binary_string_author_supplements(text_content))
884        .filter({
885            let mut seen = HashSet::new();
886            move |author| seen.insert(author.author.clone())
887        })
888        .collect();
889
890    (kept_copyrights, kept_holders, kept_authors)
891}
892
893fn ranges_overlap(
894    a_start: LineNumber,
895    a_end: LineNumber,
896    b_start: LineNumber,
897    b_end: LineNumber,
898) -> bool {
899    a_start <= b_end && b_start <= a_end
900}
901
902fn is_binary_string_copyright_candidate(text: &str) -> bool {
903    if contains_year(text) {
904        return true;
905    }
906
907    let trimmed = text.trim();
908    let lower = trimmed.to_ascii_lowercase();
909    let tail = if let Some(tail) = lower.strip_prefix("copyright") {
910        tail.trim()
911    } else {
912        lower.trim()
913    };
914    let original_tail = if lower.starts_with("copyright") {
915        trimmed["copyright".len()..].trim()
916    } else {
917        trimmed
918    };
919
920    if tail.is_empty() || !has_sufficient_alphabetic_content(tail) || has_excessive_at_noise(tail) {
921        return false;
922    }
923
924    let alpha_tokens: Vec<&str> = tail
925        .split_whitespace()
926        .filter(|token| token.chars().any(|c| c.is_alphabetic()))
927        .collect();
928
929    if alpha_tokens.len() <= 1 {
930        return has_explicit_copyright_marker(text)
931            && alpha_tokens.iter().any(|token| {
932                is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
933            });
934    }
935
936    if !has_explicit_copyright_marker(text) {
937        return false;
938    }
939
940    has_binary_name_like_shape(original_tail)
941}
942
943fn extract_binary_string_author_supplements(text_content: &str) -> Vec<AuthorDetection> {
944    let mut authors = Vec::new();
945
946    for (line_index, line) in text_content.lines().enumerate() {
947        if let Some(author) = extract_named_author_from_binary_line(line) {
948            authors.push(AuthorDetection {
949                author,
950                start_line: LineNumber::from_0_indexed(line_index),
951                end_line: LineNumber::from_0_indexed(line_index),
952            });
953        }
954    }
955
956    authors
957}
958
959fn extract_named_author_from_binary_line(line: &str) -> Option<String> {
960    let line = line.trim();
961    if line.is_empty() {
962        return None;
963    }
964
965    let emails = finder::find_emails(
966        line,
967        &DetectionConfig {
968            max_emails: 4,
969            max_urls: 0,
970            unique: false,
971        },
972    );
973    let email = emails.first()?.email.as_str();
974    if !is_binary_string_email_candidate(email) {
975        return None;
976    }
977
978    let lower_line = line.to_ascii_lowercase();
979    let email_start = lower_line.find(email)?;
980    let raw_prefix = &line[..email_start];
981    let has_author_marker = contains_binary_author_marker(raw_prefix);
982    let prefix = take_suffix_after_last_author_marker(raw_prefix)?;
983    let prefix = prefix
984        .trim_start_matches(['*', '-', ':', ';', ',', '.', ' '])
985        .trim_end_matches(['<', '(', '[', ' ', ':', '-'])
986        .trim();
987
988    let (name, _) = split_name_email(prefix);
989    let name = name.or_else(|| {
990        let trimmed = prefix.trim_matches(|c: char| c == '<' || c == '(' || c == '[' || c == ' ');
991        (!trimmed.is_empty()).then(|| trimmed.to_string())
992    });
993
994    let Some(name) = name.map(|name| name.trim().to_string()) else {
995        if has_author_marker {
996            return Some(email.to_string());
997        }
998        return None;
999    };
1000
1001    if name.is_empty() && has_author_marker {
1002        return Some(email.to_string());
1003    }
1004
1005    if !has_binary_name_like_shape(&name) {
1006        return None;
1007    }
1008
1009    if line.contains(&format!("<{email}>")) {
1010        Some(format!("{name} <{email}>"))
1011    } else if line.contains(&format!("({email})")) {
1012        Some(format!("{name} ({email})"))
1013    } else {
1014        Some(format!("{name} {email}"))
1015    }
1016}
1017
/// Returns the trimmed text following the last ASCII-case-insensitive occurrence of
/// `marker` in `text`, or `None` when the marker does not occur.
///
/// `marker` is matched against the ASCII-lowercased text, so it must itself be
/// lowercase to match anything containing letters.
fn take_suffix_after_last_ascii_marker<'a>(text: &'a str, marker: &str) -> Option<&'a str> {
    // ASCII lowercasing never changes byte lengths, so the offset found in the
    // lowercased copy is valid for slicing the original text.
    text.to_ascii_lowercase()
        .rfind(marker)
        .map(|marker_start| text[marker_start + marker.len()..].trim())
}
1023
1024fn take_suffix_after_last_author_marker(text: &str) -> Option<&str> {
1025    const MARKERS: &[&str] = &[
1026        " patch author: ",
1027        " patch author ",
1028        " written by ",
1029        " contributed by ",
1030        " original work done by ",
1031        " work done by ",
1032        " thanks to ",
1033        " review by ",
1034        " by ",
1035        " from ",
1036    ];
1037
1038    MARKERS
1039        .iter()
1040        .filter_map(|marker| take_suffix_after_last_ascii_marker(text, marker))
1041        .next()
1042}
1043
/// Returns `true` when `take_suffix_after_last_author_marker` finds one of its
/// author markers in `text`.
fn contains_binary_author_marker(text: &str) -> bool {
    take_suffix_after_last_author_marker(text).is_some()
}
1047
1048fn has_binary_name_like_shape(text: &str) -> bool {
1049    let trimmed = text.trim();
1050    if trimmed.is_empty() || trimmed.contains(" - ") || trimmed.chars().any(|c| c.is_ascii_digit())
1051    {
1052        return false;
1053    }
1054
1055    let tokens: Vec<&str> = trimmed
1056        .split(|c: char| !c.is_ascii_alphabetic() && c != '.' && c != '\'')
1057        .filter(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()))
1058        .collect();
1059    if tokens.is_empty() {
1060        return false;
1061    }
1062
1063    let uppercase_like = tokens
1064        .iter()
1065        .filter(|token| {
1066            let token = token.trim_matches('.');
1067            token
1068                .chars()
1069                .find(|c| c.is_ascii_alphabetic())
1070                .is_some_and(|c| c.is_ascii_uppercase())
1071        })
1072        .count();
1073
1074    uppercase_like >= 2 && uppercase_like * 2 >= tokens.len()
1075        || tokens
1076            .iter()
1077            .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
1078}
1079
/// Returns `true` when `text` has at least one ASCII alphanumeric character and at
/// least half of its ASCII alphanumerics are letters.
///
/// Non-ASCII characters are ignored by both counts.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    let (letters, digits) = text
        .chars()
        .fold((0usize, 0usize), |(letters, digits), c| {
            if c.is_ascii_alphabetic() {
                (letters + 1, digits)
            } else if c.is_ascii_digit() {
                (letters, digits + 1)
            } else {
                (letters, digits)
            }
        });

    let alphanumerics = letters + digits;
    alphanumerics != 0 && letters * 2 >= alphanumerics
}
1089
/// Returns `true` when `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    // `@` is ASCII, so counting bytes is equivalent to counting characters here.
    let at_signs = text.bytes().filter(|&byte| byte == b'@').count();
    at_signs >= 3
}
1093
/// Returns `true` when `text` contains `(c)`, `©` or `copr`, ignoring ASCII case.
///
/// The bare word "copyright" does not contain any of these and so does not count.
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: [&str; 3] = ["(c)", "©", "copr"];

    let lowered = text.to_ascii_lowercase();
    MARKERS.iter().any(|marker| lowered.contains(*marker))
}
1098
/// Returns `true` when `text` contains four consecutive ASCII digits that read as a
/// year in the range 1900-2099 (`19xx` or `20xx`).
///
/// The digits may be part of a longer digit run; no word-boundary check is made.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        // The century digits are matched as a pair: checking them independently
        // would also accept `10xx` and `29xx`, which are not plausible years.
        matches!(
            window,
            [b'1', b'9', tens, units] | [b'2', b'0', tens, units]
                if tens.is_ascii_digit() && units.is_ascii_digit()
        )
    })
}
1107
/// Returns `true` when `token` equals, ignoring ASCII case, one of the recognised
/// organisation words such as `inc`, `ltd` or `foundation`.
fn is_company_like_suffix(token: &str) -> bool {
    // All entries are lowercase ASCII, so a case-insensitive comparison is the same
    // as lowercasing the token first.
    const COMPANY_SUFFIXES: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_SUFFIXES
        .iter()
        .any(|suffix| token.eq_ignore_ascii_case(suffix))
}
1127
1128fn extract_email_url_information(
1129    file_info_builder: &mut FileInfoBuilder,
1130    text_content: &str,
1131    text_options: &TextDetectionOptions,
1132    from_binary_strings: bool,
1133) {
1134    if !text_options.detect_emails && !text_options.detect_urls {
1135        return;
1136    }
1137
1138    if text_options.detect_emails {
1139        let config = DetectionConfig {
1140            max_emails: text_options.max_emails,
1141            max_urls: text_options.max_urls,
1142            unique: from_binary_strings,
1143        };
1144        let emails = finder::find_emails(text_content, &config)
1145            .into_iter()
1146            .filter(|d| !from_binary_strings || is_binary_string_email_candidate(&d.email))
1147            .map(|d| OutputEmail {
1148                email: d.email,
1149                start_line: d.start_line,
1150                end_line: d.end_line,
1151            })
1152            .collect::<Vec<_>>();
1153        file_info_builder.emails(emails);
1154    }
1155
1156    if text_options.detect_urls {
1157        let config = DetectionConfig {
1158            max_emails: text_options.max_emails,
1159            max_urls: if from_binary_strings {
1160                0
1161            } else {
1162                text_options.max_urls
1163            },
1164            unique: !from_binary_strings,
1165        };
1166        let mut urls = finder::find_urls(text_content, &config)
1167            .into_iter()
1168            .filter_map(|d| {
1169                let url = if from_binary_strings {
1170                    normalize_binary_string_url(&d.url)?
1171                } else {
1172                    d.url
1173                };
1174                Some(OutputURL {
1175                    url,
1176                    start_line: d.start_line,
1177                    end_line: d.end_line,
1178                })
1179            })
1180            .collect::<Vec<_>>();
1181        if from_binary_strings {
1182            let mut seen = HashSet::new();
1183            urls.retain(|url| seen.insert(url.url.clone()));
1184            if text_options.max_urls > 0 && urls.len() > text_options.max_urls {
1185                urls.truncate(text_options.max_urls);
1186            }
1187        }
1188        file_info_builder.urls(urls);
1189    }
1190}
1191
1192fn is_binary_string_email_candidate(email: &str) -> bool {
1193    let Some((local, domain)) = email.rsplit_once('@') else {
1194        return false;
1195    };
1196
1197    if !has_strong_binary_local_part(local) {
1198        return false;
1199    }
1200
1201    has_strong_binary_host_shape(domain)
1202}
1203
1204fn is_binary_string_url_candidate(url: &str) -> bool {
1205    let parsed = url::Url::parse(url).ok();
1206    let Some(parsed) = parsed else {
1207        return false;
1208    };
1209    let Some(host) = parsed.host_str() else {
1210        return false;
1211    };
1212
1213    has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
1214}
1215
1216fn normalize_binary_string_url(url: &str) -> Option<String> {
1217    let mut parsed = url::Url::parse(url).ok()?;
1218
1219    if let Some(host) = parsed.host_str() {
1220        let normalized_host = normalize_binary_url_host(host);
1221        if normalized_host != host {
1222            parsed.set_host(Some(&normalized_host)).ok()?;
1223        }
1224    }
1225
1226    let normalized_path = normalize_binary_url_path(parsed.path());
1227    if normalized_path != parsed.path() {
1228        parsed.set_path(&normalized_path);
1229    }
1230
1231    let normalized = parsed.to_string();
1232    is_binary_string_url_candidate(&normalized).then_some(normalized)
1233}
1234
1235fn normalize_binary_url_host(host: &str) -> String {
1236    let mut labels = host.split('.').map(ToOwned::to_owned).collect::<Vec<_>>();
1237    if let Some(last_label) = labels.last_mut() {
1238        *last_label = trim_binary_tld_tail(last_label);
1239    }
1240    labels.join(".")
1241}
1242
/// Strips a short junk tail from a top-level-domain label.
///
/// When `label` starts (ASCII-case-insensitively) with one of the known TLDs and the
/// remainder is one to three bytes long, begins with an ASCII digit and consists only
/// of ASCII alphanumerics, `!` or `$`, the bare lowercase TLD is returned. Otherwise
/// the label is returned unchanged. TLDs are tried in the listed order.
fn trim_binary_tld_tail(label: &str) -> String {
    const KNOWN_TLDS: &[&str] = &["com", "org", "net", "edu", "gov", "mil", "io", "dev"];

    fn is_junk_tail(tail: &str) -> bool {
        tail.len() <= 3
            && tail.starts_with(|ch: char| ch.is_ascii_digit())
            && tail
                .chars()
                .all(|ch| ch.is_ascii_alphanumeric() || ch == '!' || ch == '$')
    }

    KNOWN_TLDS
        .iter()
        .copied()
        .find(|&tld| {
            // The boundary check keeps `split_at` from panicking inside a multi-byte
            // character.
            if label.len() <= tld.len() || !label.is_char_boundary(tld.len()) {
                return false;
            }
            let (head, tail) = label.split_at(tld.len());
            head.eq_ignore_ascii_case(tld) && is_junk_tail(tail)
        })
        .unwrap_or(label)
        .to_string()
}
1262
/// Drops one trailing `_`, `!` or `$` from `path` when the character right before it
/// is an ASCII digit; otherwise returns `path` unchanged.
fn normalize_binary_url_path(path: &str) -> String {
    path.strip_suffix(['_', '!', '$'])
        .filter(|rest| rest.ends_with(|c: char| c.is_ascii_digit()))
        .unwrap_or(path)
        .to_string()
}
1277
1278fn is_binary_string_author_candidate(author: &str) -> bool {
1279    let trimmed = author.trim();
1280    if trimmed.is_empty()
1281        || !has_sufficient_alphabetic_content(trimmed)
1282        || has_excessive_at_noise(trimmed)
1283    {
1284        return false;
1285    }
1286
1287    if trimmed.contains('@') {
1288        let emails = finder::find_emails(
1289            trimmed,
1290            &DetectionConfig {
1291                max_emails: 4,
1292                max_urls: 0,
1293                unique: true,
1294            },
1295        );
1296        if emails.len() > 1 {
1297            return false;
1298        }
1299
1300        if let Some(extracted) = extract_named_author_from_binary_line(trimmed) {
1301            return !extracted.is_empty();
1302        }
1303
1304        let Some(email) = emails.first().map(|d| d.email.as_str()) else {
1305            return false;
1306        };
1307        if !is_binary_string_email_candidate(email) {
1308            return false;
1309        }
1310
1311        let (name, _) = split_name_email(trimmed);
1312        return name.as_deref().is_some_and(has_binary_name_like_shape);
1313    }
1314
1315    has_binary_name_like_shape(trimmed)
1316}
1317
1318fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
1319    if parsed.path() != "/"
1320        && parsed
1321            .path()
1322            .split('/')
1323            .any(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 2)
1324    {
1325        return true;
1326    }
1327
1328    if parsed.query().is_some() || parsed.fragment().is_some() {
1329        return true;
1330    }
1331
1332    let Some(host) = parsed.host_str() else {
1333        return false;
1334    };
1335
1336    let labels: Vec<&str> = host.split('.').collect();
1337    if labels.len() > 2 {
1338        return labels[..labels.len() - 1].iter().any(|label| {
1339            label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
1340        });
1341    }
1342
1343    if matches!(labels.first(), Some(&"www")) {
1344        return true;
1345    }
1346
1347    if labels.len() == 2 {
1348        let domain = labels[0];
1349        let tld = labels[1];
1350        if domain.len() >= 8 && matches!(tld, "org" | "edu" | "gov" | "mil" | "io" | "dev") {
1351            return true;
1352        }
1353    }
1354
1355    labels
1356        .iter()
1357        .take(labels.len().saturating_sub(1))
1358        .any(|label| {
1359            label.contains('-') && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 4
1360        })
1361}
1362
/// Returns `true` when `local` contains a run of at least three consecutive ASCII
/// letters.
fn has_strong_binary_local_part(local: &str) -> bool {
    // Bytes of multi-byte characters are never ASCII letters, so scanning byte
    // windows finds exactly the runs of ASCII letters.
    local
        .as_bytes()
        .windows(3)
        .any(|window| window.iter().all(u8::is_ascii_alphabetic))
}
1368
/// Heuristic check that `host` is shaped like a real host name.
///
/// A leading `www` or `ftp` label is ignored. At least two labels must remain, and
/// one of the non-final labels must be at least three bytes long and contain at
/// least three ASCII letters.
fn has_strong_binary_host_shape(host: &str) -> bool {
    let labels: Vec<&str> = host.split('.').collect();

    let relevant = match labels.as_slice() {
        ["www" | "ftp", rest @ ..] => rest,
        all => all,
    };

    // Fewer than two relevant labels leaves `leading` empty (or `relevant` itself
    // empty), and both cases end in `false`.
    let Some((_final_label, leading)) = relevant.split_last() else {
        return false;
    };

    leading.iter().any(|label| {
        label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
    })
}
1389
1390fn extract_license_information(
1391    file_info_builder: &mut FileInfoBuilder,
1392    scan_errors: &mut Vec<String>,
1393    path: &Path,
1394    text_content: String,
1395    license_engine: Option<Arc<LicenseDetectionEngine>>,
1396    license_options: LicenseScanOptions,
1397    from_binary_strings: bool,
1398) -> Result<(), Error> {
1399    let Some(engine) = license_engine else {
1400        return Ok(());
1401    };
1402
1403    let detection_result = if license_options.min_score == 0 {
1404        engine.detect_with_kind_and_source(
1405            &text_content,
1406            license_options.unknown_licenses,
1407            from_binary_strings,
1408            &path.to_string_lossy(),
1409        )
1410    } else {
1411        engine.detect_with_kind_and_source_with_score(
1412            &text_content,
1413            license_options.unknown_licenses,
1414            from_binary_strings,
1415            &path.to_string_lossy(),
1416            f32::from(license_options.min_score),
1417        )
1418    };
1419
1420    match detection_result {
1421        Ok(detections) => {
1422            let query =
1423                Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
1424            let mut model_detections = Vec::new();
1425            let mut model_clues = Vec::new();
1426
1427            for detection in &detections {
1428                let (public_detection, clue_matches) = convert_detection_to_model(
1429                    detection,
1430                    license_options,
1431                    &text_content,
1432                    query.as_ref(),
1433                );
1434
1435                if let Some(public_detection) = public_detection {
1436                    model_detections.push(public_detection);
1437                }
1438
1439                model_clues.extend(clue_matches);
1440            }
1441
1442            if !model_detections.is_empty() {
1443                let expressions: Vec<String> = model_detections
1444                    .iter()
1445                    .filter(|d| !d.license_expression_spdx.is_empty())
1446                    .map(|d| d.license_expression_spdx.clone())
1447                    .collect();
1448
1449                if !expressions.is_empty() {
1450                    let combined = crate::utils::spdx::combine_license_expressions(expressions);
1451                    if let Some(expr) = combined {
1452                        file_info_builder.license_expression(Some(expr));
1453                    }
1454                }
1455            }
1456
1457            file_info_builder.license_detections(model_detections);
1458            file_info_builder.license_clues(model_clues);
1459            file_info_builder.percentage_of_license_text(
1460                query
1461                    .as_ref()
1462                    .map(|query| compute_percentage_of_license_text(query, &detections)),
1463            );
1464        }
1465        Err(e) => {
1466            scan_errors.push(format!("License detection failed: {}", e));
1467        }
1468    }
1469
1470    Ok(())
1471}
1472
1473fn convert_detection_to_model(
1474    detection: &crate::license_detection::LicenseDetection,
1475    license_options: LicenseScanOptions,
1476    text_content: &str,
1477    query: Option<&Query<'_>>,
1478) -> (Option<LicenseDetection>, Vec<Match>) {
1479    let matches: Vec<Match> = detection
1480        .matches
1481        .iter()
1482        .map(|m| convert_match_to_model(m, license_options, text_content, query))
1483        .collect();
1484
1485    if let Some(license_expression) = detection.license_expression.clone() {
1486        (
1487            Some(LicenseDetection {
1488                license_expression,
1489                license_expression_spdx: detection
1490                    .license_expression_spdx
1491                    .clone()
1492                    .unwrap_or_default(),
1493                matches,
1494                detection_log: if license_options.include_diagnostics {
1495                    detection.detection_log.clone()
1496                } else {
1497                    Vec::new()
1498                },
1499                identifier: detection.identifier.clone(),
1500            }),
1501            Vec::new(),
1502        )
1503    } else {
1504        (None, matches)
1505    }
1506}
1507
1508fn convert_match_to_model(
1509    m: &crate::license_detection::models::LicenseMatch,
1510    license_options: LicenseScanOptions,
1511    text_content: &str,
1512    query: Option<&Query<'_>>,
1513) -> Match {
1514    let rule_url = if m.rule_url.is_empty() {
1515        None
1516    } else {
1517        Some(m.rule_url.clone())
1518    };
1519    let matched_text = if license_options.include_text {
1520        m.matched_text.clone().or_else(|| {
1521            Some(crate::license_detection::query::matched_text_from_text(
1522                text_content,
1523                m.start_line.get(),
1524                m.end_line.get(),
1525            ))
1526        })
1527    } else {
1528        None
1529    };
1530    let matched_text_diagnostics = if license_options.include_text_diagnostics {
1531        query.map(|query| matched_text_diagnostics_from_match(query, m))
1532    } else {
1533        None
1534    };
1535    Match {
1536        license_expression: m.license_expression.clone(),
1537        license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
1538        from_file: m.from_file.clone(),
1539        start_line: m.start_line,
1540        end_line: m.end_line,
1541        matcher: Some(m.matcher.to_string()),
1542        score: m.score,
1543        matched_length: Some(m.matched_length),
1544        match_coverage: Some((f64::from(m.coverage()) * 100.0).round() / 100.0),
1545        rule_relevance: Some(m.rule_relevance),
1546        rule_identifier: Some(m.rule_identifier.clone()),
1547        rule_url,
1548        matched_text,
1549        referenced_filenames: m.referenced_filenames.clone(),
1550        matched_text_diagnostics,
1551    }
1552}
1553
1554fn compute_percentage_of_license_text(
1555    query: &Query<'_>,
1556    detections: &[crate::license_detection::LicenseDetection],
1557) -> f64 {
1558    let matched_positions: std::collections::HashSet<usize> = detections
1559        .iter()
1560        .flat_map(|detection| detection.matches.iter())
1561        .flat_map(|m| m.query_span().iter())
1562        .collect();
1563
1564    let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
1565    if query_tokens_length == 0 {
1566        return 0.0;
1567    }
1568
1569    let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
1570    (percentage * 100.0).round() / 100.0
1571}
1572
1573fn matched_text_diagnostics_from_match(
1574    query: &Query<'_>,
1575    license_match: &InternalLicenseMatch,
1576) -> String {
1577    let matched_positions: PositionSet = license_match.query_span().iter().collect();
1578    let Some(start_pos) = matched_positions.iter().min() else {
1579        return crate::license_detection::query::matched_text_from_text(
1580            &query.text,
1581            license_match.start_line.get(),
1582            license_match.end_line.get(),
1583        );
1584    };
1585    let Some(end_pos) = matched_positions.iter().max() else {
1586        return crate::license_detection::query::matched_text_from_text(
1587            &query.text,
1588            license_match.start_line.get(),
1589            license_match.end_line.get(),
1590        );
1591    };
1592
1593    crate::license_detection::query::matched_text_diagnostics_from_text(
1594        &query.text,
1595        query,
1596        &matched_positions,
1597        start_pos,
1598        end_pos,
1599        license_match.start_line.get(),
1600        license_match.end_line.get(),
1601    )
1602}
1603
/// Returns whatever `is_pem_certificate_file` reports for `path` and `buffer`.
///
/// Judging by the name, callers use this to skip text-based detection; PEM
/// certificates are currently the only case covered.
fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
    is_pem_certificate_file(path, buffer)
}
1607
/// Reports whether `path` is a Go source file that is test-only.
///
/// A file qualifies when its extension is `go` and either its name ends in
/// `_test.go`, or one of its first ten lines is a `//go:build` / `// +build`
/// constraint that has `test` as a whitespace-separated token.
///
/// Only the leading lines are read, so the cost does not depend on the file size and
/// invalid UTF-8 further down the file does not cause an error.
///
/// # Errors
///
/// Returns an I/O error when the file cannot be opened or when one of the inspected
/// lines cannot be read as UTF-8.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    if path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"))
    {
        return Ok(true);
    }

    // Stream the file instead of loading it whole: only ten lines are inspected.
    let reader = std::io::BufReader::new(File::open(path)?);
    for line in std::io::BufRead::lines(reader).take(10) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1628
1629fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1630    let prefix_len = buffer.len().min(8192);
1631    let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1632    let trimmed_lines: Vec<&str> = prefix
1633        .lines()
1634        .map(str::trim)
1635        .filter(|line| !line.is_empty())
1636        .take(64)
1637        .collect();
1638
1639    let Some(first_line) = trimmed_lines.first().copied() else {
1640        return false;
1641    };
1642
1643    PEM_CERTIFICATE_HEADERS
1644        .iter()
1645        .any(|(begin, end)| first_line == *begin && trimmed_lines.iter().any(|line| line == end))
1646}
1647
1648fn process_directory(
1649    path: &Path,
1650    _metadata: &fs::Metadata,
1651    collect_info: bool,
1652    license_enabled: bool,
1653) -> FileInfo {
1654    let name = path
1655        .file_name()
1656        .unwrap_or_default()
1657        .to_string_lossy()
1658        .to_string();
1659    let base_name = name.clone(); // For directories, base_name is the same as name
1660
1661    FileInfo {
1662        name,
1663        base_name,
1664        extension: "".to_string(),
1665        path: path.to_string_lossy().to_string(),
1666        file_type: FileType::Directory,
1667        mime_type: None,
1668        file_type_label: None,
1669        size: 0,
1670        date: None,
1671        sha1: None,
1672        md5: None,
1673        sha256: None,
1674        sha1_git: None,
1675        programming_language: None,
1676        package_data: Vec::new(),
1677        license_expression: None,
1678        license_detections: Vec::new(),
1679        license_clues: Vec::new(),
1680        percentage_of_license_text: license_enabled.then_some(0.0),
1681        copyrights: Vec::new(),
1682        holders: Vec::new(),
1683        authors: Vec::new(),
1684        emails: Vec::new(),
1685        urls: Vec::new(),
1686        for_packages: Vec::new(),
1687        scan_errors: Vec::new(),
1688        license_policy: None,
1689        is_binary: collect_info.then_some(false),
1690        is_text: collect_info.then_some(false),
1691        is_archive: collect_info.then_some(false),
1692        is_media: collect_info.then_some(false),
1693        is_source: collect_info.then_some(false),
1694        is_script: collect_info.then_some(false),
1695        files_count: collect_info.then_some(0),
1696        dirs_count: collect_info.then_some(0),
1697        size_count: collect_info.then_some(0),
1698        source_count: None,
1699        is_legal: false,
1700        is_manifest: false,
1701        is_readme: false,
1702        is_top_level: false,
1703        is_key_file: false,
1704        is_community: false,
1705        is_generated: None,
1706        facets: vec![],
1707        tallies: None,
1708    }
1709}
1710
1711#[cfg(test)]
1712mod tests {
1713    use super::{
1714        LARGE_NON_SOURCE_JSON_LICENSE_TEXT_BYTES, cap_non_source_json_license_text,
1715        compute_percentage_of_license_text, convert_detection_to_model,
1716        extract_email_url_information, extract_named_author_from_binary_line,
1717        is_binary_string_author_candidate, is_binary_string_copyright_candidate,
1718        is_binary_string_email_candidate, is_binary_string_url_candidate,
1719        is_go_non_production_source, normalize_binary_string_url, process_file,
1720    };
1721    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
1722    use crate::license_detection::index::LicenseIndex;
1723    use crate::license_detection::index::dictionary::TokenDictionary;
1724    use crate::license_detection::models::position_span::PositionSpan;
1725    use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
1726    use crate::license_detection::query::Query;
1727    use crate::models::{FileInfoBuilder, FileType, MatchScore};
1728    use crate::progress::{ProgressMode, ScanProgress};
1729    use crate::scanner::scan_options_fingerprint;
1730    use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
1731    use crate::utils::file::FileInfoClassification;
1732    use std::fs;
1733    use std::path::Path;
1734    use std::time::{Duration, Instant};
1735    use tempfile::tempdir;
1736
1737    use super::maybe_record_processing_timeout;
1738
1739    use crate::models::LineNumber;
1740
    /// Builds a `LicenseMatch` fixture for an MIT hash match on line one.
    ///
    /// Only `rule_url` varies; every other field is fixed so tests can override the
    /// few they care about.
    fn make_internal_match(rule_url: &str) -> LicenseMatch {
        LicenseMatch {
            rid: 0,
            license_expression: "mit".to_string(),
            license_expression_spdx: Some("MIT".to_string()),
            from_file: None,
            start_line: LineNumber::ONE,
            end_line: LineNumber::ONE,
            start_token: 0,
            end_token: 1,
            matcher: MatcherKind::Hash,
            score: MatchScore::from_percentage(1.0),
            matched_length: 3,
            rule_length: 3,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: "mit.LICENSE".to_string(),
            rule_url: rule_url.to_string(),
            // Pre-populated, so conversion with `include_text` uses this value.
            matched_text: Some("MIT".to_string()),
            referenced_filenames: None,
            rule_kind: RuleKind::Text,
            is_from_license: true,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }
    }
1769
    /// Builds an internal MIT detection fixture holding a single match created by
    /// `make_internal_match(rule_url)`.
    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
        InternalLicenseDetection {
            license_expression: Some("mit".to_string()),
            license_expression_spdx: Some("MIT".to_string()),
            matches: vec![make_internal_match(rule_url)],
            detection_log: vec![],
            identifier: Some("mit-test".to_string()),
            file_regions: Vec::new(),
        }
    }
1780
    /// Builds a `LicenseIndex` over a dictionary created from `entries`, with
    /// `len_legalese` set explicitly on the index.
    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
        let dictionary = TokenDictionary::new_with_legalese(entries);
        let mut index = LicenseIndex::new(dictionary);
        index.len_legalese = len_legalese;
        index
    }
1787
    /// A non-empty internal rule URL must be carried over to the converted match
    /// unchanged, and a detection with an expression must produce no clues.
    #[test]
    fn test_convert_detection_to_model_preserves_rule_url() {
        let detection = make_detection(
            "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE",
        );

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(
            converted.matches[0].rule_url.as_deref(),
            Some(
                "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE"
            )
        );
        assert!(clues.is_empty());
    }
1806
    /// An empty internal rule URL must become `None` on the converted match rather
    /// than an empty string.
    #[test]
    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
        let detection = make_detection("");

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(converted.matches[0].rule_url, None);
        assert!(clues.is_empty());
    }
1818
    /// The score must pass through conversion unchanged, while the coverage is
    /// reported rounded to two decimals (33.334 becomes 33.33).
    #[test]
    fn test_convert_detection_to_model_rounds_match_coverage() {
        let mut detection = make_detection("");
        detection.matches[0].score = MatchScore::from_percentage(81.82);
        detection.matches[0].match_coverage = 33.334;

        let (converted, clues) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let converted = converted.expect("detection should convert");

        assert_eq!(
            converted.matches[0].score,
            MatchScore::from_percentage(81.82)
        );
        assert_eq!(converted.matches[0].match_coverage, Some(33.33));
        assert!(clues.is_empty());
    }
1836
/// A detection without any license expression or identifier yields no
/// converted detection; its single match is reported as a license clue.
#[test]
fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
    let mut source = make_detection(
        "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
    );

    // Strip everything that would make this a full detection.
    source.license_expression = None;
    source.license_expression_spdx = None;
    source.identifier = None;

    // Turn the only match into a clue-kind match.
    {
        let first = &mut source.matches[0];
        first.license_expression = String::from("unknown-license-reference");
        first.license_expression_spdx =
            Some(String::from("LicenseRef-scancode-unknown-license-reference"));
        first.rule_identifier = String::from("license-clue_1.RULE");
        first.rule_kind = RuleKind::Clue;
    }

    let scan_options = LicenseScanOptions {
        include_text: true,
        min_score: 0,
        ..LicenseScanOptions::default()
    };
    let (model, license_clues) =
        convert_detection_to_model(&source, scan_options, "clue text", None);

    assert!(model.is_none());
    assert_eq!(license_clues.len(), 1);

    let clue = &license_clues[0];
    assert_eq!(clue.license_expression, "unknown-license-reference");
    assert_eq!(
        clue.license_expression_spdx,
        "LicenseRef-scancode-unknown-license-reference"
    );
    assert_eq!(clue.rule_identifier.as_deref(), Some("license-clue_1.RULE"));
    assert_eq!(clue.matched_text.as_deref(), Some("MIT"));
    assert_eq!(clue.matched_text_diagnostics, None);
}
1876
/// Processing a file that carries a PDF header but no valid PDF body must
/// not leave any entry in `scan_errors`.
#[test]
fn test_process_file_suppresses_non_actionable_pdf_extraction_failure() {
    let scratch = tempdir().expect("tempdir");
    let pdf_path = scratch.path().join("broken.pdf");
    fs::write(
        &pdf_path,
        b"%PDF-1.7\nthis is not a valid pdf object graph\n",
    )
    .expect("write malformed pdf");

    let pdf_metadata = fs::metadata(&pdf_path).expect("metadata");
    let quiet_progress = ScanProgress::new(ProgressMode::Quiet);
    let license_options = LicenseScanOptions::default();
    let text_options = TextDetectionOptions::default();

    let scanned = process_file(
        &pdf_path,
        &pdf_metadata,
        &quiet_progress,
        None,
        license_options,
        &text_options,
    );

    assert!(scanned.scan_errors.is_empty());
}
1897
/// When a stage-specific timeout message is already recorded, the generic
/// processing-timeout message is not appended on top of it.
#[test]
fn test_processing_timeout_is_not_duplicated_after_stage_specific_timeout() {
    // Start time two seconds in the past against a one-second budget.
    let two_seconds_ago = Instant::now() - Duration::from_secs(2);
    let mut errors = vec![String::from("Timeout before license scan (> 1.00s)")];

    maybe_record_processing_timeout(&mut errors, two_seconds_ago, 1.0);

    assert_eq!(errors, vec!["Timeout before license scan (> 1.00s)"]);
}
1907
/// With no prior errors and an elapsed time beyond the budget, exactly one
/// generic processing-timeout message is recorded.
#[test]
fn test_processing_timeout_is_recorded_when_no_timeout_error_exists() {
    // Start time two seconds in the past against a one-second budget.
    let two_seconds_ago = Instant::now() - Duration::from_secs(2);
    let mut errors = Vec::new();

    maybe_record_processing_timeout(&mut errors, two_seconds_ago, 1.0);

    let expected = vec!["Processing interrupted due to timeout after 1.00 seconds"];
    assert_eq!(errors, expected);
}
1920
/// A large JSON document classified as non-source text and named
/// `resolution.json` is shortened to at most
/// `LARGE_NON_SOURCE_JSON_LICENSE_TEXT_BYTES`.
#[test]
fn test_cap_non_source_json_license_text_truncates_large_json() {
    let filler = "x".repeat(200_000);
    let oversized = format!("{{\"items\":\"{}\"}}", filler);
    let json_classification = FileInfoClassification {
        mime_type: String::from("application/json"),
        file_type: String::from("JSON text data"),
        programming_language: None,
        is_binary: false,
        is_text: true,
        is_archive: false,
        is_media: false,
        is_source: false,
        is_script: false,
    };
    let json_path = Path::new("resolution.json");

    let capped = cap_non_source_json_license_text(json_path, &json_classification, &oversized);

    assert!(capped.len() <= LARGE_NON_SOURCE_JSON_LICENSE_TEXT_BYTES);
    assert!(capped.len() < oversized.len());
}
1945
/// The same kind of large JSON payload is returned untouched when the file
/// is named `bundle.js.map`.
#[test]
fn test_cap_non_source_json_license_text_keeps_sourcemaps_intact() {
    let filler = "x".repeat(200_000);
    let sourcemap = format!("{{\"mappings\":\"{}\"}}", filler);
    let json_classification = FileInfoClassification {
        mime_type: String::from("application/json"),
        file_type: String::from("JSON text data"),
        programming_language: None,
        is_binary: false,
        is_text: true,
        is_archive: false,
        is_media: false,
        is_source: false,
        is_script: false,
    };
    let map_path = Path::new("bundle.js.map");

    let capped = cap_non_source_json_license_text(map_path, &json_classification, &sourcemap);

    assert_eq!(capped.as_ref(), sourcemap);
}
1969
/// With text, text diagnostics and detection diagnostics all enabled, the
/// converted match carries the detection log, the matched text, and a
/// bracketed diagnostics rendering that differs from the plain text.
#[test]
fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
    let text = concat!(
        "Reproduction and distribution of this file, with or without modification, are\n",
        "permitted in any medium without royalties provided the copyright notice\n",
        "and this notice are preserved. This file is offered as-is, without any warranties.\n",
    );
    let index = create_test_index(
        &[
            ("reproduction", 0),
            ("distribution", 1),
            ("file", 2),
            ("without", 3),
            ("modification", 4),
            ("permitted", 5),
            ("medium", 6),
            ("royalties", 7),
            ("provided", 8),
            ("copyright", 9),
            ("notice", 10),
            ("preserved", 11),
            ("offered", 12),
            ("warranties", 13),
        ],
        14,
    );
    let query = Query::from_extracted_text(text, &index, false).expect("query should build");
    let token_count = query.tokens.len();

    // Every query position except index 9 is part of the matched region.
    let matched_positions: Vec<_> = (0..token_count)
        .filter(|&position| position != 9)
        .collect();

    let mut source = make_detection(
        "https://github.com/aboutcode-org/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
    );
    source.detection_log = vec![String::from("imperfect-match-coverage")];
    source.identifier = Some(String::from("fsf_ap-test"));
    {
        let first = &mut source.matches[0];
        first.license_expression = String::from("fsf-ap");
        first.license_expression_spdx = Some(String::from("FSFAP"));
        first.rule_identifier = String::from("fsf-ap.LICENSE");
        first.matched_text = None;
        first.start_line = LineNumber::ONE;
        first.end_line = LineNumber::new(3).unwrap();
        first.start_token = 0;
        first.end_token = token_count;
        first.coordinates =
            MatchCoordinates::query_region(PositionSpan::from_positions(matched_positions));
    }

    let scan_options = LicenseScanOptions {
        include_text: true,
        include_text_diagnostics: true,
        include_diagnostics: true,
        unknown_licenses: false,
        min_score: 0,
    };
    let (model, license_clues) =
        convert_detection_to_model(&source, scan_options, text, Some(&query));
    let model = model.expect("detection should convert");

    assert!(license_clues.is_empty());
    assert_eq!(model.detection_log, vec!["imperfect-match-coverage"]);

    let first = &model.matches[0];
    assert_eq!(first.matched_text.as_deref(), Some(text.trim_end()));

    let rendered = first
        .matched_text_diagnostics
        .as_deref()
        .expect("diagnostics should be present");
    assert!(rendered.contains('['));
    assert!(rendered.contains(']'));
    assert_ne!(rendered, text.trim_end());
}
2048
/// With the binary-string flag set, the sample address `6h@fo.lwft` and the
/// URL `http://gmail.com/` are both left out of the resulting file info.
#[test]
fn test_extract_email_url_information_skips_binary_string_text() {
    let detection_options = TextDetectionOptions {
        detect_emails: true,
        detect_urls: true,
        max_emails: 50,
        max_urls: 50,
        timeout_seconds: 120.0,
        collect_info: false,
        detect_packages: false,
        detect_application_packages: false,
        detect_system_packages: false,
        detect_packages_in_compiled: false,
        detect_copyrights: false,
        detect_generated: false,
    };
    let sample = "contact 6h@fo.lwft and visit http://gmail.com/";

    let mut info_builder = FileInfoBuilder::default();
    extract_email_url_information(&mut info_builder, sample, &detection_options, true);

    let info = info_builder
        .name(String::from("binary.bin"))
        .base_name(String::from("binary"))
        .extension(String::from(".bin"))
        .path(String::from("binary.bin"))
        .file_type(FileType::File)
        .size(1)
        .build()
        .expect("builder should produce file info");

    assert!(info.emails.is_empty(), "emails: {:?}", info.emails);
    assert!(info.urls.is_empty(), "urls: {:?}", info.urls);
}
2087
/// With the binary-string flag set, the GNU bug address and the coreutils
/// URL from the sample line are both reported, one entry each.
#[test]
fn test_extract_email_url_information_keeps_good_binary_contacts() {
    let detection_options = TextDetectionOptions {
        detect_emails: true,
        detect_urls: true,
        max_emails: 50,
        max_urls: 50,
        timeout_seconds: 120.0,
        collect_info: false,
        detect_packages: false,
        detect_application_packages: false,
        detect_system_packages: false,
        detect_packages_in_compiled: false,
        detect_copyrights: false,
        detect_generated: false,
    };
    let sample =
        "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/";

    let mut info_builder = FileInfoBuilder::default();
    extract_email_url_information(&mut info_builder, sample, &detection_options, true);

    let info = info_builder
        .name(String::from("binary.bin"))
        .base_name(String::from("binary"))
        .extension(String::from(".bin"))
        .path(String::from("binary.bin"))
        .file_type(FileType::File)
        .size(1)
        .build()
        .expect("builder should produce file info");

    assert_eq!(info.emails.len(), 1, "emails: {:?}", info.emails);
    assert_eq!(info.emails[0].email, "bug-coreutils@gnu.org");
    assert_eq!(info.urls.len(), 1, "urls: {:?}", info.urls);
    assert_eq!(info.urls[0].url, "https://www.gnu.org/software/coreutils/");
}
2128
/// With `max_emails` set to 2 and one address repeated in the input, the
/// two distinct addresses are reported in order of first appearance.
#[test]
fn test_extract_email_url_information_deduplicates_binary_emails_before_cap() {
    let detection_options = TextDetectionOptions {
        detect_emails: true,
        detect_urls: false,
        max_emails: 2,
        max_urls: 50,
        timeout_seconds: 120.0,
        collect_info: false,
        detect_packages: false,
        detect_application_packages: false,
        detect_system_packages: false,
        detect_packages_in_compiled: false,
        detect_copyrights: false,
        detect_generated: false,
    };
    let sample = "first jakub@redhat.com second jakub@redhat.com third contyk@redhat.com";

    let mut info_builder = FileInfoBuilder::default();
    extract_email_url_information(&mut info_builder, sample, &detection_options, true);

    let info = info_builder
        .name(String::from("binary.bin"))
        .base_name(String::from("binary"))
        .extension(String::from(".bin"))
        .path(String::from("binary.bin"))
        .file_type(FileType::File)
        .size(1)
        .build()
        .expect("builder should produce file info");

    assert_eq!(info.emails.len(), 2, "emails: {:?}", info.emails);
    assert_eq!(info.emails[0].email, "jakub@redhat.com");
    assert_eq!(info.emails[1].email, "contyk@redhat.com");
}
2168
/// A `(c)` marker followed by symbol-heavy noise is not accepted as a
/// copyright candidate.
#[test]
fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
    let accepted = is_binary_string_copyright_candidate(
        "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y",
    );

    assert!(!accepted);
}
2174
/// A readable notice with a holder and a year is accepted as a copyright
/// candidate.
#[test]
fn test_binary_string_copyright_candidate_keeps_real_notice() {
    let accepted = is_binary_string_copyright_candidate("Copyright nexB and others (c) 2012");

    assert!(accepted);
}
2180
/// The phrase `Copyright - split out libs` is not accepted as a copyright
/// candidate.
#[test]
fn test_binary_string_copyright_candidate_rejects_changelog_phrase() {
    let accepted = is_binary_string_copyright_candidate("Copyright - split out libs");

    assert!(!accepted);
}
2187
/// The address `6h@fo.lwft` is not accepted as an email candidate.
#[test]
fn test_binary_string_email_candidate_rejects_gibberish() {
    let accepted = is_binary_string_email_candidate("6h@fo.lwft");

    assert!(!accepted);
}
2192
/// The address `bug-coreutils@gnu.org` is accepted as an email candidate.
#[test]
fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
    let accepted = is_binary_string_email_candidate("bug-coreutils@gnu.org");

    assert!(accepted);
}
2197
/// The URL `http://ftp.so/` is not accepted as a URL candidate.
#[test]
fn test_binary_string_url_candidate_rejects_short_fake_host() {
    let accepted = is_binary_string_url_candidate("http://ftp.so/");

    assert!(!accepted);
}
2202
/// The coreutils page under `www.gnu.org` is accepted as a URL candidate.
#[test]
fn test_binary_string_url_candidate_keeps_gnu_help_url() {
    let accepted = is_binary_string_url_candidate("https://www.gnu.org/software/coreutils/");

    assert!(accepted);
}
2209
/// The URL `http://gmail.com/` is not accepted as a URL candidate.
#[test]
fn test_binary_string_url_candidate_rejects_bare_root_domain() {
    let accepted = is_binary_string_url_candidate("http://gmail.com/");

    assert!(!accepted);
}
2214
/// The URL `http://gcc.gnu.org` (subdomain, no path) is accepted as a URL
/// candidate.
#[test]
fn test_binary_string_url_candidate_keeps_project_subdomain_root() {
    let accepted = is_binary_string_url_candidate("http://gcc.gnu.org");

    assert!(accepted);
}
2219
/// The URL `https://publicsuffix.org/` is accepted as a URL candidate.
#[test]
fn test_binary_string_url_candidate_keeps_long_org_root_domain() {
    let accepted = is_binary_string_url_candidate("https://publicsuffix.org/");

    assert!(accepted);
}
2224
/// The URL `http://tukaani.org/xz/` is accepted as a URL candidate.
#[test]
fn test_binary_string_url_candidate_keeps_short_project_path() {
    let accepted = is_binary_string_url_candidate("http://tukaani.org/xz/");

    assert!(accepted);
}
2229
/// Short junk glued onto the end of a host name (`0`, `1!0`) is removed
/// during normalization.
#[test]
fn test_normalize_binary_string_url_trims_certificate_host_tail_noise() {
    let cases = [
        ("http://ocsp.digicert.com0/", "http://ocsp.digicert.com/"),
        ("http://www.digicert.com1!0/", "http://www.digicert.com/"),
    ];

    for (raw, cleaned) in cases {
        let normalized = normalize_binary_string_url(raw);
        assert_eq!(normalized.as_deref(), Some(cleaned));
    }
}
2241
/// A trailing `_` after the `.crt0` path ending is removed during
/// normalization.
#[test]
fn test_normalize_binary_string_url_trims_trailing_path_noise() {
    let raw =
        "http://cacerts.digicert.com/DigiCertTrustedG4TimeStampingRSA4096SHA2562025CA1.crt0_";
    let cleaned =
        "http://cacerts.digicert.com/DigiCertTrustedG4TimeStampingRSA4096SHA2562025CA1.crt0";

    let normalized = normalize_binary_string_url(raw);

    assert_eq!(normalized.as_deref(), Some(cleaned));
}
2253
/// URLs that are already in their cleaned form come back unchanged.
#[test]
fn test_normalize_binary_string_url_preserves_clean_certificate_urls() {
    let already_clean = [
        "http://ocsp.digicert.com/",
        "http://cacerts.digicert.com/DigiCertTrustedG4TimeStampingRSA4096SHA2562025CA1.crt0",
    ];

    for url in already_clean {
        let normalized = normalize_binary_string_url(url);
        assert_eq!(normalized.as_deref(), Some(url));
    }
}
2269
/// A host with a longer trailing suffix (`com0evil`) is not repaired;
/// normalization returns `None` for it.
#[test]
fn test_normalize_binary_string_url_does_not_trim_long_host_suffixes() {
    let normalized = normalize_binary_string_url("http://example.com0evil/");

    assert_eq!(normalized, None);
}
2277
/// An underscore inside the path (`/path_/`) is kept by normalization.
#[test]
fn test_normalize_binary_string_url_does_not_trim_legitimate_path_suffix() {
    let url = "http://example.com/path_/";

    let normalized = normalize_binary_string_url(url);

    assert_eq!(normalized.as_deref(), Some("http://example.com/path_/"));
}
2285
/// Three noisy certificate URLs go in with `max_urls` set to 2; the two
/// reported URLs are the normalized `ocsp` and `www` DigiCert hosts.
#[test]
fn test_extract_email_url_information_caps_after_binary_normalization() {
    let noisy_urls = [
        "http://ocsp.digicert.com0/",
        "http://ocsp.digicert.com0a/",
        "http://www.digicert.com1!0/",
    ];
    let sample = noisy_urls.join("\n");
    let detection_options = TextDetectionOptions {
        detect_urls: true,
        max_urls: 2,
        ..TextDetectionOptions::default()
    };

    let mut info_builder = FileInfoBuilder::default();
    extract_email_url_information(&mut info_builder, &sample, &detection_options, true);

    let info = info_builder
        .name(String::from("binary.txt"))
        .base_name(String::from("binary"))
        .extension(String::from(".txt"))
        .path(String::from("binary.txt"))
        .file_type(FileType::File)
        .size(0)
        .build()
        .expect("file info");

    assert_eq!(info.urls.len(), 2);
    assert_eq!(info.urls[0].url, "http://ocsp.digicert.com/");
    assert_eq!(info.urls[1].url, "http://www.digicert.com/");
}
2316
/// A full name followed by an angle-bracketed email is accepted as an
/// author candidate.
#[test]
fn test_binary_string_author_candidate_keeps_named_author_with_email() {
    let accepted = is_binary_string_author_candidate("Andreas Schneider <asn@redhat.com>");

    assert!(accepted);
}
2323
/// Symbol-heavy noise is not accepted as an author candidate.
#[test]
fn test_binary_string_author_candidate_rejects_gibberish() {
    let accepted = is_binary_string_author_candidate("S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9");

    assert!(!accepted);
}
2330
/// A changelog-style sentence without any name/email pair is not accepted
/// as an author candidate.
#[test]
fn test_binary_string_author_candidate_rejects_changelog_phrase() {
    let accepted = is_binary_string_author_candidate(
        "Developers can enable them. - revert news user back to",
    );

    assert!(!accepted);
}
2337
/// From a `Patch by <name> <email>` line, the name and angle-bracketed
/// email are returned without the leading words.
#[test]
fn test_extract_named_author_from_binary_line_recovers_by_prefix() {
    let author =
        extract_named_author_from_binary_line("Patch by Andreas Schneider <asn@redhat.com>");

    assert_eq!(
        author.as_deref(),
        Some("Andreas Schneider <asn@redhat.com>")
    );
}
2345
/// When the email is given in parentheses after a `by <name>` phrase, the
/// name together with the parenthesized email is returned.
#[test]
fn test_extract_named_author_from_binary_line_recovers_parenthesized_email() {
    let line = "same for both OpenSSL and NSS by Rob Crittenden (rcritten@redhat.com)";

    let author = extract_named_author_from_binary_line(line);

    assert_eq!(
        author.as_deref(),
        Some("Rob Crittenden (rcritten@redhat.com)")
    );
}
2355
/// A `<name> <email> - <version>` line yields no author.
#[test]
fn test_extract_named_author_from_binary_line_rejects_plain_changelog_packager_line() {
    let line = "Rob Crittenden <rcritten@redhat.com> - 3.11.7-9";

    let author = extract_named_author_from_binary_line(line);

    assert_eq!(author, None);
}
2365
/// When a `by` phrase is followed only by an email address, that bare
/// address is returned as the author.
#[test]
fn test_extract_named_author_from_binary_line_keeps_email_only_review_author() {
    let line = "Changes as per initial review by panemade@gmail.com";

    let author = extract_named_author_from_binary_line(line);

    assert_eq!(author.as_deref(), Some("panemade@gmail.com"));
}
2375
/// A line carrying two different email addresses is not accepted as an
/// author candidate.
#[test]
fn test_binary_string_author_candidate_rejects_multiple_emails_on_one_line() {
    let accepted = is_binary_string_author_candidate(
        "Rob Crittenden (rcritten@redhat.com) jakub@redhat.com",
    );

    assert!(!accepted);
}
2382
/// For the text `alpha MIT omega`, where only `alpha` and `mit` are in the
/// index and the match covers position 1, the reported percentage is 33.33.
#[test]
fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
    let sample = "alpha MIT omega";
    let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
    let query = Query::from_extracted_text(sample, &index, false).expect("query should build");

    let mut source = make_detection("");
    {
        let first = &mut source.matches[0];
        first.start_token = 1;
        first.end_token = 2;
        first.coordinates = MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
    }

    let percentage = compute_percentage_of_license_text(&query, &[source]);

    assert_eq!(percentage, 33.33);
}
2398
/// Two fingerprints computed with identical options except for `min_score`
/// (0 versus 70) must differ.
#[test]
fn test_scan_options_fingerprint_changes_with_license_score() {
    let text_options = crate::scanner::TextDetectionOptions::default();

    let unfiltered_options = LicenseScanOptions {
        min_score: 0,
        ..LicenseScanOptions::default()
    };
    let filtered_options = LicenseScanOptions {
        min_score: 70,
        ..LicenseScanOptions::default()
    };

    let unfiltered = scan_options_fingerprint(&text_options, unfiltered_options, None);
    let filtered = scan_options_fingerprint(&text_options, filtered_options, None);

    assert_ne!(unfiltered, filtered);
}
2421
/// A Go file named `scanner_test.go` is reported as non-production source.
#[test]
fn test_is_go_non_production_source_for_test_filename() {
    let scratch = tempdir().unwrap();
    let go_file = scratch.path().join("scanner_test.go");
    fs::write(&go_file, "package scanner\n").unwrap();

    let non_production = is_go_non_production_source(&go_file).unwrap();

    assert!(non_production);
}
2430
/// A Go file whose first line is `//go:build test` is reported as
/// non-production source even though its name is plain `scanner.go`.
#[test]
fn test_is_go_non_production_source_for_build_tag() {
    let scratch = tempdir().unwrap();
    let go_file = scratch.path().join("scanner.go");
    fs::write(&go_file, "//go:build test\n\npackage scanner\n").unwrap();

    let non_production = is_go_non_production_source(&go_file).unwrap();

    assert!(non_production);
}
2439
/// A plain `scanner.go` containing only a package clause is not reported
/// as non-production source.
#[test]
fn test_is_go_non_production_source_for_regular_go_file() {
    let scratch = tempdir().unwrap();
    let go_file = scratch.path().join("scanner.go");
    fs::write(&go_file, "package scanner\n").unwrap();

    let non_production = is_go_non_production_source(&go_file).unwrap();

    assert!(!non_production);
}
2448}
provenant/scanner/process.rs

provenant/scanner/
process.rs