Skip to main content

provenant/scanner/
process.rs

1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::{try_parse_compiled_bytes, try_parse_file};
3use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
4use crate::utils::text::{
5    remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
6};
7use anyhow::Error;
8use rayon::prelude::*;
9use std::fs::{self, File};
10use std::io::{Read, Write};
11use std::path::Path;
12use std::sync::Arc;
13use std::time::{Duration, Instant};
14
15use crate::copyright::{
16    self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
17};
18use crate::finder::{self, DetectionConfig};
19use crate::license_detection::PositionSet;
20use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
21use crate::license_detection::query::Query;
22use crate::models::{
23    Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
24    Match, OutputEmail, OutputURL,
25};
26use crate::progress::ScanProgress;
27use crate::scanner::collect::CollectedPaths;
28use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
29use crate::utils::file::{
30    ExtractedTextKind, classify_file_info, extract_text_for_detection, get_creation_date,
31};
32use crate::utils::generated::generated_code_hints_from_bytes;
33use tempfile::TempDir;
34
/// Marker pairs for PEM-encoded certificate blocks.
///
/// Each entry is `(begin_marker, end_marker)`.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    // Plain X.509 certificate block.
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    // "Trusted certificate" block variant.
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
42
43pub fn process_collected(
44    collected: &CollectedPaths,
45    progress: Arc<ScanProgress>,
46    license_engine: Option<Arc<LicenseDetectionEngine>>,
47    license_options: LicenseScanOptions,
48    text_options: &TextDetectionOptions,
49) -> ProcessResult {
50    let mut all_files: Vec<FileInfo> = collected
51        .files
52        .par_iter()
53        .map(|(path, metadata)| {
54            let file_entry = process_file(
55                path,
56                metadata,
57                progress.as_ref(),
58                license_engine.clone(),
59                license_options,
60                text_options,
61            );
62            progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
63            file_entry
64        })
65        .collect();
66
67    for (path, metadata) in &collected.directories {
68        all_files.push(process_directory(
69            path,
70            metadata,
71            text_options.collect_info,
72            license_engine.is_some(),
73        ));
74    }
75
76    ProcessResult {
77        files: all_files,
78        excluded_count: collected.excluded_count,
79    }
80}
81
82pub fn process_collected_with_memory_limit(
83    collected: &CollectedPaths,
84    progress: Arc<ScanProgress>,
85    license_engine: Option<Arc<LicenseDetectionEngine>>,
86    license_options: LicenseScanOptions,
87    text_options: &TextDetectionOptions,
88    max_in_memory: i64,
89) -> ProcessResult {
90    if max_in_memory == 0 {
91        return process_collected(
92            collected,
93            progress,
94            license_engine,
95            license_options,
96            text_options,
97        );
98    }
99
100    let memory_limit = if max_in_memory < 0 {
101        0
102    } else {
103        max_in_memory as usize
104    };
105    let chunk_size = if max_in_memory < 0 {
106        256
107    } else {
108        memory_limit.max(1)
109    };
110
111    let mut retained_files = Vec::new();
112    let mut spill_store = None;
113
114    for chunk in collected.files.chunks(chunk_size) {
115        let processed_chunk: Vec<FileInfo> = chunk
116            .par_iter()
117            .map(|(path, metadata)| {
118                let file_entry = process_file(
119                    path,
120                    metadata,
121                    progress.as_ref(),
122                    license_engine.clone(),
123                    license_options,
124                    text_options,
125                );
126                progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
127                file_entry
128            })
129            .collect();
130
131        retain_or_spill_chunk(
132            processed_chunk,
133            &mut retained_files,
134            &mut spill_store,
135            memory_limit,
136        );
137    }
138
139    for (path, metadata) in &collected.directories {
140        let entry = process_directory(
141            path,
142            metadata,
143            text_options.collect_info,
144            license_engine.is_some(),
145        );
146        retain_or_spill_chunk(
147            vec![entry],
148            &mut retained_files,
149            &mut spill_store,
150            memory_limit,
151        );
152    }
153
154    if let Some(spill_store) = spill_store {
155        retained_files.extend(spill_store.load_all());
156    }
157
158    ProcessResult {
159        files: retained_files,
160        excluded_count: collected.excluded_count,
161    }
162}
163
164fn retain_or_spill_chunk(
165    chunk: Vec<FileInfo>,
166    retained_files: &mut Vec<FileInfo>,
167    spill_store: &mut Option<FileInfoSpillStore>,
168    memory_limit: usize,
169) {
170    if memory_limit == 0 {
171        spill_store
172            .get_or_insert_with(FileInfoSpillStore::new)
173            .spill(chunk);
174        return;
175    }
176
177    let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
178    if remaining_capacity >= chunk.len() && spill_store.is_none() {
179        retained_files.extend(chunk);
180        return;
181    }
182
183    let mut chunk_iter = chunk.into_iter();
184    retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
185    let overflow: Vec<FileInfo> = chunk_iter.collect();
186    if !overflow.is_empty() {
187        spill_store
188            .get_or_insert_with(FileInfoSpillStore::new)
189            .spill(overflow);
190    }
191}
192
193struct FileInfoSpillStore {
194    temp_dir: TempDir,
195    batch_index: usize,
196}
197
198impl FileInfoSpillStore {
199    fn new() -> Self {
200        Self {
201            temp_dir: TempDir::new().expect("create spill dir"),
202            batch_index: 0,
203        }
204    }
205
206    fn spill(&mut self, files: Vec<FileInfo>) {
207        let path = self
208            .temp_dir
209            .path()
210            .join(format!("batch-{:06}.json.zst", self.batch_index));
211        self.batch_index += 1;
212
213        let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
214        let file = File::create(path).expect("create spill batch file");
215        let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
216        encoder
217            .write_all(&payload)
218            .expect("write spilled file batch");
219        encoder.finish().expect("finish spill encoder");
220    }
221
222    fn load_all(self) -> Vec<FileInfo> {
223        let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
224            .expect("read spill dir")
225            .filter_map(Result::ok)
226            .map(|entry| entry.path())
227            .collect();
228        paths.sort();
229
230        let mut files = Vec::new();
231        for path in paths {
232            let file = File::open(path).expect("open spill batch");
233            let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
234            let mut payload = Vec::new();
235            decoder.read_to_end(&mut payload).expect("read spill batch");
236            let mut batch: Vec<FileInfo> =
237                serde_json::from_slice(&payload).expect("decode spilled file batch");
238            files.append(&mut batch);
239        }
240        files
241    }
242}
243
244fn process_file(
245    path: &Path,
246    metadata: &fs::Metadata,
247    progress: &ScanProgress,
248    license_engine: Option<Arc<LicenseDetectionEngine>>,
249    license_options: LicenseScanOptions,
250    text_options: &TextDetectionOptions,
251) -> FileInfo {
252    let mut scan_errors: Vec<String> = vec![];
253    let mut file_info_builder = FileInfoBuilder::default();
254    let license_enabled = license_engine.is_some();
255
256    let started = Instant::now();
257
258    let mut generated_flag = None;
259    let mut is_source_file = false;
260    match extract_information_from_content(
261        &mut file_info_builder,
262        &mut scan_errors,
263        path,
264        progress,
265        license_engine,
266        license_options,
267        text_options,
268    ) {
269        Ok((is_generated, sha256, is_source)) => {
270            generated_flag = is_generated;
271            is_source_file = is_source;
272            let _ = sha256;
273        }
274        Err(e) => scan_errors.push(e.to_string()),
275    };
276
277    if is_timeout_exceeded(started, text_options.timeout_seconds) {
278        scan_errors.push(format!(
279            "Processing interrupted due to timeout after {:.2} seconds",
280            text_options.timeout_seconds
281        ));
282    }
283
284    let mut file_info = file_info_builder
285        .name(path.file_name().unwrap().to_string_lossy().to_string())
286        .base_name(
287            path.file_stem()
288                .unwrap_or_default()
289                .to_string_lossy()
290                .to_string(),
291        )
292        .extension(
293            path.extension()
294                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
295        )
296        .path(path.to_string_lossy().to_string())
297        .file_type(FileType::File)
298        .size(metadata.len())
299        .date(
300            text_options
301                .collect_info
302                .then(|| get_creation_date(metadata))
303                .flatten(),
304        )
305        .scan_errors(scan_errors)
306        .build()
307        .expect("FileInformationBuild not completely initialized");
308
309    if text_options.collect_info {
310        file_info.is_source = Some(is_source_file);
311    }
312
313    if file_info.programming_language.as_deref() == Some("Go")
314        && is_go_non_production_source(path).unwrap_or(false)
315    {
316        file_info.is_source = Some(false);
317    }
318
319    if text_options.detect_generated {
320        file_info.is_generated = Some(generated_flag.unwrap_or(false));
321    }
322
323    if file_info.percentage_of_license_text.is_none() && license_enabled {
324        file_info.percentage_of_license_text = Some(0.0);
325    }
326
327    file_info
328}
329
330fn extract_information_from_content(
331    file_info_builder: &mut FileInfoBuilder,
332    scan_errors: &mut Vec<String>,
333    path: &Path,
334    progress: &ScanProgress,
335    license_engine: Option<Arc<LicenseDetectionEngine>>,
336    license_options: LicenseScanOptions,
337    text_options: &TextDetectionOptions,
338) -> Result<(Option<bool>, String, bool), Error> {
339    let started = Instant::now();
340    let buffer = fs::read(path)?;
341    let license_enabled = license_engine.is_some();
342
343    if is_timeout_exceeded(started, text_options.timeout_seconds) {
344        return Err(Error::msg(format!(
345            "Timeout while reading file content (> {:.2}s)",
346            text_options.timeout_seconds
347        )));
348    }
349
350    let sha256 = calculate_sha256(&buffer);
351    let is_generated = text_options
352        .detect_generated
353        .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
354    let classification = classify_file_info(path, &buffer);
355
356    if text_options.collect_info {
357        file_info_builder
358            .sha1(Some(calculate_sha1(&buffer)))
359            .md5(Some(calculate_md5(&buffer)))
360            .sha256(Some(sha256.clone()))
361            .programming_language(classification.programming_language.clone())
362            .mime_type(Some(classification.mime_type.clone()))
363            .file_type_label(Some(classification.file_type.clone()))
364            .sha1_git(Some(calculate_sha1_git(&buffer)))
365            .is_binary(Some(classification.is_binary))
366            .is_text(Some(classification.is_text))
367            .is_archive(Some(classification.is_archive))
368            .is_media(Some(classification.is_media))
369            .is_source(Some(classification.is_source))
370            .is_script(Some(classification.is_script))
371            .files_count(Some(0))
372            .dirs_count(Some(0))
373            .size_count(Some(0));
374    }
375
376    if should_skip_text_detection(path, &buffer) {
377        return Ok((is_generated, sha256, classification.is_source));
378    }
379
380    // Package parsing and text-based detection (copyright, license) are independent.
381    // Python ScanCode runs all enabled plugins on every file, so we do the same.
382    if text_options.detect_packages {
383        let started = Instant::now();
384        let parse_result = try_parse_file(path).or_else(|| {
385            text_options
386                .detect_packages_in_compiled
387                .then(|| try_parse_compiled_bytes(&buffer))
388                .flatten()
389        });
390
391        if let Some(parse_result) = parse_result {
392            let packages = parse_result
393                .packages
394                .into_iter()
395                .filter(|package| {
396                    let is_compiled_package = package
397                        .datasource_id
398                        .as_ref()
399                        .is_some_and(is_compiled_datasource);
400                    let is_system_package = package
401                        .datasource_id
402                        .as_ref()
403                        .is_some_and(is_system_datasource);
404                    if is_compiled_package {
405                        text_options.detect_packages_in_compiled
406                    } else if is_system_package {
407                        text_options.detect_system_packages
408                    } else {
409                        text_options.detect_application_packages
410                    }
411                })
412                .collect();
413            file_info_builder.package_data(packages);
414            scan_errors.extend(parse_result.scan_errors);
415        }
416        progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
417    }
418
419    if is_timeout_exceeded(started, text_options.timeout_seconds) {
420        return Err(Error::msg(format!(
421            "Timeout while extracting package/text metadata (> {:.2}s)",
422            text_options.timeout_seconds
423        )));
424    }
425
426    let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
427    let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
428
429    if is_timeout_exceeded(started, text_options.timeout_seconds) {
430        return Err(Error::msg(format!(
431            "Timeout while extracting text content (> {:.2}s)",
432            text_options.timeout_seconds
433        )));
434    }
435
436    if text_content.is_empty() {
437        return Ok((is_generated, sha256, classification.is_source));
438    }
439
440    if text_options.detect_copyrights {
441        extract_copyright_information(
442            file_info_builder,
443            path,
444            &text_content,
445            text_options.timeout_seconds,
446            from_binary_strings,
447        );
448    }
449    extract_email_url_information(
450        file_info_builder,
451        &text_content,
452        text_options,
453        from_binary_strings,
454    );
455
456    if is_timeout_exceeded(started, text_options.timeout_seconds) {
457        return Err(Error::msg(format!(
458            "Timeout before license scan (> {:.2}s)",
459            text_options.timeout_seconds
460        )));
461    }
462    // Handle source map files specially
463    let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
464        if let Some(sourcemap_content) =
465            crate::utils::sourcemap::extract_sourcemap_content(&text_content)
466        {
467            sourcemap_content
468        } else {
469            text_content
470        }
471    } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
472        remove_verbatim_escape_sequences(&text_content)
473    } else {
474        text_content
475    };
476
477    if license_enabled {
478        let started = Instant::now();
479        extract_license_information(
480            file_info_builder,
481            scan_errors,
482            path,
483            text_content_for_license_detection,
484            license_engine,
485            license_options,
486            from_binary_strings,
487        )?;
488        progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
489    } else {
490        extract_license_information(
491            file_info_builder,
492            scan_errors,
493            path,
494            text_content_for_license_detection,
495            license_engine,
496            license_options,
497            from_binary_strings,
498        )?;
499    }
500
501    Ok((is_generated, sha256, classification.is_source))
502}
503
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is not finite (NaN, infinite) or not strictly positive disables the
/// check, so the function returns `false` for it.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    let timeout_disabled = !timeout_seconds.is_finite() || timeout_seconds <= 0.0;
    if timeout_disabled {
        return false;
    }

    let elapsed_seconds = started.elapsed().as_secs_f64();
    elapsed_seconds > timeout_seconds
}
509
510fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
511    matches!(
512        datasource_id,
513        DatasourceId::AlpineInstalledDb
514            | DatasourceId::DebianDistrolessInstalledDb
515            | DatasourceId::DebianInstalledFilesList
516            | DatasourceId::DebianInstalledMd5Sums
517            | DatasourceId::DebianInstalledStatusDb
518            | DatasourceId::FreebsdCompactManifest
519            | DatasourceId::RpmInstalledDatabaseBdb
520            | DatasourceId::RpmInstalledDatabaseNdb
521            | DatasourceId::RpmInstalledDatabaseSqlite
522            | DatasourceId::RpmYumdb
523    )
524}
525
526fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
527    matches!(
528        datasource_id,
529        DatasourceId::GoBinary | DatasourceId::RustBinary
530    )
531}
532
533fn extract_copyright_information(
534    file_info_builder: &mut FileInfoBuilder,
535    path: &Path,
536    text_content: &str,
537    timeout_seconds: f64,
538    from_binary_strings: bool,
539) {
540    // CREDITS files get special handling (Linux kernel style).
541    if copyright::is_credits_file(path) {
542        let author_detections = copyright::detect_credits_authors(text_content);
543        if !author_detections.is_empty() {
544            file_info_builder.authors(
545                author_detections
546                    .into_iter()
547                    .map(|a| Author {
548                        author: a.author,
549                        start_line: a.start_line,
550                        end_line: a.end_line,
551                    })
552                    .collect(),
553            );
554            return;
555        }
556    }
557
558    let copyright_options = CopyrightDetectionOptions {
559        max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
560            Some(Duration::from_secs_f64(timeout_seconds))
561        } else {
562            None
563        },
564        ..CopyrightDetectionOptions::default()
565    };
566
567    let (copyrights, holders, authors) =
568        copyright::detect_copyrights_with_options(text_content, &copyright_options);
569    let (copyrights, holders, authors) = if from_binary_strings {
570        prune_binary_string_detections(copyrights, holders, authors)
571    } else {
572        (copyrights, holders, authors)
573    };
574
575    file_info_builder.copyrights(
576        copyrights
577            .into_iter()
578            .map(|c| Copyright {
579                copyright: c.copyright,
580                start_line: c.start_line,
581                end_line: c.end_line,
582            })
583            .collect::<Vec<Copyright>>(),
584    );
585    file_info_builder.holders(
586        holders
587            .into_iter()
588            .map(|h| Holder {
589                holder: h.holder,
590                start_line: h.start_line,
591                end_line: h.end_line,
592            })
593            .collect::<Vec<Holder>>(),
594    );
595    file_info_builder.authors(
596        authors
597            .into_iter()
598            .map(|a| Author {
599                author: a.author,
600                start_line: a.start_line,
601                end_line: a.end_line,
602            })
603            .collect::<Vec<Author>>(),
604    );
605}
606
607fn prune_binary_string_detections(
608    copyrights: Vec<CopyrightDetection>,
609    holders: Vec<HolderDetection>,
610    _authors: Vec<AuthorDetection>,
611) -> (
612    Vec<CopyrightDetection>,
613    Vec<HolderDetection>,
614    Vec<AuthorDetection>,
615) {
616    let kept_copyrights: Vec<CopyrightDetection> = copyrights
617        .into_iter()
618        .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
619        .collect();
620
621    let kept_holders: Vec<HolderDetection> = holders
622        .into_iter()
623        .filter(|holder| {
624            kept_copyrights.iter().any(|copyright| {
625                ranges_overlap(
626                    holder.start_line,
627                    holder.end_line,
628                    copyright.start_line,
629                    copyright.end_line,
630                )
631            })
632        })
633        .collect();
634
635    (kept_copyrights, kept_holders, Vec::new())
636}
637
/// Reports whether the inclusive ranges `a_start..=a_end` and `b_start..=b_end` share at
/// least one position.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    // The ranges are disjoint exactly when one starts after the other has ended.
    let a_after_b = a_start > b_end;
    let b_after_a = b_start > a_end;
    !(a_after_b || b_after_a)
}
641
642fn is_binary_string_copyright_candidate(text: &str) -> bool {
643    if contains_year(text) {
644        return true;
645    }
646
647    let lower = text.to_ascii_lowercase();
648    let tail = if let Some(tail) = lower.strip_prefix("copyright") {
649        tail.trim()
650    } else {
651        lower.trim()
652    };
653
654    if tail.is_empty() || !has_sufficient_alphabetic_content(tail) || has_excessive_at_noise(tail) {
655        return false;
656    }
657
658    let alpha_tokens: Vec<&str> = tail
659        .split_whitespace()
660        .filter(|token| token.chars().any(|c| c.is_alphabetic()))
661        .collect();
662
663    if alpha_tokens.len() <= 1 {
664        return has_explicit_copyright_marker(text)
665            && alpha_tokens.iter().any(|token| {
666                is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
667            });
668    }
669
670    if tail.contains(',') || tail.contains(" and ") || tail.contains('&') {
671        return true;
672    }
673
674    alpha_tokens
675        .iter()
676        .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
677        || alpha_tokens
678            .iter()
679            .filter(|token| token.chars().filter(|c| c.is_alphabetic()).count() >= 3)
680            .count()
681            >= 2
682}
683
/// Returns `true` when `text` contains at least one ASCII alphanumeric character and at
/// least half of its ASCII alphanumeric characters are letters.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    let mut letters = 0usize;
    let mut alphanumerics = 0usize;

    for c in text.chars() {
        if !c.is_ascii_alphanumeric() {
            continue;
        }
        alphanumerics += 1;
        if c.is_ascii_alphabetic() {
            letters += 1;
        }
    }

    alphanumerics != 0 && letters * 2 >= alphanumerics
}
693
/// Returns `true` when `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    // `@` is ASCII, so counting bytes is equivalent to counting characters here.
    let at_signs = text.bytes().filter(|&byte| byte == b'@').count();
    at_signs >= 3
}
697
/// Returns `true` when the ASCII-lower-cased text contains `(c)`, `©` or `copr`.
fn has_explicit_copyright_marker(text: &str) -> bool {
    let lowered = text.to_ascii_lowercase();
    ["(c)", "©", "copr"]
        .iter()
        .any(|marker| lowered.contains(*marker))
}
702
/// Returns `true` when `text` contains four consecutive ASCII digits that start with
/// `19` or `20`.
///
/// The previous check tested the first digit (`1`/`2`) and the second digit (`9`/`0`)
/// independently, which also accepted `10xx` and `29xx` — presumably unintended for a
/// year check (e.g. `1024` in a byte count). The four digits may still be part of a
/// longer digit run; that part of the heuristic is unchanged.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        window.iter().all(u8::is_ascii_digit)
            && matches!(window, [b'1', b'9', ..] | [b'2', b'0', ..])
    })
}
711
/// Returns `true` when `token` equals, ignoring ASCII case, one of a fixed set of
/// organization words such as `inc`, `ltd` or `foundation`.
///
/// The token is compared as-is; callers strip punctuation beforehand if needed.
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_WORDS: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_WORDS
        .iter()
        .any(|word| token.eq_ignore_ascii_case(word))
}
731
732fn extract_email_url_information(
733    file_info_builder: &mut FileInfoBuilder,
734    text_content: &str,
735    text_options: &TextDetectionOptions,
736    from_binary_strings: bool,
737) {
738    if !text_options.detect_emails && !text_options.detect_urls {
739        return;
740    }
741
742    if text_options.detect_emails {
743        let config = DetectionConfig {
744            max_emails: text_options.max_emails,
745            max_urls: text_options.max_urls,
746            unique: false,
747        };
748        let emails = finder::find_emails(text_content, &config)
749            .into_iter()
750            .filter(|d| !from_binary_strings || is_binary_string_email_candidate(&d.email))
751            .map(|d| OutputEmail {
752                email: d.email,
753                start_line: d.start_line,
754                end_line: d.end_line,
755            })
756            .collect::<Vec<_>>();
757        file_info_builder.emails(emails);
758    }
759
760    if text_options.detect_urls {
761        let config = DetectionConfig {
762            max_emails: text_options.max_emails,
763            max_urls: text_options.max_urls,
764            unique: true,
765        };
766        let urls = finder::find_urls(text_content, &config)
767            .into_iter()
768            .filter(|d| !from_binary_strings || is_binary_string_url_candidate(&d.url))
769            .map(|d| OutputURL {
770                url: d.url,
771                start_line: d.start_line,
772                end_line: d.end_line,
773            })
774            .collect::<Vec<_>>();
775        file_info_builder.urls(urls);
776    }
777}
778
779fn is_binary_string_email_candidate(email: &str) -> bool {
780    let Some((local, domain)) = email.rsplit_once('@') else {
781        return false;
782    };
783
784    if !has_strong_binary_local_part(local) {
785        return false;
786    }
787
788    has_strong_binary_host_shape(domain)
789}
790
791fn is_binary_string_url_candidate(url: &str) -> bool {
792    let parsed = url::Url::parse(url).ok();
793    let Some(parsed) = parsed else {
794        return false;
795    };
796    let Some(host) = parsed.host_str() else {
797        return false;
798    };
799
800    has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
801}
802
803fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
804    if parsed.path() != "/"
805        && parsed
806            .path()
807            .split('/')
808            .any(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 3)
809    {
810        return true;
811    }
812
813    if parsed.query().is_some() || parsed.fragment().is_some() {
814        return true;
815    }
816
817    let Some(host) = parsed.host_str() else {
818        return false;
819    };
820
821    let labels: Vec<&str> = host.split('.').collect();
822    if matches!(labels.first(), Some(&"www")) {
823        return true;
824    }
825
826    labels
827        .iter()
828        .take(labels.len().saturating_sub(1))
829        .any(|label| {
830            label.contains('-') && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 4
831        })
832}
833
/// Returns `true` when `local` contains a run of at least three consecutive ASCII letters.
fn has_strong_binary_local_part(local: &str) -> bool {
    let mut letter_run = 0usize;
    for c in local.chars() {
        if c.is_ascii_alphabetic() {
            letter_run += 1;
            if letter_run >= 3 {
                return true;
            }
        } else {
            // Any other character ends the current run.
            letter_run = 0;
        }
    }
    false
}
839
/// Checks that a host name looks like a real domain rather than binary noise.
///
/// A leading `www` or `ftp` label is ignored. Of the remaining labels there must be at
/// least two, and at least one label other than the last must contain three or more
/// ASCII letters.
fn has_strong_binary_host_shape(host: &str) -> bool {
    let labels: Vec<&str> = host.split('.').collect();

    let significant: &[&str] = match labels.as_slice() {
        [first, rest @ ..] if *first == "www" || *first == "ftp" => rest,
        all => all,
    };

    // Fewer than two significant labels leaves nothing in front of the final label.
    let Some((_last, leading)) = significant.split_last() else {
        return false;
    };

    leading.iter().any(|label| {
        let letters = label.chars().filter(|c| c.is_ascii_alphabetic()).count();
        label.len() >= 3 && letters >= 3
    })
}
860
861fn extract_license_information(
862    file_info_builder: &mut FileInfoBuilder,
863    scan_errors: &mut Vec<String>,
864    path: &Path,
865    text_content: String,
866    license_engine: Option<Arc<LicenseDetectionEngine>>,
867    license_options: LicenseScanOptions,
868    from_binary_strings: bool,
869) -> Result<(), Error> {
870    let Some(engine) = license_engine else {
871        return Ok(());
872    };
873
874    let detection_result = if license_options.min_score == 0 {
875        engine.detect_with_kind_and_source(
876            &text_content,
877            license_options.unknown_licenses,
878            from_binary_strings,
879            &path.to_string_lossy(),
880        )
881    } else {
882        engine.detect_with_kind_and_source_with_score(
883            &text_content,
884            license_options.unknown_licenses,
885            from_binary_strings,
886            &path.to_string_lossy(),
887            license_options.min_score as f32,
888        )
889    };
890
891    match detection_result {
892        Ok(detections) => {
893            let query =
894                Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
895            let mut model_detections = Vec::new();
896            let mut model_clues = Vec::new();
897
898            for detection in &detections {
899                let (public_detection, clue_matches) = convert_detection_to_model(
900                    detection,
901                    license_options,
902                    &text_content,
903                    query.as_ref(),
904                );
905
906                if let Some(public_detection) = public_detection {
907                    model_detections.push(public_detection);
908                }
909
910                model_clues.extend(clue_matches);
911            }
912
913            if !model_detections.is_empty() {
914                let expressions: Vec<String> = model_detections
915                    .iter()
916                    .filter(|d| !d.license_expression_spdx.is_empty())
917                    .map(|d| d.license_expression_spdx.clone())
918                    .collect();
919
920                if !expressions.is_empty() {
921                    let combined = crate::utils::spdx::combine_license_expressions(expressions);
922                    if let Some(expr) = combined {
923                        file_info_builder.license_expression(Some(expr));
924                    }
925                }
926            }
927
928            file_info_builder.license_detections(model_detections);
929            file_info_builder.license_clues(model_clues);
930            file_info_builder.percentage_of_license_text(
931                query
932                    .as_ref()
933                    .map(|query| compute_percentage_of_license_text(query, &detections)),
934            );
935        }
936        Err(e) => {
937            scan_errors.push(format!("License detection failed: {}", e));
938        }
939    }
940
941    Ok(())
942}
943
944fn convert_detection_to_model(
945    detection: &crate::license_detection::LicenseDetection,
946    license_options: LicenseScanOptions,
947    text_content: &str,
948    query: Option<&Query<'_>>,
949) -> (Option<LicenseDetection>, Vec<Match>) {
950    let matches: Vec<Match> = detection
951        .matches
952        .iter()
953        .map(|m| convert_match_to_model(m, license_options, text_content, query))
954        .collect();
955
956    if let Some(license_expression) = detection.license_expression.clone() {
957        (
958            Some(LicenseDetection {
959                license_expression,
960                license_expression_spdx: detection
961                    .license_expression_spdx
962                    .clone()
963                    .unwrap_or_default(),
964                matches,
965                detection_log: if license_options.include_diagnostics {
966                    detection.detection_log.clone()
967                } else {
968                    Vec::new()
969                },
970                identifier: detection.identifier.clone(),
971            }),
972            Vec::new(),
973        )
974    } else {
975        (None, matches)
976    }
977}
978
979fn convert_match_to_model(
980    m: &crate::license_detection::models::LicenseMatch,
981    license_options: LicenseScanOptions,
982    text_content: &str,
983    query: Option<&Query<'_>>,
984) -> Match {
985    let output_metric = |value: f32| ((value as f64) * 100.0).round() / 100.0;
986    let rule_url = if m.rule_url.is_empty() {
987        None
988    } else {
989        Some(m.rule_url.clone())
990    };
991    let matched_text = if license_options.include_text {
992        m.matched_text.clone().or_else(|| {
993            Some(crate::license_detection::query::matched_text_from_text(
994                text_content,
995                m.start_line,
996                m.end_line,
997            ))
998        })
999    } else {
1000        None
1001    };
1002    let matched_text_diagnostics = if license_options.include_text_diagnostics {
1003        query.map(|query| matched_text_diagnostics_from_match(query, m))
1004    } else {
1005        None
1006    };
1007    Match {
1008        license_expression: m.license_expression.clone(),
1009        license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
1010        from_file: m.from_file.clone(),
1011        start_line: m.start_line,
1012        end_line: m.end_line,
1013        matcher: Some(m.matcher.to_string()),
1014        score: output_metric(m.score),
1015        matched_length: Some(m.matched_length),
1016        match_coverage: Some(output_metric(m.coverage())),
1017        rule_relevance: Some(m.rule_relevance as usize),
1018        rule_identifier: Some(m.rule_identifier.clone()),
1019        rule_url,
1020        matched_text,
1021        referenced_filenames: m.referenced_filenames.clone(),
1022        matched_text_diagnostics,
1023    }
1024}
1025
1026fn compute_percentage_of_license_text(
1027    query: &Query<'_>,
1028    detections: &[crate::license_detection::LicenseDetection],
1029) -> f64 {
1030    let matched_positions: std::collections::HashSet<usize> = detections
1031        .iter()
1032        .flat_map(|detection| detection.matches.iter())
1033        .flat_map(|m| m.query_span().iter())
1034        .collect();
1035
1036    let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
1037    if query_tokens_length == 0 {
1038        return 0.0;
1039    }
1040
1041    let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
1042    (percentage * 100.0).round() / 100.0
1043}
1044
1045fn matched_text_diagnostics_from_match(
1046    query: &Query<'_>,
1047    license_match: &InternalLicenseMatch,
1048) -> String {
1049    let matched_positions: PositionSet = license_match.query_span().iter().collect();
1050    let Some(start_pos) = matched_positions.iter().min() else {
1051        return crate::license_detection::query::matched_text_from_text(
1052            &query.text,
1053            license_match.start_line,
1054            license_match.end_line,
1055        );
1056    };
1057    let Some(end_pos) = matched_positions.iter().max() else {
1058        return crate::license_detection::query::matched_text_from_text(
1059            &query.text,
1060            license_match.start_line,
1061            license_match.end_line,
1062        );
1063    };
1064
1065    crate::license_detection::query::matched_text_diagnostics_from_text(
1066        &query.text,
1067        query,
1068        &matched_positions,
1069        start_pos,
1070        end_pos,
1071        license_match.start_line,
1072        license_match.end_line,
1073    )
1074}
1075
1076fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
1077    is_pem_certificate_file(path, buffer)
1078}
1079
/// Reports whether `path` is a Go source file that is not production code.
///
/// A file qualifies when it has the `.go` extension and either
///
/// * its name ends with `_test.go`, or
/// * one of its first ten lines is a build constraint (`//go:build` or `// +build`)
///   that contains the whitespace-separated token `test`.
///
/// Paths without the `.go` extension and `_test.go` files are decided from the path alone
/// and never touch the file system.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or when one of the
/// inspected leading lines cannot be read (for example because it is not valid UTF-8).
/// Content after the inspected lines is never read, so it cannot cause an error.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    use std::io::{BufRead, BufReader};

    /// Number of leading lines inspected for build constraints.
    const HEADER_LINES: usize = 10;

    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    if path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"))
    {
        return Ok(true);
    }

    // Only the header matters, so stream it line by line instead of loading (and
    // UTF-8 validating) the whole file.
    let reader = BufReader::new(fs::File::open(path)?);
    for line in reader.lines().take(HEADER_LINES) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1100
1101fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1102    let prefix_len = buffer.len().min(8192);
1103    let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1104    let trimmed_lines: Vec<&str> = prefix
1105        .lines()
1106        .map(str::trim)
1107        .filter(|line| !line.is_empty())
1108        .take(64)
1109        .collect();
1110
1111    let Some(first_line) = trimmed_lines.first().copied() else {
1112        return false;
1113    };
1114
1115    PEM_CERTIFICATE_HEADERS
1116        .iter()
1117        .any(|(begin, end)| first_line == *begin && trimmed_lines.iter().any(|line| line == end))
1118}
1119
1120fn process_directory(
1121    path: &Path,
1122    _metadata: &fs::Metadata,
1123    collect_info: bool,
1124    license_enabled: bool,
1125) -> FileInfo {
1126    let name = path
1127        .file_name()
1128        .unwrap_or_default()
1129        .to_string_lossy()
1130        .to_string();
1131    let base_name = name.clone(); // For directories, base_name is the same as name
1132
1133    FileInfo {
1134        name,
1135        base_name,
1136        extension: "".to_string(),
1137        path: path.to_string_lossy().to_string(),
1138        file_type: FileType::Directory,
1139        mime_type: None,
1140        file_type_label: None,
1141        size: 0,
1142        date: None,
1143        sha1: None,
1144        md5: None,
1145        sha256: None,
1146        sha1_git: None,
1147        programming_language: None,
1148        package_data: Vec::new(),
1149        license_expression: None,
1150        license_detections: Vec::new(),
1151        license_clues: Vec::new(),
1152        percentage_of_license_text: license_enabled.then_some(0.0),
1153        copyrights: Vec::new(),
1154        holders: Vec::new(),
1155        authors: Vec::new(),
1156        emails: Vec::new(),
1157        urls: Vec::new(),
1158        for_packages: Vec::new(),
1159        scan_errors: Vec::new(),
1160        license_policy: None,
1161        is_binary: collect_info.then_some(false),
1162        is_text: collect_info.then_some(false),
1163        is_archive: collect_info.then_some(false),
1164        is_media: collect_info.then_some(false),
1165        is_source: collect_info.then_some(false),
1166        is_script: collect_info.then_some(false),
1167        files_count: collect_info.then_some(0),
1168        dirs_count: collect_info.then_some(0),
1169        size_count: collect_info.then_some(0),
1170        source_count: None,
1171        is_legal: false,
1172        is_manifest: false,
1173        is_readme: false,
1174        is_top_level: false,
1175        is_key_file: false,
1176        is_community: false,
1177        is_generated: None,
1178        facets: vec![],
1179        tallies: None,
1180    }
1181}
1182
#[cfg(test)]
mod tests {
    use super::{
        compute_percentage_of_license_text, convert_detection_to_model,
        extract_email_url_information, is_binary_string_copyright_candidate,
        is_binary_string_email_candidate, is_binary_string_url_candidate,
        is_go_non_production_source,
    };
    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
    use crate::license_detection::index::LicenseIndex;
    use crate::license_detection::index::dictionary::TokenDictionary;
    use crate::license_detection::models::position_span::PositionSpan;
    use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
    use crate::license_detection::query::Query;
    use crate::models::{FileInfoBuilder, FileType};
    use crate::scanner::scan_options_fingerprint;
    use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
    use std::fs;
    use tempfile::tempdir;

    /// Rule URL of the MIT license text, shared by the rule-URL tests.
    const MIT_RULE_URL: &str = "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE";

    /// Builds a minimal one-line MIT hash match whose rule URL is `rule_url`.
    fn make_internal_match(rule_url: &str) -> LicenseMatch {
        LicenseMatch {
            rid: 0,
            license_expression: String::from("mit"),
            license_expression_spdx: Some(String::from("MIT")),
            from_file: None,
            start_line: 1,
            end_line: 1,
            start_token: 0,
            end_token: 1,
            matcher: MatcherKind::Hash,
            score: 1.0,
            matched_length: 3,
            rule_length: 3,
            match_coverage: 100.0,
            rule_relevance: 100,
            rule_identifier: String::from("mit.LICENSE"),
            rule_url: String::from(rule_url),
            matched_text: Some(String::from("MIT")),
            referenced_filenames: None,
            rule_kind: RuleKind::Text,
            is_from_license: true,
            rule_start_token: 0,
            coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
            candidate_resemblance: 0.0,
            candidate_containment: 0.0,
        }
    }

    /// Builds an MIT detection holding a single match created by `make_internal_match`.
    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
        InternalLicenseDetection {
            license_expression: Some(String::from("mit")),
            license_expression_spdx: Some(String::from("MIT")),
            matches: vec![make_internal_match(rule_url)],
            detection_log: Vec::new(),
            identifier: Some(String::from("mit-test")),
            file_regions: Vec::new(),
        }
    }

    /// Builds a license index over `entries` with the given legalese length.
    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
        let mut index = LicenseIndex::new(TokenDictionary::new_with_legalese(entries));
        index.len_legalese = len_legalese;
        index
    }

    /// Text-detection options with only email and URL detection switched on.
    fn email_and_url_only_options() -> TextDetectionOptions {
        TextDetectionOptions {
            collect_info: false,
            detect_packages: false,
            detect_application_packages: false,
            detect_system_packages: false,
            detect_packages_in_compiled: false,
            detect_copyrights: false,
            detect_generated: false,
            detect_emails: true,
            detect_urls: true,
            max_emails: 50,
            max_urls: 50,
            timeout_seconds: 120.0,
        }
    }

    /// Writes `contents` to `file_name` inside a fresh temporary directory and runs
    /// `is_go_non_production_source` on it.
    fn go_file_is_non_production(file_name: &str, contents: &str) -> bool {
        let temp_dir = tempdir().unwrap();
        let path = temp_dir.path().join(file_name);
        fs::write(&path, contents).unwrap();

        is_go_non_production_source(&path).unwrap()
    }

    #[test]
    fn test_convert_detection_to_model_preserves_rule_url() {
        let detection = make_detection(MIT_RULE_URL);

        let (model, clue_matches) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let model = model.expect("detection should convert");

        assert_eq!(model.matches[0].rule_url.as_deref(), Some(MIT_RULE_URL));
        assert!(clue_matches.is_empty());
    }

    #[test]
    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
        let detection = make_detection("");

        let (model, clue_matches) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let model = model.expect("detection should convert");

        assert_eq!(model.matches[0].rule_url, None);
        assert!(clue_matches.is_empty());
    }

    #[test]
    fn test_convert_detection_to_model_rounds_match_coverage() {
        let mut detection = make_detection("");
        {
            let only_match = &mut detection.matches[0];
            only_match.score = 81.82;
            only_match.match_coverage = 33.334;
        }

        let (model, clue_matches) =
            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
        let model = model.expect("detection should convert");

        assert_eq!(model.matches[0].score, 81.82);
        assert_eq!(model.matches[0].match_coverage, Some(33.33));
        assert!(clue_matches.is_empty());
    }

    #[test]
    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
        let mut detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
        );
        detection.license_expression = None;
        detection.license_expression_spdx = None;
        detection.identifier = None;
        {
            let clue_match = &mut detection.matches[0];
            clue_match.license_expression = String::from("unknown-license-reference");
            clue_match.license_expression_spdx =
                Some(String::from("LicenseRef-scancode-unknown-license-reference"));
            clue_match.rule_identifier = String::from("license-clue_1.RULE");
            clue_match.rule_kind = RuleKind::Clue;
        }

        let options = LicenseScanOptions {
            include_text: true,
            min_score: 0,
            ..LicenseScanOptions::default()
        };
        let (model, clue_matches) =
            convert_detection_to_model(&detection, options, "clue text", None);

        assert!(model.is_none());
        assert_eq!(clue_matches.len(), 1);

        let clue = &clue_matches[0];
        assert_eq!(clue.license_expression, "unknown-license-reference");
        assert_eq!(
            clue.license_expression_spdx,
            "LicenseRef-scancode-unknown-license-reference"
        );
        assert_eq!(clue.rule_identifier.as_deref(), Some("license-clue_1.RULE"));
        assert_eq!(clue.matched_text.as_deref(), Some("MIT"));
        assert_eq!(clue.matched_text_diagnostics, None);
    }

    #[test]
    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
        let text = concat!(
            "Reproduction and distribution of this file, with or without modification, are\n",
            "permitted in any medium without royalties provided the copyright notice\n",
            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
        );
        let legalese: [(&str, u16); 14] = [
            ("reproduction", 0),
            ("distribution", 1),
            ("file", 2),
            ("without", 3),
            ("modification", 4),
            ("permitted", 5),
            ("medium", 6),
            ("royalties", 7),
            ("provided", 8),
            ("copyright", 9),
            ("notice", 10),
            ("preserved", 11),
            ("offered", 12),
            ("warranties", 13),
        ];
        let index = create_test_index(&legalese, 14);
        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
        let token_count = query.tokens.len();

        let mut detection = make_detection(
            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
        );
        detection.detection_log = vec![String::from("imperfect-match-coverage")];
        detection.identifier = Some(String::from("fsf_ap-test"));
        {
            let fsf_match = &mut detection.matches[0];
            fsf_match.license_expression = String::from("fsf-ap");
            fsf_match.license_expression_spdx = Some(String::from("FSFAP"));
            fsf_match.rule_identifier = String::from("fsf-ap.LICENSE");
            fsf_match.matched_text = None;
            fsf_match.start_line = 1;
            fsf_match.end_line = 3;
            fsf_match.start_token = 0;
            fsf_match.end_token = token_count;
            // Cover every query position except position 9.
            fsf_match.coordinates = MatchCoordinates::query_region(PositionSpan::from_positions(
                (0..token_count)
                    .filter(|&position| position != 9)
                    .collect::<Vec<_>>(),
            ));
        }

        let options = LicenseScanOptions {
            include_text: true,
            include_text_diagnostics: true,
            include_diagnostics: true,
            unknown_licenses: false,
            min_score: 0,
        };
        let (model, clue_matches) =
            convert_detection_to_model(&detection, options, text, Some(&query));
        let model = model.expect("detection should convert");

        assert!(clue_matches.is_empty());
        assert_eq!(model.detection_log, vec!["imperfect-match-coverage"]);

        let first_match = &model.matches[0];
        assert_eq!(first_match.matched_text.as_deref(), Some(text.trim_end()));

        let diagnostics = first_match
            .matched_text_diagnostics
            .as_deref()
            .expect("diagnostics should be present");
        assert!(diagnostics.contains('['));
        assert!(diagnostics.contains(']'));
        assert_ne!(diagnostics, text.trim_end());
    }

    #[test]
    fn test_extract_email_url_information_skips_binary_string_text() {
        let options = email_and_url_only_options();
        let mut builder = FileInfoBuilder::default();

        extract_email_url_information(
            &mut builder,
            "contact 6h@fo.lwft and visit http://gmail.com/",
            &options,
            true,
        );

        let file = builder
            .name("binary.bin".to_string())
            .base_name("binary".to_string())
            .extension(".bin".to_string())
            .path("binary.bin".to_string())
            .file_type(FileType::File)
            .size(1)
            .build()
            .expect("builder should produce file info");

        assert!(file.emails.is_empty(), "emails: {:?}", file.emails);
        assert!(file.urls.is_empty(), "urls: {:?}", file.urls);
    }

    #[test]
    fn test_extract_email_url_information_keeps_good_binary_contacts() {
        let options = email_and_url_only_options();
        let mut builder = FileInfoBuilder::default();

        extract_email_url_information(
            &mut builder,
            "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/",
            &options,
            true,
        );

        let file = builder
            .name("binary.bin".to_string())
            .base_name("binary".to_string())
            .extension(".bin".to_string())
            .path("binary.bin".to_string())
            .file_type(FileType::File)
            .size(1)
            .build()
            .expect("builder should produce file info");

        assert_eq!(file.emails.len(), 1, "emails: {:?}", file.emails);
        assert_eq!(file.emails[0].email, "bug-coreutils@gnu.org");
        assert_eq!(file.urls.len(), 1, "urls: {:?}", file.urls);
        assert_eq!(file.urls[0].url, "https://www.gnu.org/software/coreutils/");
    }

    #[test]
    fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
        assert!(!is_binary_string_copyright_candidate(
            "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y"
        ));
    }

    #[test]
    fn test_binary_string_copyright_candidate_keeps_real_notice() {
        assert!(is_binary_string_copyright_candidate(
            "Copyright nexB and others (c) 2012"
        ));
    }

    #[test]
    fn test_binary_string_email_candidate_rejects_gibberish() {
        let accepted = is_binary_string_email_candidate("6h@fo.lwft");
        assert!(!accepted);
    }

    #[test]
    fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
        let accepted = is_binary_string_email_candidate("bug-coreutils@gnu.org");
        assert!(accepted);
    }

    #[test]
    fn test_binary_string_url_candidate_rejects_short_fake_host() {
        let accepted = is_binary_string_url_candidate("http://ftp.so/");
        assert!(!accepted);
    }

    #[test]
    fn test_binary_string_url_candidate_keeps_gnu_help_url() {
        let accepted = is_binary_string_url_candidate("https://www.gnu.org/software/coreutils/");
        assert!(accepted);
    }

    #[test]
    fn test_binary_string_url_candidate_rejects_bare_root_domain() {
        let accepted = is_binary_string_url_candidate("http://gmail.com/");
        assert!(!accepted);
    }

    #[test]
    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
        let query = Query::from_extracted_text("alpha MIT omega", &index, false)
            .expect("query should build");

        let mut detection = make_detection("");
        {
            let only_match = &mut detection.matches[0];
            only_match.coordinates =
                MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
            only_match.start_token = 1;
            only_match.end_token = 2;
        }

        // One matched position out of three counted tokens.
        assert_eq!(
            compute_percentage_of_license_text(&query, &[detection]),
            33.33
        );
    }

    #[test]
    fn test_scan_options_fingerprint_changes_with_license_score() {
        let text_options = TextDetectionOptions::default();
        let fingerprint_for = |min_score| {
            scan_options_fingerprint(
                &text_options,
                LicenseScanOptions {
                    min_score,
                    ..LicenseScanOptions::default()
                },
                None,
            )
        };

        let unfiltered_fingerprint = fingerprint_for(0);
        let filtered_fingerprint = fingerprint_for(70);

        assert_ne!(unfiltered_fingerprint, filtered_fingerprint);
    }

    #[test]
    fn test_is_go_non_production_source_for_test_filename() {
        assert!(go_file_is_non_production(
            "scanner_test.go",
            "package scanner\n"
        ));
    }

    #[test]
    fn test_is_go_non_production_source_for_build_tag() {
        assert!(go_file_is_non_production(
            "scanner.go",
            "//go:build test\n\npackage scanner\n"
        ));
    }

    #[test]
    fn test_is_go_non_production_source_for_regular_go_file() {
        assert!(!go_file_is_non_production(
            "scanner.go",
            "package scanner\n"
        ));
    }
}