Skip to main content

provenant/scanner/
process.rs

1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::{try_parse_compiled_bytes, try_parse_file};
3use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha1_git, calculate_sha256};
4use crate::utils::text::{
5    remove_verbatim_escape_sequences, should_remove_verbatim_escape_sequences,
6};
7use anyhow::Error;
8use rayon::prelude::*;
9use std::collections::HashSet;
10use std::fs::{self, File};
11use std::io::{Read, Write};
12use std::path::Path;
13use std::sync::Arc;
14use std::time::{Duration, Instant};
15
16use crate::copyright::{
17    self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
18};
19use crate::finder::{self, DetectionConfig};
20use crate::license_detection::PositionSet;
21use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
22use crate::license_detection::query::Query;
23use crate::models::{
24    Author, Copyright, DatasourceId, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection,
25    Match, OutputEmail, OutputURL,
26};
27use crate::parsers::utils::split_name_email;
28use crate::progress::ScanProgress;
29use crate::scanner::collect::CollectedPaths;
30use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
31use crate::utils::file::{
32    ExtractedTextKind, classify_file_info, extract_text_for_detection, get_creation_date,
33};
34use crate::utils::generated::generated_code_hints_from_bytes;
35use tempfile::TempDir;
36
/// Begin/end marker pairs that delimit PEM-encoded certificate blocks.
///
/// Each entry is `(begin_marker, end_marker)`; both the plain and the
/// `TRUSTED` certificate variants are listed.
///
/// NOTE(review): the code consuming this table is not visible in this part of
/// the file — presumably it is used to recognise certificate payloads.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
44
45pub fn process_collected(
46    collected: &CollectedPaths,
47    progress: Arc<ScanProgress>,
48    license_engine: Option<Arc<LicenseDetectionEngine>>,
49    license_options: LicenseScanOptions,
50    text_options: &TextDetectionOptions,
51) -> ProcessResult {
52    let mut all_files: Vec<FileInfo> = collected
53        .files
54        .par_iter()
55        .map(|(path, metadata)| {
56            let file_entry = process_file(
57                path,
58                metadata,
59                progress.as_ref(),
60                license_engine.clone(),
61                license_options,
62                text_options,
63            );
64            progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
65            file_entry
66        })
67        .collect();
68
69    for (path, metadata) in &collected.directories {
70        all_files.push(process_directory(
71            path,
72            metadata,
73            text_options.collect_info,
74            license_engine.is_some(),
75        ));
76    }
77
78    ProcessResult {
79        files: all_files,
80        excluded_count: collected.excluded_count,
81    }
82}
83
84pub fn process_collected_with_memory_limit(
85    collected: &CollectedPaths,
86    progress: Arc<ScanProgress>,
87    license_engine: Option<Arc<LicenseDetectionEngine>>,
88    license_options: LicenseScanOptions,
89    text_options: &TextDetectionOptions,
90    max_in_memory: i64,
91) -> ProcessResult {
92    if max_in_memory == 0 {
93        return process_collected(
94            collected,
95            progress,
96            license_engine,
97            license_options,
98            text_options,
99        );
100    }
101
102    let memory_limit = if max_in_memory < 0 {
103        0
104    } else {
105        max_in_memory as usize
106    };
107    let chunk_size = if max_in_memory < 0 {
108        256
109    } else {
110        memory_limit.max(1)
111    };
112
113    let mut retained_files = Vec::new();
114    let mut spill_store = None;
115
116    for chunk in collected.files.chunks(chunk_size) {
117        let processed_chunk: Vec<FileInfo> = chunk
118            .par_iter()
119            .map(|(path, metadata)| {
120                let file_entry = process_file(
121                    path,
122                    metadata,
123                    progress.as_ref(),
124                    license_engine.clone(),
125                    license_options,
126                    text_options,
127                );
128                progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
129                file_entry
130            })
131            .collect();
132
133        retain_or_spill_chunk(
134            processed_chunk,
135            &mut retained_files,
136            &mut spill_store,
137            memory_limit,
138        );
139    }
140
141    for (path, metadata) in &collected.directories {
142        let entry = process_directory(
143            path,
144            metadata,
145            text_options.collect_info,
146            license_engine.is_some(),
147        );
148        retain_or_spill_chunk(
149            vec![entry],
150            &mut retained_files,
151            &mut spill_store,
152            memory_limit,
153        );
154    }
155
156    if let Some(spill_store) = spill_store {
157        retained_files.extend(spill_store.load_all());
158    }
159
160    ProcessResult {
161        files: retained_files,
162        excluded_count: collected.excluded_count,
163    }
164}
165
166fn retain_or_spill_chunk(
167    chunk: Vec<FileInfo>,
168    retained_files: &mut Vec<FileInfo>,
169    spill_store: &mut Option<FileInfoSpillStore>,
170    memory_limit: usize,
171) {
172    if memory_limit == 0 {
173        spill_store
174            .get_or_insert_with(FileInfoSpillStore::new)
175            .spill(chunk);
176        return;
177    }
178
179    let remaining_capacity = memory_limit.saturating_sub(retained_files.len());
180    if remaining_capacity >= chunk.len() && spill_store.is_none() {
181        retained_files.extend(chunk);
182        return;
183    }
184
185    let mut chunk_iter = chunk.into_iter();
186    retained_files.extend(chunk_iter.by_ref().take(remaining_capacity));
187    let overflow: Vec<FileInfo> = chunk_iter.collect();
188    if !overflow.is_empty() {
189        spill_store
190            .get_or_insert_with(FileInfoSpillStore::new)
191            .spill(overflow);
192    }
193}
194
195struct FileInfoSpillStore {
196    temp_dir: TempDir,
197    batch_index: usize,
198}
199
200impl FileInfoSpillStore {
201    fn new() -> Self {
202        Self {
203            temp_dir: TempDir::new().expect("create spill dir"),
204            batch_index: 0,
205        }
206    }
207
208    fn spill(&mut self, files: Vec<FileInfo>) {
209        let path = self
210            .temp_dir
211            .path()
212            .join(format!("batch-{:06}.json.zst", self.batch_index));
213        self.batch_index += 1;
214
215        let payload = serde_json::to_vec(&files).expect("encode spilled file batch");
216        let file = File::create(path).expect("create spill batch file");
217        let mut encoder = zstd::Encoder::new(file, 3).expect("create spill encoder");
218        encoder
219            .write_all(&payload)
220            .expect("write spilled file batch");
221        encoder.finish().expect("finish spill encoder");
222    }
223
224    fn load_all(self) -> Vec<FileInfo> {
225        let mut paths: Vec<_> = fs::read_dir(self.temp_dir.path())
226            .expect("read spill dir")
227            .filter_map(Result::ok)
228            .map(|entry| entry.path())
229            .collect();
230        paths.sort();
231
232        let mut files = Vec::new();
233        for path in paths {
234            let file = File::open(path).expect("open spill batch");
235            let mut decoder = zstd::Decoder::new(file).expect("create spill decoder");
236            let mut payload = Vec::new();
237            decoder.read_to_end(&mut payload).expect("read spill batch");
238            let mut batch: Vec<FileInfo> =
239                serde_json::from_slice(&payload).expect("decode spilled file batch");
240            files.append(&mut batch);
241        }
242        files
243    }
244}
245
246fn process_file(
247    path: &Path,
248    metadata: &fs::Metadata,
249    progress: &ScanProgress,
250    license_engine: Option<Arc<LicenseDetectionEngine>>,
251    license_options: LicenseScanOptions,
252    text_options: &TextDetectionOptions,
253) -> FileInfo {
254    let mut scan_errors: Vec<String> = vec![];
255    let mut file_info_builder = FileInfoBuilder::default();
256    let license_enabled = license_engine.is_some();
257
258    let started = Instant::now();
259
260    let mut generated_flag = None;
261    let mut is_source_file = false;
262    match extract_information_from_content(
263        &mut file_info_builder,
264        &mut scan_errors,
265        path,
266        progress,
267        license_engine,
268        license_options,
269        text_options,
270    ) {
271        Ok((is_generated, sha256, is_source)) => {
272            generated_flag = is_generated;
273            is_source_file = is_source;
274            let _ = sha256;
275        }
276        Err(e) => scan_errors.push(e.to_string()),
277    };
278
279    if is_timeout_exceeded(started, text_options.timeout_seconds) {
280        scan_errors.push(format!(
281            "Processing interrupted due to timeout after {:.2} seconds",
282            text_options.timeout_seconds
283        ));
284    }
285
286    let mut file_info = file_info_builder
287        .name(path.file_name().unwrap().to_string_lossy().to_string())
288        .base_name(
289            path.file_stem()
290                .unwrap_or_default()
291                .to_string_lossy()
292                .to_string(),
293        )
294        .extension(
295            path.extension()
296                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
297        )
298        .path(path.to_string_lossy().to_string())
299        .file_type(FileType::File)
300        .size(metadata.len())
301        .date(
302            text_options
303                .collect_info
304                .then(|| get_creation_date(metadata))
305                .flatten(),
306        )
307        .scan_errors(scan_errors)
308        .build()
309        .expect("FileInformationBuild not completely initialized");
310
311    if text_options.collect_info {
312        file_info.is_source = Some(is_source_file);
313    }
314
315    if file_info.programming_language.as_deref() == Some("Go")
316        && is_go_non_production_source(path).unwrap_or(false)
317    {
318        file_info.is_source = Some(false);
319    }
320
321    if text_options.detect_generated {
322        file_info.is_generated = Some(generated_flag.unwrap_or(false));
323    }
324
325    if file_info.percentage_of_license_text.is_none() && license_enabled {
326        file_info.percentage_of_license_text = Some(0.0);
327    }
328
329    file_info
330}
331
332fn extract_information_from_content(
333    file_info_builder: &mut FileInfoBuilder,
334    scan_errors: &mut Vec<String>,
335    path: &Path,
336    progress: &ScanProgress,
337    license_engine: Option<Arc<LicenseDetectionEngine>>,
338    license_options: LicenseScanOptions,
339    text_options: &TextDetectionOptions,
340) -> Result<(Option<bool>, String, bool), Error> {
341    let started = Instant::now();
342    let buffer = fs::read(path)?;
343    let license_enabled = license_engine.is_some();
344
345    if is_timeout_exceeded(started, text_options.timeout_seconds) {
346        return Err(Error::msg(format!(
347            "Timeout while reading file content (> {:.2}s)",
348            text_options.timeout_seconds
349        )));
350    }
351
352    let sha256 = calculate_sha256(&buffer);
353    let is_generated = text_options
354        .detect_generated
355        .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());
356    let classification = classify_file_info(path, &buffer);
357
358    if text_options.collect_info {
359        file_info_builder
360            .sha1(Some(calculate_sha1(&buffer)))
361            .md5(Some(calculate_md5(&buffer)))
362            .sha256(Some(sha256.clone()))
363            .programming_language(classification.programming_language.clone())
364            .mime_type(Some(classification.mime_type.clone()))
365            .file_type_label(Some(classification.file_type.clone()))
366            .sha1_git(Some(calculate_sha1_git(&buffer)))
367            .is_binary(Some(classification.is_binary))
368            .is_text(Some(classification.is_text))
369            .is_archive(Some(classification.is_archive))
370            .is_media(Some(classification.is_media))
371            .is_source(Some(classification.is_source))
372            .is_script(Some(classification.is_script))
373            .files_count(Some(0))
374            .dirs_count(Some(0))
375            .size_count(Some(0));
376    }
377
378    if should_skip_text_detection(path, &buffer) {
379        return Ok((is_generated, sha256, classification.is_source));
380    }
381
382    // Package parsing and text-based detection (copyright, license) are independent.
383    // Python ScanCode runs all enabled plugins on every file, so we do the same.
384    if text_options.detect_packages {
385        let started = Instant::now();
386        let parse_result = try_parse_file(path).or_else(|| {
387            text_options
388                .detect_packages_in_compiled
389                .then(|| try_parse_compiled_bytes(&buffer))
390                .flatten()
391        });
392
393        if let Some(parse_result) = parse_result {
394            let packages = parse_result
395                .packages
396                .into_iter()
397                .filter(|package| {
398                    let is_compiled_package = package
399                        .datasource_id
400                        .as_ref()
401                        .is_some_and(is_compiled_datasource);
402                    let is_system_package = package
403                        .datasource_id
404                        .as_ref()
405                        .is_some_and(is_system_datasource);
406                    if is_compiled_package {
407                        text_options.detect_packages_in_compiled
408                    } else if is_system_package {
409                        text_options.detect_system_packages
410                    } else {
411                        text_options.detect_application_packages
412                    }
413                })
414                .collect();
415            file_info_builder.package_data(packages);
416            scan_errors.extend(parse_result.scan_errors);
417        }
418        progress.record_detail_timing("scan:packages", started.elapsed().as_secs_f64());
419    }
420
421    if is_timeout_exceeded(started, text_options.timeout_seconds) {
422        return Err(Error::msg(format!(
423            "Timeout while extracting package/text metadata (> {:.2}s)",
424            text_options.timeout_seconds
425        )));
426    }
427
428    let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
429    let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);
430
431    if is_timeout_exceeded(started, text_options.timeout_seconds) {
432        return Err(Error::msg(format!(
433            "Timeout while extracting text content (> {:.2}s)",
434            text_options.timeout_seconds
435        )));
436    }
437
438    if text_content.is_empty() {
439        return Ok((is_generated, sha256, classification.is_source));
440    }
441
442    if text_options.detect_copyrights {
443        extract_copyright_information(
444            file_info_builder,
445            path,
446            &text_content,
447            text_options.timeout_seconds,
448            from_binary_strings,
449        );
450    }
451    extract_email_url_information(
452        file_info_builder,
453        &text_content,
454        text_options,
455        from_binary_strings,
456    );
457
458    if is_timeout_exceeded(started, text_options.timeout_seconds) {
459        return Err(Error::msg(format!(
460            "Timeout before license scan (> {:.2}s)",
461            text_options.timeout_seconds
462        )));
463    }
464    // Handle source map files specially
465    let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
466        if let Some(sourcemap_content) =
467            crate::utils::sourcemap::extract_sourcemap_content(&text_content)
468        {
469            sourcemap_content
470        } else {
471            text_content
472        }
473    } else if should_remove_verbatim_escape_sequences(path, classification.is_source) {
474        remove_verbatim_escape_sequences(&text_content)
475    } else {
476        text_content
477    };
478
479    if license_enabled {
480        let started = Instant::now();
481        extract_license_information(
482            file_info_builder,
483            scan_errors,
484            path,
485            text_content_for_license_detection,
486            license_engine,
487            license_options,
488            from_binary_strings,
489        )?;
490        progress.record_detail_timing("scan:licenses", started.elapsed().as_secs_f64());
491    } else {
492        extract_license_information(
493            file_info_builder,
494            scan_errors,
495            path,
496            text_content_for_license_detection,
497            license_engine,
498            license_options,
499            from_binary_strings,
500        )?;
501    }
502
503    Ok((is_generated, sha256, classification.is_source))
504}
505
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// A timeout that is not a finite, strictly positive number disables the check, so the
/// function then always returns `false`.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }

    let elapsed_seconds = started.elapsed().as_secs_f64();
    elapsed_seconds > timeout_seconds
}
511
512fn is_system_datasource(datasource_id: &DatasourceId) -> bool {
513    matches!(
514        datasource_id,
515        DatasourceId::AlpineInstalledDb
516            | DatasourceId::DebianDistrolessInstalledDb
517            | DatasourceId::DebianInstalledFilesList
518            | DatasourceId::DebianInstalledMd5Sums
519            | DatasourceId::DebianInstalledStatusDb
520            | DatasourceId::FreebsdCompactManifest
521            | DatasourceId::RpmInstalledDatabaseBdb
522            | DatasourceId::RpmInstalledDatabaseNdb
523            | DatasourceId::RpmInstalledDatabaseSqlite
524            | DatasourceId::RpmYumdb
525    )
526}
527
528fn is_compiled_datasource(datasource_id: &DatasourceId) -> bool {
529    matches!(
530        datasource_id,
531        DatasourceId::GoBinary | DatasourceId::RustBinary
532    )
533}
534
535fn extract_copyright_information(
536    file_info_builder: &mut FileInfoBuilder,
537    path: &Path,
538    text_content: &str,
539    timeout_seconds: f64,
540    from_binary_strings: bool,
541) {
542    // CREDITS files get special handling (Linux kernel style).
543    if copyright::is_credits_file(path) {
544        let author_detections = copyright::detect_credits_authors(text_content);
545        if !author_detections.is_empty() {
546            file_info_builder.authors(
547                author_detections
548                    .into_iter()
549                    .map(|a| Author {
550                        author: a.author,
551                        start_line: a.start_line,
552                        end_line: a.end_line,
553                    })
554                    .collect(),
555            );
556            return;
557        }
558    }
559
560    let copyright_options = CopyrightDetectionOptions {
561        max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
562            Some(Duration::from_secs_f64(timeout_seconds))
563        } else {
564            None
565        },
566        ..CopyrightDetectionOptions::default()
567    };
568
569    let (copyrights, holders, authors) =
570        copyright::detect_copyrights_with_options(text_content, &copyright_options);
571    let (copyrights, holders, authors) = if from_binary_strings {
572        prune_binary_string_detections(text_content, copyrights, holders, authors)
573    } else {
574        (copyrights, holders, authors)
575    };
576
577    file_info_builder.copyrights(
578        copyrights
579            .into_iter()
580            .map(|c| Copyright {
581                copyright: c.copyright,
582                start_line: c.start_line,
583                end_line: c.end_line,
584            })
585            .collect::<Vec<Copyright>>(),
586    );
587    file_info_builder.holders(
588        holders
589            .into_iter()
590            .map(|h| Holder {
591                holder: h.holder,
592                start_line: h.start_line,
593                end_line: h.end_line,
594            })
595            .collect::<Vec<Holder>>(),
596    );
597    file_info_builder.authors(
598        authors
599            .into_iter()
600            .map(|a| Author {
601                author: a.author,
602                start_line: a.start_line,
603                end_line: a.end_line,
604            })
605            .collect::<Vec<Author>>(),
606    );
607}
608
609fn prune_binary_string_detections(
610    text_content: &str,
611    copyrights: Vec<CopyrightDetection>,
612    holders: Vec<HolderDetection>,
613    authors: Vec<AuthorDetection>,
614) -> (
615    Vec<CopyrightDetection>,
616    Vec<HolderDetection>,
617    Vec<AuthorDetection>,
618) {
619    let kept_copyrights: Vec<CopyrightDetection> = copyrights
620        .into_iter()
621        .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
622        .collect();
623
624    let kept_holders: Vec<HolderDetection> = holders
625        .into_iter()
626        .filter(|holder| {
627            kept_copyrights.iter().any(|copyright| {
628                ranges_overlap(
629                    holder.start_line,
630                    holder.end_line,
631                    copyright.start_line,
632                    copyright.end_line,
633                )
634            })
635        })
636        .collect();
637
638    let kept_authors = authors
639        .into_iter()
640        .filter(|author| is_binary_string_author_candidate(&author.author))
641        .chain(extract_binary_string_author_supplements(text_content))
642        .filter({
643            let mut seen = HashSet::new();
644            move |author| seen.insert(author.author.clone())
645        })
646        .collect();
647
648    (kept_copyrights, kept_holders, kept_authors)
649}
650
/// Returns `true` when the inclusive ranges `[a_start, a_end]` and `[b_start, b_end]`
/// share at least one value.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    if a_start > b_end {
        // `a` begins after `b` has ended.
        return false;
    }
    b_start <= a_end
}
654
655fn is_binary_string_copyright_candidate(text: &str) -> bool {
656    if contains_year(text) {
657        return true;
658    }
659
660    let trimmed = text.trim();
661    let lower = trimmed.to_ascii_lowercase();
662    let tail = if let Some(tail) = lower.strip_prefix("copyright") {
663        tail.trim()
664    } else {
665        lower.trim()
666    };
667    let original_tail = if lower.starts_with("copyright") {
668        trimmed["copyright".len()..].trim()
669    } else {
670        trimmed
671    };
672
673    if tail.is_empty() || !has_sufficient_alphabetic_content(tail) || has_excessive_at_noise(tail) {
674        return false;
675    }
676
677    let alpha_tokens: Vec<&str> = tail
678        .split_whitespace()
679        .filter(|token| token.chars().any(|c| c.is_alphabetic()))
680        .collect();
681
682    if alpha_tokens.len() <= 1 {
683        return has_explicit_copyright_marker(text)
684            && alpha_tokens.iter().any(|token| {
685                is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric()))
686            });
687    }
688
689    if !has_explicit_copyright_marker(text) {
690        return false;
691    }
692
693    has_binary_name_like_shape(original_tail)
694}
695
696fn extract_binary_string_author_supplements(text_content: &str) -> Vec<AuthorDetection> {
697    let mut authors = Vec::new();
698
699    for (line_index, line) in text_content.lines().enumerate() {
700        if let Some(author) = extract_named_author_from_binary_line(line) {
701            authors.push(AuthorDetection {
702                author,
703                start_line: line_index + 1,
704                end_line: line_index + 1,
705            });
706        }
707    }
708
709    authors
710}
711
712fn extract_named_author_from_binary_line(line: &str) -> Option<String> {
713    let line = line.trim();
714    if line.is_empty() {
715        return None;
716    }
717
718    let emails = finder::find_emails(
719        line,
720        &DetectionConfig {
721            max_emails: 4,
722            max_urls: 0,
723            unique: false,
724        },
725    );
726    let email = emails.first()?.email.as_str();
727    if !is_binary_string_email_candidate(email) {
728        return None;
729    }
730
731    let lower_line = line.to_ascii_lowercase();
732    let email_start = lower_line.find(email)?;
733    let raw_prefix = &line[..email_start];
734    let has_author_marker = contains_binary_author_marker(raw_prefix);
735    let prefix = take_suffix_after_last_author_marker(raw_prefix)?;
736    let prefix = prefix
737        .trim_start_matches(['*', '-', ':', ';', ',', '.', ' '])
738        .trim_end_matches(['<', '(', '[', ' ', ':', '-'])
739        .trim();
740
741    let (name, _) = split_name_email(prefix);
742    let name = name.or_else(|| {
743        let trimmed = prefix.trim_matches(|c: char| c == '<' || c == '(' || c == '[' || c == ' ');
744        (!trimmed.is_empty()).then(|| trimmed.to_string())
745    });
746
747    let Some(name) = name.map(|name| name.trim().to_string()) else {
748        if has_author_marker {
749            return Some(email.to_string());
750        }
751        return None;
752    };
753
754    if name.is_empty() && has_author_marker {
755        return Some(email.to_string());
756    }
757
758    if !has_binary_name_like_shape(&name) {
759        return None;
760    }
761
762    if line.contains(&format!("<{email}>")) {
763        Some(format!("{name} <{email}>"))
764    } else if line.contains(&format!("({email})")) {
765        Some(format!("{name} ({email})"))
766    } else {
767        Some(format!("{name} {email}"))
768    }
769}
770
/// Returns the trimmed text following the last occurrence of `marker` in `text`.
///
/// The search ignores ASCII case in `text`; `marker` itself is compared as given, so it
/// has to be lowercase to match. Returns `None` when the marker does not occur.
fn take_suffix_after_last_ascii_marker<'a>(text: &'a str, marker: &str) -> Option<&'a str> {
    // ASCII lowercasing keeps byte offsets intact, so positions map back onto `text`.
    let folded = text.to_ascii_lowercase();
    folded.rfind(marker).map(|marker_start| {
        let suffix_start = marker_start + marker.len();
        text[suffix_start..].trim()
    })
}
776
777fn take_suffix_after_last_author_marker(text: &str) -> Option<&str> {
778    const MARKERS: &[&str] = &[
779        " patch author: ",
780        " patch author ",
781        " written by ",
782        " contributed by ",
783        " original work done by ",
784        " work done by ",
785        " thanks to ",
786        " review by ",
787        " by ",
788        " from ",
789    ];
790
791    MARKERS
792        .iter()
793        .filter_map(|marker| take_suffix_after_last_ascii_marker(text, marker))
794        .next()
795}
796
797fn contains_binary_author_marker(text: &str) -> bool {
798    take_suffix_after_last_author_marker(text).is_some()
799}
800
801fn has_binary_name_like_shape(text: &str) -> bool {
802    let trimmed = text.trim();
803    if trimmed.is_empty() || trimmed.contains(" - ") || trimmed.chars().any(|c| c.is_ascii_digit())
804    {
805        return false;
806    }
807
808    let tokens: Vec<&str> = trimmed
809        .split(|c: char| !c.is_ascii_alphabetic() && c != '.' && c != '\'')
810        .filter(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()))
811        .collect();
812    if tokens.is_empty() {
813        return false;
814    }
815
816    let uppercase_like = tokens
817        .iter()
818        .filter(|token| {
819            let token = token.trim_matches('.');
820            token
821                .chars()
822                .find(|c| c.is_ascii_alphabetic())
823                .is_some_and(|c| c.is_ascii_uppercase())
824        })
825        .count();
826
827    uppercase_like >= 2 && uppercase_like * 2 >= tokens.len()
828        || tokens
829            .iter()
830            .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
831}
832
/// Returns `true` when at least half of the ASCII alphanumeric characters in `text` are
/// letters. Text without any ASCII alphanumeric character is rejected.
fn has_sufficient_alphabetic_content(text: &str) -> bool {
    let (letters, alphanumerics) =
        text.chars()
            .fold((0usize, 0usize), |(letters, alphanumerics), c| {
                if c.is_ascii_alphabetic() {
                    (letters + 1, alphanumerics + 1)
                } else if c.is_ascii_digit() {
                    (letters, alphanumerics + 1)
                } else {
                    (letters, alphanumerics)
                }
            });

    alphanumerics != 0 && letters * 2 >= alphanumerics
}
842
/// Returns `true` when `text` contains three or more `@` characters.
fn has_excessive_at_noise(text: &str) -> bool {
    const MAX_PLAUSIBLE_AT_SIGNS: usize = 2;
    text.matches('@').count() > MAX_PLAUSIBLE_AT_SIGNS
}
846
/// Returns `true` when `text` contains an explicit copyright marker: `(c)`, `©` or
/// `copr`, matched without regard to ASCII case.
fn has_explicit_copyright_marker(text: &str) -> bool {
    const MARKERS: [&str; 3] = ["(c)", "©", "copr"];

    let folded = text.to_ascii_lowercase();
    MARKERS.iter().any(|marker| folded.contains(*marker))
}
851
/// Returns `true` when `text` contains four consecutive ASCII digits that form a year
/// in the range 1900–2099 (`19xx` or `20xx`).
///
/// The century digits are matched as a pair. Checking them independently would also
/// accept `10xx` and `29xx`, which are far more likely to be arbitrary numbers in binary
/// strings than years.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        matches!(
            window,
            [b'1', b'9', tens, ones] | [b'2', b'0', tens, ones]
                if tens.is_ascii_digit() && ones.is_ascii_digit()
        )
    })
}
860
/// Returns `true` when `token` is, ignoring ASCII case, one of the known organisation
/// words or suffixes (for example `Inc`, `GmbH` or `Foundation`).
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_WORDS: [&str; 14] = [
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_WORDS
        .iter()
        .any(|word| token.eq_ignore_ascii_case(word))
}
880
881fn extract_email_url_information(
882    file_info_builder: &mut FileInfoBuilder,
883    text_content: &str,
884    text_options: &TextDetectionOptions,
885    from_binary_strings: bool,
886) {
887    if !text_options.detect_emails && !text_options.detect_urls {
888        return;
889    }
890
891    if text_options.detect_emails {
892        let config = DetectionConfig {
893            max_emails: text_options.max_emails,
894            max_urls: text_options.max_urls,
895            unique: from_binary_strings,
896        };
897        let emails = finder::find_emails(text_content, &config)
898            .into_iter()
899            .filter(|d| !from_binary_strings || is_binary_string_email_candidate(&d.email))
900            .map(|d| OutputEmail {
901                email: d.email,
902                start_line: d.start_line,
903                end_line: d.end_line,
904            })
905            .collect::<Vec<_>>();
906        file_info_builder.emails(emails);
907    }
908
909    if text_options.detect_urls {
910        let config = DetectionConfig {
911            max_emails: text_options.max_emails,
912            max_urls: text_options.max_urls,
913            unique: true,
914        };
915        let urls = finder::find_urls(text_content, &config)
916            .into_iter()
917            .filter(|d| !from_binary_strings || is_binary_string_url_candidate(&d.url))
918            .map(|d| OutputURL {
919                url: d.url,
920                start_line: d.start_line,
921                end_line: d.end_line,
922            })
923            .collect::<Vec<_>>();
924        file_info_builder.urls(urls);
925    }
926}
927
928fn is_binary_string_email_candidate(email: &str) -> bool {
929    let Some((local, domain)) = email.rsplit_once('@') else {
930        return false;
931    };
932
933    if !has_strong_binary_local_part(local) {
934        return false;
935    }
936
937    has_strong_binary_host_shape(domain)
938}
939
940fn is_binary_string_url_candidate(url: &str) -> bool {
941    let parsed = url::Url::parse(url).ok();
942    let Some(parsed) = parsed else {
943        return false;
944    };
945    let Some(host) = parsed.host_str() else {
946        return false;
947    };
948
949    has_strong_binary_host_shape(host) && has_meaningful_binary_url_context(&parsed)
950}
951
952fn is_binary_string_author_candidate(author: &str) -> bool {
953    let trimmed = author.trim();
954    if trimmed.is_empty()
955        || !has_sufficient_alphabetic_content(trimmed)
956        || has_excessive_at_noise(trimmed)
957    {
958        return false;
959    }
960
961    if trimmed.contains('@') {
962        let emails = finder::find_emails(
963            trimmed,
964            &DetectionConfig {
965                max_emails: 4,
966                max_urls: 0,
967                unique: true,
968            },
969        );
970        if emails.len() > 1 {
971            return false;
972        }
973
974        if let Some(extracted) = extract_named_author_from_binary_line(trimmed) {
975            return !extracted.is_empty();
976        }
977
978        let Some(email) = emails.first().map(|d| d.email.as_str()) else {
979            return false;
980        };
981        if !is_binary_string_email_candidate(email) {
982            return false;
983        }
984
985        let (name, _) = split_name_email(trimmed);
986        return name.as_deref().is_some_and(has_binary_name_like_shape);
987    }
988
989    has_binary_name_like_shape(trimmed)
990}
991
992fn has_meaningful_binary_url_context(parsed: &url::Url) -> bool {
993    if parsed.path() != "/"
994        && parsed
995            .path()
996            .split('/')
997            .any(|segment| segment.chars().any(|c| c.is_ascii_alphabetic()) && segment.len() >= 2)
998    {
999        return true;
1000    }
1001
1002    if parsed.query().is_some() || parsed.fragment().is_some() {
1003        return true;
1004    }
1005
1006    let Some(host) = parsed.host_str() else {
1007        return false;
1008    };
1009
1010    let labels: Vec<&str> = host.split('.').collect();
1011    if labels.len() > 2 {
1012        return labels[..labels.len() - 1].iter().any(|label| {
1013            label.len() >= 3 && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 3
1014        });
1015    }
1016
1017    if matches!(labels.first(), Some(&"www")) {
1018        return true;
1019    }
1020
1021    if labels.len() == 2 {
1022        let domain = labels[0];
1023        let tld = labels[1];
1024        if domain.len() >= 8 && matches!(tld, "org" | "edu" | "gov" | "mil" | "io" | "dev") {
1025            return true;
1026        }
1027    }
1028
1029    labels
1030        .iter()
1031        .take(labels.len().saturating_sub(1))
1032        .any(|label| {
1033            label.contains('-') && label.chars().filter(|c| c.is_ascii_alphabetic()).count() >= 4
1034        })
1035}
1036
/// Returns `true` when `local` contains a run of at least three consecutive
/// ASCII letters; any other character (digits, punctuation, non-ASCII) ends
/// the current run.
fn has_strong_binary_local_part(local: &str) -> bool {
    let mut run_length = 0usize;
    for ch in local.chars() {
        if ch.is_ascii_alphabetic() {
            run_length += 1;
            if run_length >= 3 {
                return true;
            }
        } else {
            run_length = 0;
        }
    }
    false
}
1042
/// Checks whether a host name looks substantial enough to be trusted when it
/// was recovered from binary strings.
///
/// The host is split on `.`; a leading `www` or `ftp` label is ignored. Of the
/// remaining labels, the last one is not considered, and at least one of the
/// others must be three or more bytes long and contain three or more ASCII
/// letters. Hosts with fewer than two labels (before or after dropping the
/// `www`/`ftp` prefix) are rejected.
fn has_strong_binary_host_shape(host: &str) -> bool {
    let labels: Vec<&str> = host.split('.').collect();

    let significant = match labels.as_slice() {
        ["www" | "ftp", rest @ ..] => rest,
        everything => everything,
    };

    // With fewer than two significant labels there is nothing left to inspect
    // once the final label is set aside, so `any` below yields `false`.
    let Some((_last_label, candidates)) = significant.split_last() else {
        return false;
    };

    candidates.iter().any(|label| {
        let letters = label.chars().filter(|c| c.is_ascii_alphabetic()).count();
        label.len() >= 3 && letters >= 3
    })
}
1063
1064fn extract_license_information(
1065    file_info_builder: &mut FileInfoBuilder,
1066    scan_errors: &mut Vec<String>,
1067    path: &Path,
1068    text_content: String,
1069    license_engine: Option<Arc<LicenseDetectionEngine>>,
1070    license_options: LicenseScanOptions,
1071    from_binary_strings: bool,
1072) -> Result<(), Error> {
1073    let Some(engine) = license_engine else {
1074        return Ok(());
1075    };
1076
1077    let detection_result = if license_options.min_score == 0 {
1078        engine.detect_with_kind_and_source(
1079            &text_content,
1080            license_options.unknown_licenses,
1081            from_binary_strings,
1082            &path.to_string_lossy(),
1083        )
1084    } else {
1085        engine.detect_with_kind_and_source_with_score(
1086            &text_content,
1087            license_options.unknown_licenses,
1088            from_binary_strings,
1089            &path.to_string_lossy(),
1090            license_options.min_score as f32,
1091        )
1092    };
1093
1094    match detection_result {
1095        Ok(detections) => {
1096            let query =
1097                Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
1098            let mut model_detections = Vec::new();
1099            let mut model_clues = Vec::new();
1100
1101            for detection in &detections {
1102                let (public_detection, clue_matches) = convert_detection_to_model(
1103                    detection,
1104                    license_options,
1105                    &text_content,
1106                    query.as_ref(),
1107                );
1108
1109                if let Some(public_detection) = public_detection {
1110                    model_detections.push(public_detection);
1111                }
1112
1113                model_clues.extend(clue_matches);
1114            }
1115
1116            if !model_detections.is_empty() {
1117                let expressions: Vec<String> = model_detections
1118                    .iter()
1119                    .filter(|d| !d.license_expression_spdx.is_empty())
1120                    .map(|d| d.license_expression_spdx.clone())
1121                    .collect();
1122
1123                if !expressions.is_empty() {
1124                    let combined = crate::utils::spdx::combine_license_expressions(expressions);
1125                    if let Some(expr) = combined {
1126                        file_info_builder.license_expression(Some(expr));
1127                    }
1128                }
1129            }
1130
1131            file_info_builder.license_detections(model_detections);
1132            file_info_builder.license_clues(model_clues);
1133            file_info_builder.percentage_of_license_text(
1134                query
1135                    .as_ref()
1136                    .map(|query| compute_percentage_of_license_text(query, &detections)),
1137            );
1138        }
1139        Err(e) => {
1140            scan_errors.push(format!("License detection failed: {}", e));
1141        }
1142    }
1143
1144    Ok(())
1145}
1146
1147fn convert_detection_to_model(
1148    detection: &crate::license_detection::LicenseDetection,
1149    license_options: LicenseScanOptions,
1150    text_content: &str,
1151    query: Option<&Query<'_>>,
1152) -> (Option<LicenseDetection>, Vec<Match>) {
1153    let matches: Vec<Match> = detection
1154        .matches
1155        .iter()
1156        .map(|m| convert_match_to_model(m, license_options, text_content, query))
1157        .collect();
1158
1159    if let Some(license_expression) = detection.license_expression.clone() {
1160        (
1161            Some(LicenseDetection {
1162                license_expression,
1163                license_expression_spdx: detection
1164                    .license_expression_spdx
1165                    .clone()
1166                    .unwrap_or_default(),
1167                matches,
1168                detection_log: if license_options.include_diagnostics {
1169                    detection.detection_log.clone()
1170                } else {
1171                    Vec::new()
1172                },
1173                identifier: detection.identifier.clone(),
1174            }),
1175            Vec::new(),
1176        )
1177    } else {
1178        (None, matches)
1179    }
1180}
1181
1182fn convert_match_to_model(
1183    m: &crate::license_detection::models::LicenseMatch,
1184    license_options: LicenseScanOptions,
1185    text_content: &str,
1186    query: Option<&Query<'_>>,
1187) -> Match {
1188    let output_metric = |value: f32| ((value as f64) * 100.0).round() / 100.0;
1189    let rule_url = if m.rule_url.is_empty() {
1190        None
1191    } else {
1192        Some(m.rule_url.clone())
1193    };
1194    let matched_text = if license_options.include_text {
1195        m.matched_text.clone().or_else(|| {
1196            Some(crate::license_detection::query::matched_text_from_text(
1197                text_content,
1198                m.start_line,
1199                m.end_line,
1200            ))
1201        })
1202    } else {
1203        None
1204    };
1205    let matched_text_diagnostics = if license_options.include_text_diagnostics {
1206        query.map(|query| matched_text_diagnostics_from_match(query, m))
1207    } else {
1208        None
1209    };
1210    Match {
1211        license_expression: m.license_expression.clone(),
1212        license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
1213        from_file: m.from_file.clone(),
1214        start_line: m.start_line,
1215        end_line: m.end_line,
1216        matcher: Some(m.matcher.to_string()),
1217        score: output_metric(m.score),
1218        matched_length: Some(m.matched_length),
1219        match_coverage: Some(output_metric(m.coverage())),
1220        rule_relevance: Some(m.rule_relevance as usize),
1221        rule_identifier: Some(m.rule_identifier.clone()),
1222        rule_url,
1223        matched_text,
1224        referenced_filenames: m.referenced_filenames.clone(),
1225        matched_text_diagnostics,
1226    }
1227}
1228
1229fn compute_percentage_of_license_text(
1230    query: &Query<'_>,
1231    detections: &[crate::license_detection::LicenseDetection],
1232) -> f64 {
1233    let matched_positions: std::collections::HashSet<usize> = detections
1234        .iter()
1235        .flat_map(|detection| detection.matches.iter())
1236        .flat_map(|m| m.query_span().iter())
1237        .collect();
1238
1239    let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
1240    if query_tokens_length == 0 {
1241        return 0.0;
1242    }
1243
1244    let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
1245    (percentage * 100.0).round() / 100.0
1246}
1247
1248fn matched_text_diagnostics_from_match(
1249    query: &Query<'_>,
1250    license_match: &InternalLicenseMatch,
1251) -> String {
1252    let matched_positions: PositionSet = license_match.query_span().iter().collect();
1253    let Some(start_pos) = matched_positions.iter().min() else {
1254        return crate::license_detection::query::matched_text_from_text(
1255            &query.text,
1256            license_match.start_line,
1257            license_match.end_line,
1258        );
1259    };
1260    let Some(end_pos) = matched_positions.iter().max() else {
1261        return crate::license_detection::query::matched_text_from_text(
1262            &query.text,
1263            license_match.start_line,
1264            license_match.end_line,
1265        );
1266    };
1267
1268    crate::license_detection::query::matched_text_diagnostics_from_text(
1269        &query.text,
1270        query,
1271        &matched_positions,
1272        start_pos,
1273        end_pos,
1274        license_match.start_line,
1275        license_match.end_line,
1276    )
1277}
1278
/// Reports whether text-based detection should be skipped for a file.
///
/// Currently the only condition checked is `is_pem_certificate_file`, which is
/// given the same `path` and `buffer`.
fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
    is_pem_certificate_file(path, buffer)
}
1282
/// Reports whether `path` is a Go source file that is not production code.
///
/// - Files whose extension is not `go` yield `Ok(false)` without touching the
///   filesystem.
/// - Files named `*_test.go` yield `Ok(true)` without touching the filesystem.
/// - Otherwise the first 10 lines are inspected for a `//go:build` or
///   `// +build` line that has a whitespace-separated token equal to `test`.
///
/// Only the leading lines are read (through a buffered reader), so large files
/// are not loaded into memory and non-UTF-8 bytes beyond the inspected lines
/// no longer cause a failure.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or read, or
/// when one of the inspected lines is not valid UTF-8.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    const HEADER_LINES_TO_INSPECT: usize = 10;

    if path.extension().and_then(|ext| ext.to_str()) != Some("go") {
        return Ok(false);
    }

    if path
        .file_name()
        .and_then(|name| name.to_str())
        .is_some_and(|name| name.ends_with("_test.go"))
    {
        return Ok(true);
    }

    // Fully qualified `std::io` paths keep this block independent of the
    // file-level imports.
    let reader = std::io::BufReader::new(fs::File::open(path)?);
    for line in std::io::BufRead::lines(reader).take(HEADER_LINES_TO_INSPECT) {
        let line = line?;
        let trimmed = line.trim();
        let is_build_constraint =
            trimmed.starts_with("//go:build") || trimmed.starts_with("// +build");
        if is_build_constraint && trimmed.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
1303
1304fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
1305    let prefix_len = buffer.len().min(8192);
1306    let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
1307    let trimmed_lines: Vec<&str> = prefix
1308        .lines()
1309        .map(str::trim)
1310        .filter(|line| !line.is_empty())
1311        .take(64)
1312        .collect();
1313
1314    let Some(first_line) = trimmed_lines.first().copied() else {
1315        return false;
1316    };
1317
1318    PEM_CERTIFICATE_HEADERS
1319        .iter()
1320        .any(|(begin, end)| first_line == *begin && trimmed_lines.iter().any(|line| line == end))
1321}
1322
1323fn process_directory(
1324    path: &Path,
1325    _metadata: &fs::Metadata,
1326    collect_info: bool,
1327    license_enabled: bool,
1328) -> FileInfo {
1329    let name = path
1330        .file_name()
1331        .unwrap_or_default()
1332        .to_string_lossy()
1333        .to_string();
1334    let base_name = name.clone(); // For directories, base_name is the same as name
1335
1336    FileInfo {
1337        name,
1338        base_name,
1339        extension: "".to_string(),
1340        path: path.to_string_lossy().to_string(),
1341        file_type: FileType::Directory,
1342        mime_type: None,
1343        file_type_label: None,
1344        size: 0,
1345        date: None,
1346        sha1: None,
1347        md5: None,
1348        sha256: None,
1349        sha1_git: None,
1350        programming_language: None,
1351        package_data: Vec::new(),
1352        license_expression: None,
1353        license_detections: Vec::new(),
1354        license_clues: Vec::new(),
1355        percentage_of_license_text: license_enabled.then_some(0.0),
1356        copyrights: Vec::new(),
1357        holders: Vec::new(),
1358        authors: Vec::new(),
1359        emails: Vec::new(),
1360        urls: Vec::new(),
1361        for_packages: Vec::new(),
1362        scan_errors: Vec::new(),
1363        license_policy: None,
1364        is_binary: collect_info.then_some(false),
1365        is_text: collect_info.then_some(false),
1366        is_archive: collect_info.then_some(false),
1367        is_media: collect_info.then_some(false),
1368        is_source: collect_info.then_some(false),
1369        is_script: collect_info.then_some(false),
1370        files_count: collect_info.then_some(0),
1371        dirs_count: collect_info.then_some(0),
1372        size_count: collect_info.then_some(0),
1373        source_count: None,
1374        is_legal: false,
1375        is_manifest: false,
1376        is_readme: false,
1377        is_top_level: false,
1378        is_key_file: false,
1379        is_community: false,
1380        is_generated: None,
1381        facets: vec![],
1382        tallies: None,
1383    }
1384}
1385
1386#[cfg(test)]
1387mod tests {
1388    use super::{
1389        compute_percentage_of_license_text, convert_detection_to_model,
1390        extract_email_url_information, extract_named_author_from_binary_line,
1391        is_binary_string_author_candidate, is_binary_string_copyright_candidate,
1392        is_binary_string_email_candidate, is_binary_string_url_candidate,
1393        is_go_non_production_source,
1394    };
1395    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
1396    use crate::license_detection::index::LicenseIndex;
1397    use crate::license_detection::index::dictionary::TokenDictionary;
1398    use crate::license_detection::models::position_span::PositionSpan;
1399    use crate::license_detection::models::{LicenseMatch, MatchCoordinates, MatcherKind, RuleKind};
1400    use crate::license_detection::query::Query;
1401    use crate::models::{FileInfoBuilder, FileType};
1402    use crate::scanner::scan_options_fingerprint;
1403    use crate::scanner::{LicenseScanOptions, TextDetectionOptions};
1404    use std::fs;
1405    use tempfile::tempdir;
1406
1407    fn make_internal_match(rule_url: &str) -> LicenseMatch {
1408        LicenseMatch {
1409            rid: 0,
1410            license_expression: "mit".to_string(),
1411            license_expression_spdx: Some("MIT".to_string()),
1412            from_file: None,
1413            start_line: 1,
1414            end_line: 1,
1415            start_token: 0,
1416            end_token: 1,
1417            matcher: MatcherKind::Hash,
1418            score: 1.0,
1419            matched_length: 3,
1420            rule_length: 3,
1421            match_coverage: 100.0,
1422            rule_relevance: 100,
1423            rule_identifier: "mit.LICENSE".to_string(),
1424            rule_url: rule_url.to_string(),
1425            matched_text: Some("MIT".to_string()),
1426            referenced_filenames: None,
1427            rule_kind: RuleKind::Text,
1428            is_from_license: true,
1429            rule_start_token: 0,
1430            coordinates: MatchCoordinates::query_region(PositionSpan::empty()),
1431            candidate_resemblance: 0.0,
1432            candidate_containment: 0.0,
1433        }
1434    }
1435
1436    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
1437        InternalLicenseDetection {
1438            license_expression: Some("mit".to_string()),
1439            license_expression_spdx: Some("MIT".to_string()),
1440            matches: vec![make_internal_match(rule_url)],
1441            detection_log: vec![],
1442            identifier: Some("mit-test".to_string()),
1443            file_regions: Vec::new(),
1444        }
1445    }
1446
1447    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
1448        let dictionary = TokenDictionary::new_with_legalese(entries);
1449        let mut index = LicenseIndex::new(dictionary);
1450        index.len_legalese = len_legalese;
1451        index
1452    }
1453
1454    #[test]
1455    fn test_convert_detection_to_model_preserves_rule_url() {
1456        let detection = make_detection(
1457            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE",
1458        );
1459
1460        let (converted, clues) =
1461            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1462        let converted = converted.expect("detection should convert");
1463
1464        assert_eq!(
1465            converted.matches[0].rule_url.as_deref(),
1466            Some(
1467                "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE"
1468            )
1469        );
1470        assert!(clues.is_empty());
1471    }
1472
1473    #[test]
1474    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
1475        let detection = make_detection("");
1476
1477        let (converted, clues) =
1478            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1479        let converted = converted.expect("detection should convert");
1480
1481        assert_eq!(converted.matches[0].rule_url, None);
1482        assert!(clues.is_empty());
1483    }
1484
1485    #[test]
1486    fn test_convert_detection_to_model_rounds_match_coverage() {
1487        let mut detection = make_detection("");
1488        detection.matches[0].score = 81.82;
1489        detection.matches[0].match_coverage = 33.334;
1490
1491        let (converted, clues) =
1492            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
1493        let converted = converted.expect("detection should convert");
1494
1495        assert_eq!(converted.matches[0].score, 81.82);
1496        assert_eq!(converted.matches[0].match_coverage, Some(33.33));
1497        assert!(clues.is_empty());
1498    }
1499
1500    #[test]
1501    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
1502        let mut detection = make_detection(
1503            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
1504        );
1505        detection.license_expression = None;
1506        detection.license_expression_spdx = None;
1507        detection.identifier = None;
1508        detection.matches[0].license_expression = "unknown-license-reference".to_string();
1509        detection.matches[0].license_expression_spdx =
1510            Some("LicenseRef-scancode-unknown-license-reference".to_string());
1511        detection.matches[0].rule_identifier = "license-clue_1.RULE".to_string();
1512        detection.matches[0].rule_kind = RuleKind::Clue;
1513
1514        let (converted, clues) = convert_detection_to_model(
1515            &detection,
1516            LicenseScanOptions {
1517                include_text: true,
1518                min_score: 0,
1519                ..LicenseScanOptions::default()
1520            },
1521            "clue text",
1522            None,
1523        );
1524
1525        assert!(converted.is_none());
1526        assert_eq!(clues.len(), 1);
1527        assert_eq!(clues[0].license_expression, "unknown-license-reference");
1528        assert_eq!(
1529            clues[0].license_expression_spdx,
1530            "LicenseRef-scancode-unknown-license-reference"
1531        );
1532        assert_eq!(
1533            clues[0].rule_identifier.as_deref(),
1534            Some("license-clue_1.RULE")
1535        );
1536        assert_eq!(clues[0].matched_text.as_deref(), Some("MIT"));
1537        assert_eq!(clues[0].matched_text_diagnostics, None);
1538    }
1539
1540    #[test]
1541    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
1542        let text = concat!(
1543            "Reproduction and distribution of this file, with or without modification, are\n",
1544            "permitted in any medium without royalties provided the copyright notice\n",
1545            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
1546        );
1547        let index = create_test_index(
1548            &[
1549                ("reproduction", 0),
1550                ("distribution", 1),
1551                ("file", 2),
1552                ("without", 3),
1553                ("modification", 4),
1554                ("permitted", 5),
1555                ("medium", 6),
1556                ("royalties", 7),
1557                ("provided", 8),
1558                ("copyright", 9),
1559                ("notice", 10),
1560                ("preserved", 11),
1561                ("offered", 12),
1562                ("warranties", 13),
1563            ],
1564            14,
1565        );
1566        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
1567        let mut detection = make_detection(
1568            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
1569        );
1570        detection.detection_log = vec!["imperfect-match-coverage".to_string()];
1571        detection.matches[0].license_expression = "fsf-ap".to_string();
1572        detection.matches[0].license_expression_spdx = Some("FSFAP".to_string());
1573        detection.matches[0].rule_identifier = "fsf-ap.LICENSE".to_string();
1574        detection.matches[0].matched_text = None;
1575        detection.matches[0].start_line = 1;
1576        detection.matches[0].end_line = 3;
1577        detection.matches[0].start_token = 0;
1578        detection.matches[0].end_token = query.tokens.len();
1579        detection.matches[0].coordinates =
1580            MatchCoordinates::query_region(PositionSpan::from_positions(
1581                query
1582                    .tokens
1583                    .iter()
1584                    .enumerate()
1585                    .filter_map(|(idx, _)| (idx != 9).then_some(idx))
1586                    .collect::<Vec<_>>(),
1587            ));
1588        detection.identifier = Some("fsf_ap-test".to_string());
1589
1590        let (converted, clues) = convert_detection_to_model(
1591            &detection,
1592            LicenseScanOptions {
1593                include_text: true,
1594                include_text_diagnostics: true,
1595                include_diagnostics: true,
1596                unknown_licenses: false,
1597                min_score: 0,
1598            },
1599            text,
1600            Some(&query),
1601        );
1602        let converted = converted.expect("detection should convert");
1603
1604        assert!(clues.is_empty());
1605        assert_eq!(converted.detection_log, vec!["imperfect-match-coverage"]);
1606        assert_eq!(
1607            converted.matches[0].matched_text.as_deref(),
1608            Some(text.trim_end())
1609        );
1610        let diagnostics = converted.matches[0]
1611            .matched_text_diagnostics
1612            .as_deref()
1613            .expect("diagnostics should be present");
1614        assert!(diagnostics.contains('['));
1615        assert!(diagnostics.contains(']'));
1616        assert_ne!(diagnostics, text.trim_end());
1617    }
1618
1619    #[test]
1620    fn test_extract_email_url_information_skips_binary_string_text() {
1621        let mut builder = FileInfoBuilder::default();
1622        let options = TextDetectionOptions {
1623            collect_info: false,
1624            detect_packages: false,
1625            detect_application_packages: false,
1626            detect_system_packages: false,
1627            detect_packages_in_compiled: false,
1628            detect_copyrights: false,
1629            detect_generated: false,
1630            detect_emails: true,
1631            detect_urls: true,
1632            max_emails: 50,
1633            max_urls: 50,
1634            timeout_seconds: 120.0,
1635        };
1636
1637        extract_email_url_information(
1638            &mut builder,
1639            "contact 6h@fo.lwft and visit http://gmail.com/",
1640            &options,
1641            true,
1642        );
1643
1644        let file = builder
1645            .name("binary.bin".to_string())
1646            .base_name("binary".to_string())
1647            .extension(".bin".to_string())
1648            .path("binary.bin".to_string())
1649            .file_type(FileType::File)
1650            .size(1)
1651            .build()
1652            .expect("builder should produce file info");
1653
1654        assert!(file.emails.is_empty(), "emails: {:?}", file.emails);
1655        assert!(file.urls.is_empty(), "urls: {:?}", file.urls);
1656    }
1657
1658    #[test]
1659    fn test_extract_email_url_information_keeps_good_binary_contacts() {
1660        let mut builder = FileInfoBuilder::default();
1661        let options = TextDetectionOptions {
1662            collect_info: false,
1663            detect_packages: false,
1664            detect_application_packages: false,
1665            detect_system_packages: false,
1666            detect_packages_in_compiled: false,
1667            detect_copyrights: false,
1668            detect_generated: false,
1669            detect_emails: true,
1670            detect_urls: true,
1671            max_emails: 50,
1672            max_urls: 50,
1673            timeout_seconds: 120.0,
1674        };
1675
1676        extract_email_url_information(
1677            &mut builder,
1678            "report bugs to bug-coreutils@gnu.org and see https://www.gnu.org/software/coreutils/",
1679            &options,
1680            true,
1681        );
1682
1683        let file = builder
1684            .name("binary.bin".to_string())
1685            .base_name("binary".to_string())
1686            .extension(".bin".to_string())
1687            .path("binary.bin".to_string())
1688            .file_type(FileType::File)
1689            .size(1)
1690            .build()
1691            .expect("builder should produce file info");
1692
1693        assert_eq!(file.emails.len(), 1, "emails: {:?}", file.emails);
1694        assert_eq!(file.emails[0].email, "bug-coreutils@gnu.org");
1695        assert_eq!(file.urls.len(), 1, "urls: {:?}", file.urls);
1696        assert_eq!(file.urls[0].url, "https://www.gnu.org/software/coreutils/");
1697    }
1698
1699    #[test]
1700    fn test_extract_email_url_information_deduplicates_binary_emails_before_cap() {
1701        let mut builder = FileInfoBuilder::default();
1702        let options = TextDetectionOptions {
1703            collect_info: false,
1704            detect_packages: false,
1705            detect_application_packages: false,
1706            detect_system_packages: false,
1707            detect_packages_in_compiled: false,
1708            detect_copyrights: false,
1709            detect_generated: false,
1710            detect_emails: true,
1711            detect_urls: false,
1712            max_emails: 2,
1713            max_urls: 50,
1714            timeout_seconds: 120.0,
1715        };
1716
1717        extract_email_url_information(
1718            &mut builder,
1719            "first jakub@redhat.com second jakub@redhat.com third contyk@redhat.com",
1720            &options,
1721            true,
1722        );
1723
1724        let file = builder
1725            .name("binary.bin".to_string())
1726            .base_name("binary".to_string())
1727            .extension(".bin".to_string())
1728            .path("binary.bin".to_string())
1729            .file_type(FileType::File)
1730            .size(1)
1731            .build()
1732            .expect("builder should produce file info");
1733
1734        assert_eq!(file.emails.len(), 2, "emails: {:?}", file.emails);
1735        assert_eq!(file.emails[0].email, "jakub@redhat.com");
1736        assert_eq!(file.emails[1].email, "contyk@redhat.com");
1737    }
1738
1739    #[test]
1740    fn test_binary_string_copyright_candidate_rejects_gibberish_holder_text() {
1741        let gibberish = "(c) S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9) M0@9s J'@y DH@9Ih@y";
1742        assert!(!is_binary_string_copyright_candidate(gibberish));
1743    }
1744
1745    #[test]
1746    fn test_binary_string_copyright_candidate_keeps_real_notice() {
1747        let notice = "Copyright nexB and others (c) 2012";
1748        assert!(is_binary_string_copyright_candidate(notice));
1749    }
1750
1751    #[test]
1752    fn test_binary_string_copyright_candidate_rejects_changelog_phrase() {
1753        assert!(!is_binary_string_copyright_candidate(
1754            "Copyright - split out libs"
1755        ));
1756    }
1757
1758    #[test]
1759    fn test_binary_string_email_candidate_rejects_gibberish() {
1760        assert!(!is_binary_string_email_candidate("6h@fo.lwft"));
1761    }
1762
1763    #[test]
1764    fn test_binary_string_email_candidate_keeps_gnu_bug_address() {
1765        assert!(is_binary_string_email_candidate("bug-coreutils@gnu.org"));
1766    }
1767
1768    #[test]
1769    fn test_binary_string_url_candidate_rejects_short_fake_host() {
1770        assert!(!is_binary_string_url_candidate("http://ftp.so/"));
1771    }
1772
1773    #[test]
1774    fn test_binary_string_url_candidate_keeps_gnu_help_url() {
1775        assert!(is_binary_string_url_candidate(
1776            "https://www.gnu.org/software/coreutils/"
1777        ));
1778    }
1779
1780    #[test]
1781    fn test_binary_string_url_candidate_rejects_bare_root_domain() {
1782        assert!(!is_binary_string_url_candidate("http://gmail.com/"));
1783    }
1784
1785    #[test]
1786    fn test_binary_string_url_candidate_keeps_project_subdomain_root() {
1787        assert!(is_binary_string_url_candidate("http://gcc.gnu.org"));
1788    }
1789
1790    #[test]
1791    fn test_binary_string_url_candidate_keeps_long_org_root_domain() {
1792        assert!(is_binary_string_url_candidate("https://publicsuffix.org/"));
1793    }
1794
1795    #[test]
1796    fn test_binary_string_url_candidate_keeps_short_project_path() {
1797        assert!(is_binary_string_url_candidate("http://tukaani.org/xz/"));
1798    }
1799
1800    #[test]
1801    fn test_binary_string_author_candidate_keeps_named_author_with_email() {
1802        assert!(is_binary_string_author_candidate(
1803            "Andreas Schneider <asn@redhat.com>"
1804        ));
1805    }
1806
1807    #[test]
1808    fn test_binary_string_author_candidate_rejects_gibberish() {
1809        assert!(!is_binary_string_author_candidate(
1810            "S8@9 K @9 D @9 I,@9N(@ F@@9L,@ HD@9"
1811        ));
1812    }
1813
1814    #[test]
1815    fn test_binary_string_author_candidate_rejects_changelog_phrase() {
1816        assert!(!is_binary_string_author_candidate(
1817            "Developers can enable them. - revert news user back to"
1818        ));
1819    }
1820
1821    #[test]
1822    fn test_extract_named_author_from_binary_line_recovers_by_prefix() {
1823        assert_eq!(
1824            extract_named_author_from_binary_line("Patch by Andreas Schneider <asn@redhat.com>"),
1825            Some("Andreas Schneider <asn@redhat.com>".to_string())
1826        );
1827    }
1828
1829    #[test]
1830    fn test_extract_named_author_from_binary_line_recovers_parenthesized_email() {
1831        assert_eq!(
1832            extract_named_author_from_binary_line(
1833                "same for both OpenSSL and NSS by Rob Crittenden (rcritten@redhat.com)"
1834            ),
1835            Some("Rob Crittenden (rcritten@redhat.com)".to_string())
1836        );
1837    }
1838
1839    #[test]
1840    fn test_extract_named_author_from_binary_line_rejects_plain_changelog_packager_line() {
1841        assert_eq!(
1842            extract_named_author_from_binary_line(
1843                "Rob Crittenden <rcritten@redhat.com> - 3.11.7-9"
1844            ),
1845            None
1846        );
1847    }
1848
1849    #[test]
1850    fn test_extract_named_author_from_binary_line_keeps_email_only_review_author() {
1851        assert_eq!(
1852            extract_named_author_from_binary_line(
1853                "Changes as per initial review by panemade@gmail.com"
1854            ),
1855            Some("panemade@gmail.com".to_string())
1856        );
1857    }
1858
1859    #[test]
1860    fn test_binary_string_author_candidate_rejects_multiple_emails_on_one_line() {
1861        assert!(!is_binary_string_author_candidate(
1862            "Rob Crittenden (rcritten@redhat.com) jakub@redhat.com"
1863        ));
1864    }
1865
1866    #[test]
1867    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
1868        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
1869        let text = "alpha MIT omega";
1870        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
1871        let mut detection = make_detection("");
1872        detection.matches[0].coordinates =
1873            MatchCoordinates::query_region(PositionSpan::from_positions(vec![1]));
1874        detection.matches[0].start_token = 1;
1875        detection.matches[0].end_token = 2;
1876
1877        let percentage = compute_percentage_of_license_text(&query, &[detection]);
1878
1879        assert_eq!(percentage, 33.33);
1880    }
1881
1882    #[test]
1883    fn test_scan_options_fingerprint_changes_with_license_score() {
1884        let text_options = crate::scanner::TextDetectionOptions::default();
1885        let default_fingerprint = scan_options_fingerprint(
1886            &text_options,
1887            LicenseScanOptions {
1888                min_score: 0,
1889                ..LicenseScanOptions::default()
1890            },
1891            None,
1892        );
1893        let filtered_fingerprint = scan_options_fingerprint(
1894            &text_options,
1895            LicenseScanOptions {
1896                min_score: 70,
1897                ..LicenseScanOptions::default()
1898            },
1899            None,
1900        );
1901
1902        assert_ne!(default_fingerprint, filtered_fingerprint);
1903    }
1904
1905    #[test]
1906    fn test_is_go_non_production_source_for_test_filename() {
1907        let temp_dir = tempdir().unwrap();
1908        let path = temp_dir.path().join("scanner_test.go");
1909        fs::write(&path, "package scanner\n").unwrap();
1910
1911        assert!(is_go_non_production_source(&path).unwrap());
1912    }
1913
1914    #[test]
1915    fn test_is_go_non_production_source_for_build_tag() {
1916        let temp_dir = tempdir().unwrap();
1917        let path = temp_dir.path().join("scanner.go");
1918        fs::write(&path, "//go:build test\n\npackage scanner\n").unwrap();
1919
1920        assert!(is_go_non_production_source(&path).unwrap());
1921    }
1922
1923    #[test]
1924    fn test_is_go_non_production_source_for_regular_go_file() {
1925        let temp_dir = tempdir().unwrap();
1926        let path = temp_dir.path().join("scanner.go");
1927        fs::write(&path, "package scanner\n").unwrap();
1928
1929        assert!(!is_go_non_production_source(&path).unwrap());
1930    }
1931}