Skip to main content

provenant/scanner/
process.rs

1use crate::license_detection::LicenseDetectionEngine;
2use crate::parsers::try_parse_file;
3use crate::utils::hash::{calculate_md5, calculate_sha1, calculate_sha256};
4use crate::utils::language::detect_language;
5use crate::utils::text::{is_source, remove_verbatim_escape_sequences};
6use anyhow::Error;
7use mime_guess::from_path;
8use rayon::prelude::*;
9use std::fs::{self};
10use std::path::Path;
11use std::sync::Arc;
12use std::time::{Duration, Instant};
13
14use crate::cache::{CachedScanFindings, read_cached_findings, write_cached_findings};
15use crate::copyright::{
16    self, AuthorDetection, CopyrightDetection, CopyrightDetectionOptions, HolderDetection,
17};
18use crate::finder::{self, DetectionConfig};
19use crate::license_detection::models::LicenseMatch as InternalLicenseMatch;
20use crate::license_detection::query::Query;
21use crate::models::{
22    Author, Copyright, FileInfo, FileInfoBuilder, FileType, Holder, LicenseDetection, Match,
23    OutputEmail, OutputURL,
24};
25use crate::progress::ScanProgress;
26use crate::scanner::collect::CollectedPaths;
27use crate::scanner::{LicenseScanOptions, ProcessResult, TextDetectionOptions};
28use crate::utils::file::{ExtractedTextKind, extract_text_for_detection, get_creation_date};
29use crate::utils::generated::generated_code_hints_from_bytes;
30
/// BEGIN/END marker pairs that identify PEM-encoded certificate content.
///
/// Each tuple is `(begin_marker, end_marker)`. `is_pem_certificate_file` compares
/// these against whole, whitespace-trimmed lines of a file's leading bytes.
const PEM_CERTIFICATE_HEADERS: &[(&str, &str)] = &[
    ("-----BEGIN CERTIFICATE-----", "-----END CERTIFICATE-----"),
    (
        "-----BEGIN TRUSTED CERTIFICATE-----",
        "-----END TRUSTED CERTIFICATE-----",
    ),
];
38
39pub fn process_collected(
40    collected: &CollectedPaths,
41    progress: Arc<ScanProgress>,
42    license_engine: Option<Arc<LicenseDetectionEngine>>,
43    license_options: LicenseScanOptions,
44    text_options: &TextDetectionOptions,
45) -> ProcessResult {
46    let mut all_files: Vec<FileInfo> = collected
47        .files
48        .par_iter()
49        .map(|(path, metadata)| {
50            let file_entry = process_file(
51                path,
52                metadata,
53                license_engine.clone(),
54                license_options,
55                text_options,
56            );
57            progress.file_completed(path, metadata.len(), &file_entry.scan_errors);
58            file_entry
59        })
60        .collect();
61
62    for (path, metadata) in &collected.directories {
63        all_files.push(process_directory(
64            path,
65            metadata,
66            text_options.collect_info,
67            license_engine.is_some(),
68        ));
69    }
70
71    ProcessResult {
72        files: all_files,
73        excluded_count: collected.excluded_count,
74    }
75}
76
77fn process_file(
78    path: &Path,
79    metadata: &fs::Metadata,
80    license_engine: Option<Arc<LicenseDetectionEngine>>,
81    license_options: LicenseScanOptions,
82    text_options: &TextDetectionOptions,
83) -> FileInfo {
84    let mut scan_errors: Vec<String> = vec![];
85    let mut file_info_builder = FileInfoBuilder::default();
86    let license_enabled = license_engine.is_some();
87
88    let started = Instant::now();
89
90    let mut generated_flag = None;
91    match extract_information_from_content(
92        &mut file_info_builder,
93        &mut scan_errors,
94        path,
95        license_engine,
96        license_options,
97        text_options,
98    ) {
99        Ok(is_generated) => generated_flag = is_generated,
100        Err(e) => scan_errors.push(e.to_string()),
101    };
102
103    if is_timeout_exceeded(started, text_options.timeout_seconds) {
104        scan_errors.push(format!(
105            "Processing interrupted due to timeout after {:.2} seconds",
106            text_options.timeout_seconds
107        ));
108    }
109
110    let mut file_info = file_info_builder
111        .name(path.file_name().unwrap().to_string_lossy().to_string())
112        .base_name(
113            path.file_stem()
114                .unwrap_or_default()
115                .to_string_lossy()
116                .to_string(),
117        )
118        .extension(
119            path.extension()
120                .map_or("".to_string(), |ext| format!(".{}", ext.to_string_lossy())),
121        )
122        .path(path.to_string_lossy().to_string())
123        .file_type(FileType::File)
124        .mime_type(Some(
125            from_path(path)
126                .first_or_octet_stream()
127                .essence_str()
128                .to_string(),
129        ))
130        .size(metadata.len())
131        .date(get_creation_date(metadata))
132        .scan_errors(scan_errors)
133        .build()
134        .expect("FileInformationBuild not completely initialized");
135
136    if text_options.collect_info {
137        file_info.is_source = Some(is_source(path));
138    }
139
140    if file_info.programming_language.as_deref() == Some("Go")
141        && is_go_non_production_source(path).unwrap_or(false)
142    {
143        file_info.is_source = Some(false);
144    }
145
146    if text_options.detect_generated {
147        file_info.is_generated = Some(generated_flag.unwrap_or(false));
148    }
149
150    if file_info.percentage_of_license_text.is_none() && license_enabled {
151        file_info.percentage_of_license_text = Some(0.0);
152    }
153
154    if let (Some(scan_results_dir), Some(sha256)) = (
155        text_options.scan_cache_dir.as_deref(),
156        file_info.sha256.as_deref(),
157    ) && file_info.scan_errors.is_empty()
158    {
159        let findings = CachedScanFindings::from_file_info(&file_info);
160        let options_fingerprint =
161            scan_cache_fingerprint(text_options, license_options, license_enabled);
162        if let Err(err) =
163            write_cached_findings(scan_results_dir, sha256, &options_fingerprint, &findings)
164        {
165            file_info
166                .scan_errors
167                .push(format!("Failed to write scan cache entry: {err}"));
168        }
169    }
170
171    file_info
172}
173
/// Reads the file at `path` and records everything derived from its content on
/// `file_info_builder`.
///
/// Stages, in order:
/// 1. Read the whole file and record SHA-1, MD5, SHA-256 and the detected language.
/// 2. Return early when `should_skip_text_detection` says so.
/// 3. When a scan cache directory is configured, look up cached findings by SHA-256 and
///    options fingerprint; on a hit the cached values are copied to the builder and the
///    function returns. A failed cache read is pushed to `scan_errors` and scanning
///    continues.
/// 4. Parse package data when `text_options.detect_packages` is on.
/// 5. Extract text, then run copyright, email/URL and license detection on it.
///
/// The elapsed time since entry is checked against `text_options.timeout_seconds`
/// between stages.
///
/// # Returns
///
/// `Ok(Some(flag))` when `text_options.detect_generated` is on, where `flag` tells
/// whether `generated_code_hints_from_bytes` returned any hints; `Ok(None)` otherwise.
///
/// # Errors
///
/// Returns an error when the file cannot be read, when a timeout checkpoint is
/// exceeded, or when `extract_license_information` returns an error.
fn extract_information_from_content(
    file_info_builder: &mut FileInfoBuilder,
    scan_errors: &mut Vec<String>,
    path: &Path,
    license_engine: Option<Arc<LicenseDetectionEngine>>,
    license_options: LicenseScanOptions,
    text_options: &TextDetectionOptions,
) -> Result<Option<bool>, Error> {
    let started = Instant::now();
    let buffer = fs::read(path)?;
    let license_enabled = license_engine.is_some();

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout while reading file content (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    let sha256 = calculate_sha256(&buffer);
    // `None` when generated-code detection is disabled, `Some(bool)` otherwise.
    let is_generated = text_options
        .detect_generated
        .then(|| !generated_code_hints_from_bytes(&buffer).is_empty());

    file_info_builder
        .sha1(Some(calculate_sha1(&buffer)))
        .md5(Some(calculate_md5(&buffer)))
        .sha256(Some(sha256.clone()))
        .programming_language(Some(detect_language(path, &buffer)));

    if should_skip_text_detection(path, &buffer) {
        return Ok(is_generated);
    }

    if let Some(scan_results_dir) = text_options.scan_cache_dir.as_deref() {
        let options_fingerprint =
            scan_cache_fingerprint(text_options, license_options, license_enabled);
        match read_cached_findings(scan_results_dir, &sha256, &options_fingerprint) {
            // Cache hit: reuse the stored findings and skip all detection work below.
            Ok(Some(findings)) => {
                file_info_builder
                    .package_data(findings.package_data)
                    .license_expression(findings.license_expression)
                    .license_detections(findings.license_detections)
                    .license_clues(findings.license_clues)
                    .percentage_of_license_text(findings.percentage_of_license_text)
                    .copyrights(findings.copyrights)
                    .holders(findings.holders)
                    .authors(findings.authors)
                    .emails(findings.emails)
                    .urls(findings.urls)
                    .programming_language(findings.programming_language);
                return Ok(is_generated);
            }
            Ok(None) => {}
            // A broken cache entry is reported but does not stop the fresh scan.
            Err(err) => {
                scan_errors.push(format!("Failed to read scan cache for {:?}: {}", path, err));
            }
        }
    }

    // Package parsing and text-based detection (copyright, license) are independent.
    // Python ScanCode runs all enabled plugins on every file, so we do the same.
    if text_options.detect_packages
        && let Some(parse_result) = try_parse_file(path)
    {
        file_info_builder.package_data(parse_result.packages);
        scan_errors.extend(parse_result.scan_errors);
    }

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout while extracting package/text metadata (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    let (text_content, text_kind) = extract_text_for_detection(path, &buffer);
    let from_binary_strings = matches!(text_kind, ExtractedTextKind::BinaryStrings);

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout while extracting text content (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }

    // Nothing to run text-based detectors on.
    if text_content.is_empty() {
        return Ok(is_generated);
    }

    if text_options.detect_copyrights {
        extract_copyright_information(
            file_info_builder,
            path,
            &text_content,
            text_options.timeout_seconds,
            from_binary_strings,
        );
    }
    extract_email_url_information(file_info_builder, &text_content, text_options);

    if is_timeout_exceeded(started, text_options.timeout_seconds) {
        return Err(Error::msg(format!(
            "Timeout before license scan (> {:.2}s)",
            text_options.timeout_seconds
        )));
    }
    // Handle source map files specially
    let text_content_for_license_detection = if crate::utils::sourcemap::is_sourcemap(path) {
        if let Some(sourcemap_content) =
            crate::utils::sourcemap::extract_sourcemap_content(&text_content)
        {
            sourcemap_content
        } else {
            text_content
        }
    } else if is_source(path) {
        remove_verbatim_escape_sequences(&text_content)
    } else {
        text_content
    };

    extract_license_information(
        file_info_builder,
        scan_errors,
        path,
        text_content_for_license_detection,
        license_engine,
        license_options,
        from_binary_strings,
    )?;

    Ok(is_generated)
}
308
/// Reports whether more than `timeout_seconds` have elapsed since `started`.
///
/// A non-finite, zero, or negative `timeout_seconds` never counts as exceeded.
fn is_timeout_exceeded(started: Instant, timeout_seconds: f64) -> bool {
    if !timeout_seconds.is_finite() || timeout_seconds <= 0.0 {
        return false;
    }

    let elapsed_seconds = started.elapsed().as_secs_f64();
    elapsed_seconds > timeout_seconds
}
314
315fn scan_cache_fingerprint(
316    text_options: &TextDetectionOptions,
317    license_options: LicenseScanOptions,
318    license_enabled: bool,
319) -> String {
320    format!(
321        "packages={};copyrights={};emails={};urls={};max_emails={};max_urls={};timeout={:.6};license_enabled={};license_text={};license_text_diagnostics={};license_diagnostics={};unknown_licenses={}",
322        text_options.detect_packages,
323        text_options.detect_copyrights,
324        text_options.detect_emails,
325        text_options.detect_urls,
326        text_options.max_emails,
327        text_options.max_urls,
328        text_options.timeout_seconds,
329        license_enabled,
330        license_options.include_text,
331        license_options.include_text_diagnostics,
332        license_options.include_diagnostics,
333        license_options.unknown_licenses,
334    )
335}
336
337fn extract_copyright_information(
338    file_info_builder: &mut FileInfoBuilder,
339    path: &Path,
340    text_content: &str,
341    timeout_seconds: f64,
342    from_binary_strings: bool,
343) {
344    // CREDITS files get special handling (Linux kernel style).
345    if copyright::is_credits_file(path) {
346        let author_detections = copyright::detect_credits_authors(text_content);
347        if !author_detections.is_empty() {
348            file_info_builder.authors(
349                author_detections
350                    .into_iter()
351                    .map(|a| Author {
352                        author: a.author,
353                        start_line: a.start_line,
354                        end_line: a.end_line,
355                    })
356                    .collect(),
357            );
358            return;
359        }
360    }
361
362    let copyright_options = CopyrightDetectionOptions {
363        max_runtime: if timeout_seconds.is_finite() && timeout_seconds > 0.0 {
364            Some(Duration::from_secs_f64(timeout_seconds))
365        } else {
366            None
367        },
368        ..CopyrightDetectionOptions::default()
369    };
370
371    let (copyrights, holders, authors) =
372        copyright::detect_copyrights_with_options(text_content, &copyright_options);
373    let (copyrights, holders, authors) = if from_binary_strings {
374        prune_binary_string_detections(copyrights, holders, authors)
375    } else {
376        (copyrights, holders, authors)
377    };
378
379    file_info_builder.copyrights(
380        copyrights
381            .into_iter()
382            .map(|c| Copyright {
383                copyright: c.copyright,
384                start_line: c.start_line,
385                end_line: c.end_line,
386            })
387            .collect::<Vec<Copyright>>(),
388    );
389    file_info_builder.holders(
390        holders
391            .into_iter()
392            .map(|h| Holder {
393                holder: h.holder,
394                start_line: h.start_line,
395                end_line: h.end_line,
396            })
397            .collect::<Vec<Holder>>(),
398    );
399    file_info_builder.authors(
400        authors
401            .into_iter()
402            .map(|a| Author {
403                author: a.author,
404                start_line: a.start_line,
405                end_line: a.end_line,
406            })
407            .collect::<Vec<Author>>(),
408    );
409}
410
411fn prune_binary_string_detections(
412    copyrights: Vec<CopyrightDetection>,
413    holders: Vec<HolderDetection>,
414    _authors: Vec<AuthorDetection>,
415) -> (
416    Vec<CopyrightDetection>,
417    Vec<HolderDetection>,
418    Vec<AuthorDetection>,
419) {
420    let kept_copyrights: Vec<CopyrightDetection> = copyrights
421        .into_iter()
422        .filter(|c| is_binary_string_copyright_candidate(&c.copyright))
423        .collect();
424
425    let kept_holders: Vec<HolderDetection> = holders
426        .into_iter()
427        .filter(|holder| {
428            kept_copyrights.iter().any(|copyright| {
429                ranges_overlap(
430                    holder.start_line,
431                    holder.end_line,
432                    copyright.start_line,
433                    copyright.end_line,
434                )
435            })
436        })
437        .collect();
438
439    (kept_copyrights, kept_holders, Vec::new())
440}
441
/// Returns `true` when the inclusive ranges `[a_start, a_end]` and `[b_start, b_end]`
/// share at least one position.
fn ranges_overlap(a_start: usize, a_end: usize, b_start: usize, b_end: usize) -> bool {
    let a_entirely_after_b = a_start > b_end;
    let b_entirely_after_a = b_start > a_end;
    !(a_entirely_after_b || b_entirely_after_a)
}
445
446fn is_binary_string_copyright_candidate(text: &str) -> bool {
447    if has_explicit_copyright_marker(text) || contains_year(text) {
448        return true;
449    }
450
451    let lower = text.to_ascii_lowercase();
452    let Some(tail) = lower.strip_prefix("copyright") else {
453        return true;
454    };
455    let tail = tail.trim();
456    let alpha_tokens: Vec<&str> = tail
457        .split_whitespace()
458        .filter(|token| token.chars().any(|c| c.is_alphabetic()))
459        .collect();
460
461    if alpha_tokens.len() <= 1 {
462        return true;
463    }
464
465    if tail.contains(',') || tail.contains(" and ") || tail.contains('&') {
466        return true;
467    }
468
469    alpha_tokens
470        .iter()
471        .any(|token| is_company_like_suffix(token.trim_matches(|c: char| !c.is_alphanumeric())))
472}
473
/// Returns `true` when `text` contains `(c)`, `©` or `copr`, ignoring ASCII case.
fn has_explicit_copyright_marker(text: &str) -> bool {
    let folded = text.to_ascii_lowercase();
    ["(c)", "©", "copr"]
        .iter()
        .any(|marker| folded.contains(*marker))
}
478
/// Returns `true` when `text` contains four consecutive ASCII digits of the form
/// `19xx` or `20xx`.
///
/// The two leading digits are matched as a pair, so runs such as `1024` or `2999` are
/// not mistaken for years. No word boundary is required: a year embedded in a longer
/// digit run still counts.
fn contains_year(text: &str) -> bool {
    text.as_bytes().windows(4).any(|window| {
        matches!(
            window,
            [b'1', b'9', tens, ones] | [b'2', b'0', tens, ones]
                if tens.is_ascii_digit() && ones.is_ascii_digit()
        )
    })
}
487
/// Returns `true` when `token` equals, ignoring ASCII case, one of the organisation
/// words listed below (for example `inc`, `ltd`, `gmbh`, `foundation`).
fn is_company_like_suffix(token: &str) -> bool {
    const COMPANY_LIKE_SUFFIXES: &[&str] = &[
        "inc",
        "corp",
        "corporation",
        "co",
        "company",
        "ltd",
        "llc",
        "gmbh",
        "foundation",
        "project",
        "systems",
        "software",
        "technologies",
        "technology",
    ];

    COMPANY_LIKE_SUFFIXES
        .iter()
        .any(|suffix| token.eq_ignore_ascii_case(suffix))
}
507
508fn extract_email_url_information(
509    file_info_builder: &mut FileInfoBuilder,
510    text_content: &str,
511    text_options: &TextDetectionOptions,
512) {
513    if !text_options.detect_emails && !text_options.detect_urls {
514        return;
515    }
516
517    if text_options.detect_emails {
518        let config = DetectionConfig {
519            max_emails: text_options.max_emails,
520            max_urls: text_options.max_urls,
521            unique: false,
522        };
523        let emails = finder::find_emails(text_content, &config)
524            .into_iter()
525            .map(|d| OutputEmail {
526                email: d.email,
527                start_line: d.start_line,
528                end_line: d.end_line,
529            })
530            .collect::<Vec<_>>();
531        file_info_builder.emails(emails);
532    }
533
534    if text_options.detect_urls {
535        let config = DetectionConfig {
536            max_emails: text_options.max_emails,
537            max_urls: text_options.max_urls,
538            unique: true,
539        };
540        let urls = finder::find_urls(text_content, &config)
541            .into_iter()
542            .map(|d| OutputURL {
543                url: d.url,
544                start_line: d.start_line,
545                end_line: d.end_line,
546            })
547            .collect::<Vec<_>>();
548        file_info_builder.urls(urls);
549    }
550}
551
552fn extract_license_information(
553    file_info_builder: &mut FileInfoBuilder,
554    scan_errors: &mut Vec<String>,
555    path: &Path,
556    text_content: String,
557    license_engine: Option<Arc<LicenseDetectionEngine>>,
558    license_options: LicenseScanOptions,
559    from_binary_strings: bool,
560) -> Result<(), Error> {
561    let Some(engine) = license_engine else {
562        return Ok(());
563    };
564
565    match engine.detect_with_kind_and_source(
566        &text_content,
567        license_options.unknown_licenses,
568        from_binary_strings,
569        &path.to_string_lossy(),
570    ) {
571        Ok(detections) => {
572            let query =
573                Query::from_extracted_text(&text_content, engine.index(), from_binary_strings).ok();
574            let mut model_detections = Vec::new();
575            let mut model_clues = Vec::new();
576
577            for detection in &detections {
578                let (public_detection, clue_matches) = convert_detection_to_model(
579                    detection,
580                    license_options,
581                    &text_content,
582                    query.as_ref(),
583                );
584
585                if let Some(public_detection) = public_detection {
586                    model_detections.push(public_detection);
587                }
588
589                model_clues.extend(clue_matches);
590            }
591
592            if !model_detections.is_empty() {
593                let expressions: Vec<String> = model_detections
594                    .iter()
595                    .filter(|d| !d.license_expression_spdx.is_empty())
596                    .map(|d| d.license_expression_spdx.clone())
597                    .collect();
598
599                if !expressions.is_empty() {
600                    let combined = crate::utils::spdx::combine_license_expressions(expressions);
601                    if let Some(expr) = combined {
602                        file_info_builder.license_expression(Some(expr));
603                    }
604                }
605            }
606
607            file_info_builder.license_detections(model_detections);
608            file_info_builder.license_clues(model_clues);
609            file_info_builder.percentage_of_license_text(
610                query
611                    .as_ref()
612                    .map(|query| compute_percentage_of_license_text(query, &detections)),
613            );
614        }
615        Err(e) => {
616            scan_errors.push(format!("License detection failed: {}", e));
617        }
618    }
619
620    Ok(())
621}
622
623fn convert_detection_to_model(
624    detection: &crate::license_detection::LicenseDetection,
625    license_options: LicenseScanOptions,
626    text_content: &str,
627    query: Option<&Query<'_>>,
628) -> (Option<LicenseDetection>, Vec<Match>) {
629    let matches: Vec<Match> = detection
630        .matches
631        .iter()
632        .map(|m| convert_match_to_model(m, license_options, text_content, query))
633        .collect();
634
635    if let Some(license_expression) = detection.license_expression.clone() {
636        (
637            Some(LicenseDetection {
638                license_expression,
639                license_expression_spdx: detection
640                    .license_expression_spdx
641                    .clone()
642                    .unwrap_or_default(),
643                matches,
644                detection_log: if license_options.include_diagnostics {
645                    detection.detection_log.clone()
646                } else {
647                    Vec::new()
648                },
649                identifier: detection.identifier.clone(),
650            }),
651            Vec::new(),
652        )
653    } else {
654        (None, matches)
655    }
656}
657
658fn convert_match_to_model(
659    m: &crate::license_detection::models::LicenseMatch,
660    license_options: LicenseScanOptions,
661    text_content: &str,
662    query: Option<&Query<'_>>,
663) -> Match {
664    let rule_url = if m.rule_url.is_empty() {
665        None
666    } else {
667        Some(m.rule_url.clone())
668    };
669    let matched_text = if license_options.include_text {
670        m.matched_text.clone().or_else(|| {
671            Some(crate::license_detection::query::matched_text_from_text(
672                text_content,
673                m.start_line,
674                m.end_line,
675            ))
676        })
677    } else {
678        None
679    };
680    let matched_text_diagnostics = if license_options.include_text_diagnostics {
681        query.map(|query| matched_text_diagnostics_from_match(query, m))
682    } else {
683        None
684    };
685    Match {
686        license_expression: m.license_expression.clone(),
687        license_expression_spdx: m.license_expression_spdx.clone().unwrap_or_default(),
688        from_file: m.from_file.clone(),
689        start_line: m.start_line,
690        end_line: m.end_line,
691        matcher: Some(m.matcher.to_string()),
692        score: m.score as f64,
693        matched_length: Some(m.matched_length),
694        match_coverage: Some(m.match_coverage as f64),
695        rule_relevance: Some(m.rule_relevance as usize),
696        rule_identifier: Some(m.rule_identifier.clone()),
697        rule_url,
698        matched_text,
699        referenced_filenames: m.referenced_filenames.clone(),
700        matched_text_diagnostics,
701    }
702}
703
704fn compute_percentage_of_license_text(
705    query: &Query<'_>,
706    detections: &[crate::license_detection::LicenseDetection],
707) -> f64 {
708    let matched_positions: std::collections::HashSet<usize> = detections
709        .iter()
710        .flat_map(|detection| detection.matches.iter())
711        .flat_map(InternalLicenseMatch::qspan)
712        .collect();
713
714    let query_tokens_length = query.tokens.len() + query.unknowns_by_pos.values().sum::<usize>();
715    if query_tokens_length == 0 {
716        return 0.0;
717    }
718
719    let percentage = (matched_positions.len() as f64 / query_tokens_length as f64) * 100.0;
720    (percentage * 100.0).round() / 100.0
721}
722
723fn matched_text_diagnostics_from_match(
724    query: &Query<'_>,
725    license_match: &InternalLicenseMatch,
726) -> String {
727    let matched_positions: std::collections::HashSet<usize> =
728        license_match.qspan().into_iter().collect();
729    let Some(start_pos) = matched_positions.iter().min().copied() else {
730        return crate::license_detection::query::matched_text_from_text(
731            &query.text,
732            license_match.start_line,
733            license_match.end_line,
734        );
735    };
736    let Some(end_pos) = matched_positions.iter().max().copied() else {
737        return crate::license_detection::query::matched_text_from_text(
738            &query.text,
739            license_match.start_line,
740            license_match.end_line,
741        );
742    };
743
744    crate::license_detection::query::matched_text_diagnostics_from_text(
745        &query.text,
746        query,
747        &matched_positions,
748        start_pos,
749        end_pos,
750        license_match.start_line,
751        license_match.end_line,
752    )
753}
754
/// Returns `true` when text-based detection should be bypassed for this file.
///
/// Currently the only criterion is `is_pem_certificate_file`.
fn should_skip_text_detection(path: &Path, buffer: &[u8]) -> bool {
    is_pem_certificate_file(path, buffer)
}
758
/// Tells whether a `.go` file is test-only code.
///
/// Returns `Ok(false)` for any path whose extension is not `go`. A file whose name ends
/// in `_test.go` is reported as non-production without being read. Otherwise the file is
/// read as UTF-8 and its first ten lines are checked for a `//go:build` or `// +build`
/// line that has `test` as a whitespace-separated token.
///
/// # Errors
///
/// Returns the I/O error from reading the file.
fn is_go_non_production_source(path: &Path) -> std::io::Result<bool> {
    let has_go_extension = path.extension().and_then(|ext| ext.to_str()) == Some("go");
    if !has_go_extension {
        return Ok(false);
    }

    let named_like_test = path
        .file_name()
        .and_then(|name| name.to_str())
        .map_or(false, |name| name.ends_with("_test.go"));
    if named_like_test {
        return Ok(true);
    }

    let source = fs::read_to_string(path)?;
    for raw_line in source.lines().take(10) {
        let line = raw_line.trim();
        let is_build_constraint = line.starts_with("//go:build") || line.starts_with("// +build");
        if is_build_constraint && line.split_whitespace().any(|token| token == "test") {
            return Ok(true);
        }
    }

    Ok(false)
}
779
780fn is_pem_certificate_file(_path: &Path, buffer: &[u8]) -> bool {
781    let prefix_len = buffer.len().min(8192);
782    let prefix = String::from_utf8_lossy(&buffer[..prefix_len]);
783    let trimmed_lines: Vec<&str> = prefix
784        .lines()
785        .map(str::trim)
786        .filter(|line| !line.is_empty())
787        .take(64)
788        .collect();
789
790    PEM_CERTIFICATE_HEADERS.iter().any(|(begin, end)| {
791        trimmed_lines.iter().any(|line| line == begin)
792            && trimmed_lines.iter().any(|line| line == end)
793    })
794}
795
796fn process_directory(
797    path: &Path,
798    metadata: &fs::Metadata,
799    collect_info: bool,
800    license_enabled: bool,
801) -> FileInfo {
802    let name = path
803        .file_name()
804        .unwrap_or_default()
805        .to_string_lossy()
806        .to_string();
807    let base_name = name.clone(); // For directories, base_name is the same as name
808
809    FileInfo {
810        name,
811        base_name,
812        extension: "".to_string(),
813        path: path.to_string_lossy().to_string(),
814        file_type: FileType::Directory,
815        mime_type: None,
816        size: 0,
817        date: get_creation_date(metadata),
818        sha1: None,
819        md5: None,
820        sha256: None,
821        programming_language: None,
822        package_data: Vec::new(), // TODO: implement
823        license_expression: None,
824        license_detections: Vec::new(), // TODO: implement
825        license_clues: Vec::new(),      // TODO: implement
826        percentage_of_license_text: license_enabled.then_some(0.0),
827        copyrights: Vec::new(), // TODO: implement
828        holders: Vec::new(),    // TODO: implement
829        authors: Vec::new(),    // TODO: implement
830        emails: Vec::new(),     // TODO: implement
831        urls: Vec::new(),       // TODO: implement
832        for_packages: Vec::new(),
833        scan_errors: Vec::new(),
834        is_source: collect_info.then_some(false),
835        source_count: None,
836        is_legal: false,
837        is_manifest: false,
838        is_readme: false,
839        is_top_level: false,
840        is_key_file: false,
841        is_community: false,
842        is_generated: None,
843        facets: vec![],
844        tallies: None,
845    }
846}
847
848#[cfg(test)]
849mod tests {
850    use super::{
851        compute_percentage_of_license_text, convert_detection_to_model, is_go_non_production_source,
852    };
853    use crate::license_detection::LicenseDetection as InternalLicenseDetection;
854    use crate::license_detection::index::LicenseIndex;
855    use crate::license_detection::index::dictionary::TokenDictionary;
856    use crate::license_detection::models::{LicenseMatch, MatcherKind, RuleKind};
857    use crate::license_detection::query::Query;
858    use crate::scanner::LicenseScanOptions;
859    use std::fs;
860    use tempfile::tempdir;
861
862    fn make_internal_match(rule_url: &str) -> LicenseMatch {
863        LicenseMatch {
864            rid: 0,
865            license_expression: "mit".to_string(),
866            license_expression_spdx: Some("MIT".to_string()),
867            from_file: None,
868            start_line: 1,
869            end_line: 1,
870            start_token: 0,
871            end_token: 1,
872            matcher: MatcherKind::Hash,
873            score: 1.0,
874            matched_length: 3,
875            rule_length: 3,
876            match_coverage: 100.0,
877            rule_relevance: 100,
878            rule_identifier: "mit.LICENSE".to_string(),
879            rule_url: rule_url.to_string(),
880            matched_text: Some("MIT".to_string()),
881            referenced_filenames: None,
882            rule_kind: RuleKind::Text,
883            is_from_license: true,
884            matched_token_positions: None,
885            hilen: 3,
886            rule_start_token: 0,
887            qspan_positions: None,
888            ispan_positions: None,
889            hispan_positions: None,
890            candidate_resemblance: 0.0,
891            candidate_containment: 0.0,
892        }
893    }
894
895    fn make_detection(rule_url: &str) -> InternalLicenseDetection {
896        InternalLicenseDetection {
897            license_expression: Some("mit".to_string()),
898            license_expression_spdx: Some("MIT".to_string()),
899            matches: vec![make_internal_match(rule_url)],
900            detection_log: vec![],
901            identifier: Some("mit-test".to_string()),
902            file_regions: Vec::new(),
903        }
904    }
905
906    fn create_test_index(entries: &[(&str, u16)], len_legalese: usize) -> LicenseIndex {
907        let dictionary = TokenDictionary::new_with_legalese(entries);
908        let mut index = LicenseIndex::new(dictionary);
909        index.len_legalese = len_legalese;
910        index
911    }
912
913    #[test]
914    fn test_convert_detection_to_model_preserves_rule_url() {
915        let detection = make_detection(
916            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE",
917        );
918
919        let (converted, clues) =
920            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
921        let converted = converted.expect("detection should convert");
922
923        assert_eq!(
924            converted.matches[0].rule_url.as_deref(),
925            Some(
926                "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/mit.LICENSE"
927            )
928        );
929        assert!(clues.is_empty());
930    }
931
932    #[test]
933    fn test_convert_detection_to_model_emits_null_for_empty_rule_url() {
934        let detection = make_detection("");
935
936        let (converted, clues) =
937            convert_detection_to_model(&detection, LicenseScanOptions::default(), "", None);
938        let converted = converted.expect("detection should convert");
939
940        assert_eq!(converted.matches[0].rule_url, None);
941        assert!(clues.is_empty());
942    }
943
944    #[test]
945    fn test_convert_detection_to_model_routes_expressionless_detection_to_license_clues() {
946        let mut detection = make_detection(
947            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/license-clue_1.RULE",
948        );
949        detection.license_expression = None;
950        detection.license_expression_spdx = None;
951        detection.identifier = None;
952        detection.matches[0].license_expression = "unknown-license-reference".to_string();
953        detection.matches[0].license_expression_spdx =
954            Some("LicenseRef-scancode-unknown-license-reference".to_string());
955        detection.matches[0].rule_identifier = "license-clue_1.RULE".to_string();
956        detection.matches[0].rule_kind = RuleKind::Clue;
957
958        let (converted, clues) = convert_detection_to_model(
959            &detection,
960            LicenseScanOptions {
961                include_text: true,
962                ..LicenseScanOptions::default()
963            },
964            "clue text",
965            None,
966        );
967
968        assert!(converted.is_none());
969        assert_eq!(clues.len(), 1);
970        assert_eq!(clues[0].license_expression, "unknown-license-reference");
971        assert_eq!(
972            clues[0].license_expression_spdx,
973            "LicenseRef-scancode-unknown-license-reference"
974        );
975        assert_eq!(
976            clues[0].rule_identifier.as_deref(),
977            Some("license-clue_1.RULE")
978        );
979        assert_eq!(clues[0].matched_text.as_deref(), Some("MIT"));
980        assert_eq!(clues[0].matched_text_diagnostics, None);
981    }
982
983    #[test]
984    fn test_convert_detection_to_model_includes_diagnostics_when_enabled() {
985        let text = concat!(
986            "Reproduction and distribution of this file, with or without modification, are\n",
987            "permitted in any medium without royalties provided the copyright notice\n",
988            "and this notice are preserved. This file is offered as-is, without any warranties.\n",
989        );
990        let index = create_test_index(
991            &[
992                ("reproduction", 0),
993                ("distribution", 1),
994                ("file", 2),
995                ("without", 3),
996                ("modification", 4),
997                ("permitted", 5),
998                ("medium", 6),
999                ("royalties", 7),
1000                ("provided", 8),
1001                ("copyright", 9),
1002                ("notice", 10),
1003                ("preserved", 11),
1004                ("offered", 12),
1005                ("warranties", 13),
1006            ],
1007            14,
1008        );
1009        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
1010        let mut detection = make_detection(
1011            "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/fsf-ap.LICENSE",
1012        );
1013        detection.detection_log = vec!["imperfect-match-coverage".to_string()];
1014        detection.matches[0].license_expression = "fsf-ap".to_string();
1015        detection.matches[0].license_expression_spdx = Some("FSFAP".to_string());
1016        detection.matches[0].rule_identifier = "fsf-ap.LICENSE".to_string();
1017        detection.matches[0].matched_text = None;
1018        detection.matches[0].start_line = 1;
1019        detection.matches[0].end_line = 3;
1020        detection.matches[0].start_token = 0;
1021        detection.matches[0].end_token = query.tokens.len();
1022        detection.matches[0].qspan_positions = Some(
1023            query
1024                .tokens
1025                .iter()
1026                .enumerate()
1027                .filter_map(|(idx, _)| (idx != 9).then_some(idx))
1028                .collect(),
1029        );
1030        detection.identifier = Some("fsf_ap-test".to_string());
1031
1032        let (converted, clues) = convert_detection_to_model(
1033            &detection,
1034            LicenseScanOptions {
1035                include_text: true,
1036                include_text_diagnostics: true,
1037                include_diagnostics: true,
1038                unknown_licenses: false,
1039            },
1040            text,
1041            Some(&query),
1042        );
1043        let converted = converted.expect("detection should convert");
1044
1045        assert!(clues.is_empty());
1046        assert_eq!(converted.detection_log, vec!["imperfect-match-coverage"]);
1047        assert_eq!(
1048            converted.matches[0].matched_text.as_deref(),
1049            Some(text.trim_end())
1050        );
1051        let diagnostics = converted.matches[0]
1052            .matched_text_diagnostics
1053            .as_deref()
1054            .expect("diagnostics should be present");
1055        assert!(diagnostics.contains('['));
1056        assert!(diagnostics.contains(']'));
1057        assert_ne!(diagnostics, text.trim_end());
1058    }
1059
1060    #[test]
1061    fn test_compute_percentage_of_license_text_counts_unknown_tokens() {
1062        let index = create_test_index(&[("alpha", 0), ("mit", 1)], 2);
1063        let text = "alpha MIT omega";
1064        let query = Query::from_extracted_text(text, &index, false).expect("query should build");
1065        let mut detection = make_detection("");
1066        detection.matches[0].qspan_positions = Some(vec![1]);
1067        detection.matches[0].start_token = 1;
1068        detection.matches[0].end_token = 2;
1069
1070        let percentage = compute_percentage_of_license_text(&query, &[detection]);
1071
1072        assert_eq!(percentage, 33.33);
1073    }
1074
1075    #[test]
1076    fn test_is_go_non_production_source_for_test_filename() {
1077        let temp_dir = tempdir().unwrap();
1078        let path = temp_dir.path().join("scanner_test.go");
1079        fs::write(&path, "package scanner\n").unwrap();
1080
1081        assert!(is_go_non_production_source(&path).unwrap());
1082    }
1083
1084    #[test]
1085    fn test_is_go_non_production_source_for_build_tag() {
1086        let temp_dir = tempdir().unwrap();
1087        let path = temp_dir.path().join("scanner.go");
1088        fs::write(&path, "//go:build test\n\npackage scanner\n").unwrap();
1089
1090        assert!(is_go_non_production_source(&path).unwrap());
1091    }
1092
1093    #[test]
1094    fn test_is_go_non_production_source_for_regular_go_file() {
1095        let temp_dir = tempdir().unwrap();
1096        let path = temp_dir.path().join("scanner.go");
1097        fs::write(&path, "package scanner\n").unwrap();
1098
1099        assert!(!is_go_non_production_source(&path).unwrap());
1100    }
1101}