Skip to main content

provenant/utils/
file.rs

1use std::borrow::Cow;
2use std::collections::BTreeSet;
3use std::fs;
4use std::io::{BufReader, Cursor, Read};
5use std::panic::{AssertUnwindSafe, catch_unwind};
6use std::path::Path;
7
8use chrono::{TimeZone, Utc};
9use file_format::{FileFormat, Kind as FileFormatKind};
10use flate2::read::ZlibDecoder;
11use glob::Pattern;
12use image::{ImageDecoder, ImageFormat, ImageReader};
13use mime_guess::from_path;
14use quick_xml::events::Event;
15use quick_xml::reader::Reader as XmlReader;
16
17use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
18use crate::utils::font::extract_font_metadata_text;
19use crate::utils::language::detect_language;
20
/// Identifies which extraction strategy produced the text handed to detection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No usable text was extracted.
    None,
    /// Text obtained by decoding the bytes directly (also reported for RTF input).
    Decoded,
    /// Text returned by the font metadata extractor.
    FontMetadata,
    /// Text returned by the PDF extractor.
    Pdf,
    /// Printable strings pulled out of binary content.
    BinaryStrings,
    /// Text returned by the image metadata extractor.
    ImageMetadata,
    /// Only Windows executable metadata was available.
    WindowsExecutableMetadata,
}
31
/// Classification summary computed for a single file by `classify_file_info`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type as resolved by `detect_mime_type`.
    pub mime_type: String,
    /// Human-readable file type description as produced by `detect_file_type`.
    pub file_type: String,
    /// Detected language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content was classified as binary.
    pub is_binary: bool,
    /// Always the negation of `is_binary`.
    pub is_text: bool,
    /// Whether the file was classified as an archive (never true for text files).
    pub is_archive: bool,
    /// Whether the file was classified as image, audio or video content.
    pub is_media: bool,
    /// Whether the file was classified as source code.
    pub is_source: bool,
    /// Whether the file was classified as a script.
    pub is_script: bool,
}
44
// NOTE(review): not referenced in the visible part of this file; presumably caps how many
// image metadata values are collected — confirm against the image metadata extractor.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
// NOTE(review): not referenced in the visible part of this file; presumably caps the size of
// the extracted image metadata text — confirm against the image metadata extractor.
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
// A buffer counts as binary when its control bytes exceed `len / DIVISOR`
// (see `has_binary_control_chars`).
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
// NOTE(review): not referenced in the visible part of this file; presumably the size above
// which opaque binaries are skipped or sampled — confirm against its users.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
// Lowercase extensions that `is_plain_text` treats as plain text rather than source code.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
// Lowercase extensions that `detect_is_binary` treats as binary unless the sniffed format is
// textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
// Lowercase extensions that `detect_is_archive` treats as archives for non-text files.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
59
60/// Get the last modified date of a file as a `YYYY-MM-DD` string.
61pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
62    metadata.modified().ok().map(|time: std::time::SystemTime| {
63        let seconds_since_epoch = time
64            .duration_since(std::time::UNIX_EPOCH)
65            .unwrap()
66            .as_secs() as i64;
67
68        Utc.timestamp_opt(seconds_since_epoch, 0)
69            .single()
70            .unwrap_or_else(Utc::now)
71            .format("%Y-%m-%d")
72            .to_string()
73    })
74}
75
76/// Check if a path should be excluded based on a list of glob patterns.
77pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
78    let path_str = path.to_string_lossy();
79    let file_name = path
80        .file_name()
81        .map(|name| name.to_string_lossy())
82        .unwrap_or_default();
83
84    for pattern in exclude_patterns {
85        // Match against full path
86        if pattern.matches(&path_str) {
87            return true;
88        }
89
90        // Match against just the file/directory name
91        if pattern.matches(&file_name) {
92            return true;
93        }
94    }
95
96    false
97}
98
99/// Decode a byte buffer to a String, trying UTF-8 first, then Latin-1.
100///
101/// Latin-1 (ISO-8859-1) maps bytes 0x00-0xFF directly to Unicode U+0000-U+00FF,
102/// so it can decode any byte sequence. This matches Python ScanCode's use of
103/// `UnicodeDammit` which auto-detects encoding with Latin-1 as fallback.
104pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
105    match String::from_utf8(bytes.to_vec()) {
106        Ok(s) => s,
107        Err(e) => {
108            let bytes = e.into_bytes();
109            if has_binary_control_chars(&bytes) {
110                return String::new();
111            }
112            bytes.iter().map(|&b| b as char).collect()
113        }
114    }
115}
116
117pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
118    let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
119    (text, kind)
120}
121
122pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
123    let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
124        return Cow::Borrowed(text);
125    };
126    if !matches!(
127        extension.to_ascii_lowercase().as_str(),
128        "md" | "markdown" | "html" | "htm"
129    ) {
130        return Cow::Borrowed(text);
131    }
132
133    let mut hints = Vec::new();
134    if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
135        hints.push("Creative Commons Attribution 4.0 International License".to_string());
136    }
137    if text.contains("Apache License (Version 2.0)") || text.contains("Apache License, Version 2.0")
138    {
139        hints.push(
140            "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
141                .to_string(),
142        );
143    }
144
145    hints.extend(extract_shields_license_badge_hints(text));
146
147    if hints.is_empty() {
148        Cow::Borrowed(text)
149    } else {
150        let mut augmented =
151            String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
152        augmented.push_str(text);
153        augmented.push_str("\n\n");
154        for (index, hint) in hints.into_iter().enumerate() {
155            if index > 0 {
156                augmented.push('\n');
157            }
158            augmented.push_str(&hint);
159        }
160        Cow::Owned(augmented)
161    }
162}
163
164fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
165    let mut hints = Vec::new();
166    let mut rest = text;
167    let needle = "img.shields.io/badge/license-";
168
169    while let Some(index) = rest.find(needle) {
170        let start = index + needle.len();
171        let suffix = &rest[start..];
172        let end = suffix
173            .find([')', ']', '"', '\'', ' ', '\n'])
174            .unwrap_or(suffix.len());
175        let badge = &suffix[..end];
176        let Some(badge) = badge.strip_suffix(".svg") else {
177            rest = &suffix[end..];
178            continue;
179        };
180
181        let mut segments: Vec<_> = badge
182            .split('-')
183            .filter(|segment| !segment.is_empty())
184            .collect();
185        if segments.len() < 2 {
186            rest = &suffix[end..];
187            continue;
188        }
189        segments.pop();
190        let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
191        if !candidate.is_empty() {
192            hints.push(canonical_shields_license_hint(&candidate));
193        }
194
195        rest = &suffix[end..];
196    }
197
198    hints.sort();
199    hints.dedup();
200    hints
201}
202
/// Turn a badge license label into a phrase suitable for license detection.
///
/// `MIT` and the two Apache 2.0 spellings map to fixed phrases; any other label is trimmed
/// and suffixed with ` License`.
fn canonical_shields_license_hint(candidate: &str) -> String {
    let name = candidate.trim();
    if name == "MIT" {
        "The MIT License".to_owned()
    } else if name == "Apache-2.0" || name == "Apache 2.0" {
        "Apache License 2.0".to_owned()
    } else {
        format!("{name} License")
    }
}
210
211pub(crate) fn extract_text_for_detection_with_diagnostics(
212    path: &Path,
213    bytes: &[u8],
214) -> (String, ExtractedTextKind, Option<String>) {
215    let ext = path
216        .extension()
217        .and_then(|e| e.to_str())
218        .map(|s| s.to_ascii_lowercase());
219    let detected_format = detect_file_format(bytes);
220
221    if looks_like_rtf(bytes, ext.as_deref()) {
222        let text = extract_rtf_text(bytes);
223        return if text.trim().is_empty() {
224            (String::new(), ExtractedTextKind::None, None)
225        } else {
226            (text, ExtractedTextKind::Decoded, None)
227        };
228    }
229
230    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
231        let (text, scan_error) = extract_pdf_text(path, bytes);
232        return if text.is_empty() {
233            (String::new(), ExtractedTextKind::None, scan_error)
234        } else {
235            (text, ExtractedTextKind::Pdf, None)
236        };
237    }
238
239    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
240        let text = extract_image_metadata_text(bytes, format);
241        return if text.is_empty() {
242            if is_supported_image_container(bytes, format) {
243                (String::new(), ExtractedTextKind::None, None)
244            } else {
245                let decoded = decode_bytes_to_string(bytes);
246                if decoded.is_empty() {
247                    (String::new(), ExtractedTextKind::None, None)
248                } else {
249                    (decoded, ExtractedTextKind::Decoded, None)
250                }
251            }
252        } else {
253            (text, ExtractedTextKind::ImageMetadata, None)
254        };
255    }
256
257    if let Some(text) = extract_font_metadata_text(path, bytes) {
258        return (text, ExtractedTextKind::FontMetadata, None);
259    }
260
261    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
262    let large_opaque_binary = windows_executable_metadata_text.is_none()
263        && is_large_opaque_binary_candidate(bytes, detected_format);
264
265    if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
266        return windows_metadata_or_empty_result(windows_executable_metadata_text);
267    }
268
269    if should_skip_binary_string_extraction(path, bytes, detected_format) {
270        return (String::new(), ExtractedTextKind::None, None);
271    }
272
273    if !large_opaque_binary {
274        let decoded = decode_bytes_to_string(bytes);
275        if !decoded.is_empty() {
276            let combined =
277                combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
278            return (combined, ExtractedTextKind::Decoded, None);
279        }
280    }
281
282    let text = if large_opaque_binary {
283        extract_sampled_printable_strings(bytes)
284    } else {
285        extract_printable_strings(bytes)
286    };
287    if text.is_empty() {
288        windows_metadata_or_empty_result(windows_executable_metadata_text)
289    } else {
290        (
291            combine_extracted_text_fragments(windows_executable_metadata_text, text),
292            ExtractedTextKind::BinaryStrings,
293            None,
294        )
295    }
296}
297
/// Join an optional leading fragment and a trailing fragment with a newline.
///
/// A missing or empty `prefix` yields `suffix` unchanged; an empty `suffix` yields the
/// prefix alone, so no stray newline is ever produced.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(lead) = prefix.filter(|text| !text.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        lead
    } else {
        format!("{lead}\n{suffix}")
    }
}
305
306fn windows_metadata_or_empty_result(
307    windows_executable_metadata_text: Option<String>,
308) -> (String, ExtractedTextKind, Option<String>) {
309    if let Some(metadata_text) = windows_executable_metadata_text {
310        (
311            metadata_text,
312            ExtractedTextKind::WindowsExecutableMetadata,
313            None,
314        )
315    } else {
316        (String::new(), ExtractedTextKind::None, None)
317    }
318}
319
320pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
321    let detected_format = detect_file_format(bytes);
322    let detected_language = detect_language(path, bytes);
323    let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
324    let is_text = !is_binary;
325    let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
326    let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
327    let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
328    let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
329    let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
330    let programming_language = is_source.then(|| detected_language.clone()).flatten();
331    let file_type = detect_file_type(
332        path,
333        bytes,
334        detected_format,
335        &mime_type,
336        programming_language.as_deref(),
337        is_binary,
338        is_text,
339        is_archive,
340        is_media,
341        is_script,
342    );
343
344    FileInfoClassification {
345        mime_type,
346        file_type,
347        programming_language,
348        is_binary,
349        is_text,
350        is_archive,
351        is_media,
352        is_source,
353        is_script,
354    }
355}
356
357fn detect_file_format(bytes: &[u8]) -> FileFormat {
358    FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
359}
360
/// Report whether the whole buffer is valid UTF-8 (an empty buffer is).
fn is_utf8_text(bytes: &[u8]) -> bool {
    let decoded = std::str::from_utf8(bytes);
    decoded.is_ok()
}
364
365fn has_binary_control_chars(bytes: &[u8]) -> bool {
366    let control_count = bytes
367        .iter()
368        .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
369        .count();
370    control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
371}
372
373fn has_decodable_text(bytes: &[u8]) -> bool {
374    bytes.is_empty() || is_utf8_text(bytes) || !has_binary_control_chars(bytes)
375}
376
377fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
378    if bytes.is_empty() || is_utf8_text(bytes) {
379        return true;
380    }
381
382    let printable_count = bytes
383        .iter()
384        .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
385        .count();
386    printable_count * 2 >= bytes.len()
387}
388
/// Report whether a media type denotes textual content.
///
/// Accepts every `text/*` type, `application/json`, `application/xml`, and any type with a
/// `+json` or `+xml` structured-syntax suffix.
fn is_textual_media_type(media_type: &str) -> bool {
    // `text/xml` needs no separate check: the `text/` prefix already covers it.
    if media_type.starts_with("text/") {
        return true;
    }
    if media_type == "application/json" || media_type == "application/xml" {
        return true;
    }
    ["+json", "+xml"]
        .iter()
        .any(|suffix| media_type.ends_with(*suffix))
}
398
399fn is_textual_format(detected_format: FileFormat) -> bool {
400    matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
401        || is_textual_media_type(detected_format.media_type())
402}
403
404fn is_known_binary_format(detected_format: FileFormat) -> bool {
405    !matches!(detected_format, FileFormat::ArbitraryBinaryData)
406        && !is_textual_format(detected_format)
407}
408
409pub fn detect_mime_type(
410    path: &Path,
411    bytes: &[u8],
412    detected_format: FileFormat,
413    programming_language: Option<&str>,
414) -> String {
415    if bytes.is_empty() {
416        return "inode/x-empty".to_string();
417    }
418
419    if is_zip_archive(bytes) {
420        return detect_zip_like_mime(path);
421    }
422
423    if looks_like_deb(bytes, path) {
424        return "application/vnd.debian.binary-package".to_string();
425    }
426
427    if looks_like_rpm(bytes, path) {
428        return "application/x-rpm".to_string();
429    }
430
431    let guessed_mime = from_path(path)
432        .first_or_octet_stream()
433        .essence_str()
434        .to_string();
435
436    let mime_type = match detected_format {
437        FileFormat::Empty => "inode/x-empty".to_string(),
438        FileFormat::PlainText => {
439            if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
440                "text/plain".to_string()
441            } else {
442                guessed_mime.clone()
443            }
444        }
445        _ => {
446            let detected_mime = detected_format.media_type();
447            if detected_mime == "application/octet-stream"
448                && guessed_mime != "application/octet-stream"
449            {
450                guessed_mime.clone()
451            } else {
452                detected_mime.to_string()
453            }
454        }
455    };
456
457    normalize_mime_type(path, bytes, programming_language, &mime_type)
458}
459
460fn normalize_mime_type(
461    path: &Path,
462    bytes: &[u8],
463    programming_language: Option<&str>,
464    mime_type: &str,
465) -> String {
466    if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
467        return "text/plain".to_string();
468    }
469
470    mime_type.to_string()
471}
472
473fn should_prefer_text_mime(
474    path: &Path,
475    bytes: &[u8],
476    programming_language: Option<&str>,
477    mime_type: &str,
478) -> bool {
479    has_decodable_text(bytes)
480        && looks_like_textual_bytes(bytes)
481        && is_textual_source_candidate(path, programming_language)
482        && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
483}
484
485fn detect_is_binary(
486    path: &Path,
487    bytes: &[u8],
488    detected_format: FileFormat,
489    programming_language: Option<&str>,
490) -> bool {
491    if is_textual_format(detected_format) {
492        return false;
493    }
494
495    if lower_extension(path)
496        .as_deref()
497        .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
498    {
499        return true;
500    }
501
502    if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
503        return false;
504    }
505
506    has_binary_control_chars(bytes)
507        || is_known_binary_format(detected_format)
508        || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
509            && !looks_like_textual_bytes(bytes))
510}
511
512fn should_treat_binary_bytes_as_text(
513    path: &Path,
514    bytes: &[u8],
515    programming_language: Option<&str>,
516) -> bool {
517    has_decodable_text(bytes)
518        && looks_like_textual_bytes(bytes)
519        && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
520}
521
522fn detect_is_archive(
523    path: &Path,
524    bytes: &[u8],
525    mime_type: &str,
526    is_text: bool,
527    detected_format: FileFormat,
528) -> bool {
529    if is_text {
530        return false;
531    }
532
533    lower_extension(path)
534        .as_deref()
535        .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
536        || matches!(
537            detected_format.kind(),
538            FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
539        )
540        || is_zip_archive(bytes)
541        || looks_like_gzip(bytes)
542        || looks_like_bzip2(bytes)
543        || looks_like_xz(bytes)
544        || looks_like_deb(bytes, path)
545        || looks_like_rpm(bytes, path)
546        || looks_like_squashfs(bytes, path)
547        || mime_type.contains("zip")
548        || mime_type.contains("compressed")
549        || mime_type.contains("tar")
550        || mime_type.contains("x-rpm")
551        || mime_type.contains("debian")
552}
553
554fn detect_is_media(
555    path: &Path,
556    bytes: &[u8],
557    mime_type: &str,
558    detected_format: FileFormat,
559) -> bool {
560    media_mime_from_content(bytes).is_some()
561        || matches!(
562            detected_format.kind(),
563            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
564        )
565        || mime_type.starts_with("image/")
566        || mime_type.starts_with("audio/")
567        || mime_type.starts_with("video/")
568        || (mime_type == "application/octet-stream"
569            && lower_extension(path).as_deref() == Some("tga")
570            && !has_binary_control_chars(bytes))
571}
572
573fn detect_is_script(
574    path: &Path,
575    bytes: &[u8],
576    programming_language: Option<&str>,
577    is_text: bool,
578) -> bool {
579    if !is_text || is_makefile(path) {
580        return false;
581    }
582
583    bytes.starts_with(b"#!")
584        || lower_extension(path).as_deref().is_some_and(|ext| {
585            matches!(
586                ext,
587                "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
588            )
589        })
590        || matches!(
591            programming_language,
592            Some("Shell" | "Python" | "Ruby" | "Perl" | "PHP" | "PowerShell" | "Awk")
593        )
594}
595
596fn detect_is_source(
597    path: &Path,
598    programming_language: Option<&str>,
599    is_text: bool,
600    is_script: bool,
601) -> bool {
602    if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
603        return false;
604    }
605
606    if is_c_like_source(path) || is_java_like_source(path) {
607        return true;
608    }
609
610    programming_language.is_some() || is_script
611}
612
613#[allow(clippy::too_many_arguments)]
614fn detect_file_type(
615    path: &Path,
616    bytes: &[u8],
617    detected_format: FileFormat,
618    mime_type: &str,
619    programming_language: Option<&str>,
620    is_binary: bool,
621    is_text: bool,
622    is_archive: bool,
623    is_media: bool,
624    is_script: bool,
625) -> String {
626    if bytes.is_empty() {
627        return "empty".to_string();
628    }
629
630    if looks_like_pdf(bytes) {
631        return "PDF document".to_string();
632    }
633
634    if let Some(file_type) = media_file_type_from_content(bytes) {
635        return file_type.to_string();
636    }
637
638    if is_archive {
639        return archive_file_type(path, bytes, detected_format);
640    }
641
642    if is_script {
643        return script_file_type(programming_language, bytes);
644    }
645
646    if is_text {
647        if lower_extension(path).as_deref() == Some("json") {
648            return "JSON text data".to_string();
649        }
650        if lower_extension(path).as_deref() == Some("xml") {
651            return "XML text data".to_string();
652        }
653        if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
654            return "YAML text data".to_string();
655        }
656        if lower_extension(path).as_deref() == Some("toml") {
657            return "TOML text data".to_string();
658        }
659        if matches!(
660            lower_extension(path).as_deref(),
661            Some("ini" | "cfg" | "conf")
662        ) {
663            return "INI text data".to_string();
664        }
665        if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
666            return "Git configuration text".to_string();
667        }
668        if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
669            return text_file_type(bytes);
670        }
671        if programming_language.is_some() && !is_media {
672            return text_file_type(bytes);
673        }
674        return text_file_type(bytes);
675    }
676
677    if let Some(file_type) = format_based_file_type(detected_format) {
678        return file_type;
679    }
680
681    if is_binary && mime_type == "application/octet-stream" {
682        return "data".to_string();
683    }
684
685    mime_type.to_string()
686}
687
688fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
689    if matches!(programming_language, Some(language) if is_source_like_language(language)) {
690        return true;
691    }
692
693    if matches!(
694        lower_file_name(path).as_str(),
695        "dockerfile"
696            | "containerfile"
697            | "containerfile.core"
698            | "apkbuild"
699            | "podfile"
700            | "meson.build"
701            | "build"
702            | "workspace"
703            | "buck"
704            | "default.nix"
705            | "flake.nix"
706            | "shell.nix"
707    ) {
708        return true;
709    }
710
711    path.extension()
712        .and_then(|ext| ext.to_str())
713        .is_some_and(|ext| {
714            matches!(
715                ext.to_ascii_lowercase().as_str(),
716                "rs" | "py"
717                    | "js"
718                    | "mjs"
719                    | "cjs"
720                    | "jsx"
721                    | "ts"
722                    | "mts"
723                    | "cts"
724                    | "tsx"
725                    | "c"
726                    | "cpp"
727                    | "cc"
728                    | "cxx"
729                    | "h"
730                    | "hpp"
731                    | "m"
732                    | "mm"
733                    | "s"
734                    | "asm"
735                    | "java"
736                    | "go"
737                    | "rb"
738                    | "php"
739                    | "pl"
740                    | "swift"
741                    | "sh"
742                    | "bash"
743                    | "zsh"
744                    | "fish"
745                    | "ksh"
746                    | "ps1"
747                    | "psm1"
748                    | "psd1"
749                    | "awk"
750                    | "kt"
751                    | "kts"
752                    | "dart"
753                    | "scala"
754                    | "groovy"
755                    | "gradle"
756                    | "gvy"
757                    | "gy"
758                    | "gsh"
759                    | "cs"
760                    | "fs"
761                    | "fsx"
762                    | "r"
763                    | "lua"
764                    | "jl"
765                    | "ex"
766                    | "exs"
767                    | "clj"
768                    | "cljs"
769                    | "cljc"
770                    | "hs"
771                    | "erl"
772                    | "nix"
773                    | "zig"
774                    | "bzl"
775                    | "bazel"
776                    | "star"
777                    | "sky"
778                    | "ml"
779                    | "mli"
780                    | "tex"
781            )
782        })
783}
784
/// Report whether a detected language name is one of the recognised source languages.
///
/// The comparison is exact and case-sensitive.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
    ];

    SOURCE_LANGUAGES.contains(&language)
}
830
/// Return the path's extension as `&str`, or `None` when it has no extension or the
/// extension is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    let raw = path.extension()?;
    raw.to_str()
}
834
835fn lower_extension(path: &Path) -> Option<String> {
836    extension(path).map(|ext| ext.to_ascii_lowercase())
837}
838
/// Return the final path component lowercased (ASCII only).
///
/// Yields an empty string when the path has no final component or it is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
845
846fn is_plain_text(path: &Path) -> bool {
847    lower_extension(path)
848        .as_deref()
849        .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
850}
851
852fn is_makefile(path: &Path) -> bool {
853    matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
854}
855
/// Report whether the path ends in `.js.map` or `.css.map` (case-insensitive).
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
860
861fn is_c_like_source(path: &Path) -> bool {
862    lower_extension(path).as_deref().is_some_and(|ext| {
863        matches!(
864            ext,
865            "c" | "cc"
866                | "cp"
867                | "cpp"
868                | "cxx"
869                | "c++"
870                | "h"
871                | "hh"
872                | "hpp"
873                | "hxx"
874                | "h++"
875                | "i"
876                | "ii"
877                | "m"
878                | "s"
879                | "asm"
880        )
881    })
882}
883
884fn is_java_like_source(path: &Path) -> bool {
885    lower_extension(path)
886        .as_deref()
887        .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
888}
889
890fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
891    match detected_format {
892        FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
893        format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
894        format => Some(match format.kind() {
895            FileFormatKind::Image => short_name_or_name(&format, "image data"),
896            FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
897            FileFormatKind::Video => short_name_or_name(&format, "video data"),
898            _ => format.name().to_string(),
899        }),
900    }
901}
902
903fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
904    format
905        .short_name()
906        .map(|short_name| format!("{short_name} {suffix}"))
907        .unwrap_or_else(|| format!("{} {suffix}", format.name()))
908}
909
910fn detect_zip_like_mime(path: &Path) -> String {
911    match extension(path).map(|ext| ext.to_ascii_lowercase()) {
912        Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
913        Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
914            "application/java-archive".to_string()
915        }
916        _ => "application/zip".to_string(),
917    }
918}
919
/// Identify PNG, JPEG, TIFF or WebP content by its leading bytes and return its MIME type.
///
/// Returns `None` for anything else.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PNG_SIGNATURE: &[u8] = b"\x89PNG\r\n\x1a\n";
    const JPEG_SIGNATURE: &[u8] = &[0xff, 0xd8, 0xff];
    const TIFF_LITTLE_ENDIAN: &[u8] = b"II\x2a\x00";
    const TIFF_BIG_ENDIAN: &[u8] = b"MM\x00\x2a";

    if bytes.starts_with(PNG_SIGNATURE) {
        return Some("image/png");
    }
    if bytes.starts_with(JPEG_SIGNATURE) {
        return Some("image/jpeg");
    }
    if bytes.starts_with(TIFF_LITTLE_ENDIAN) || bytes.starts_with(TIFF_BIG_ENDIAN) {
        return Some("image/tiff");
    }

    // WebP is a RIFF container whose form type, at offset 8, is "WEBP".
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    is_webp.then_some("image/webp")
}
933
/// Identify PNG, JPEG, TIFF or WebP content by its leading bytes and return a
/// human-readable description.
///
/// Returns `None` for anything else.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PNG_SIGNATURE: &[u8] = b"\x89PNG\r\n\x1a\n";
    const JPEG_SIGNATURE: &[u8] = &[0xff, 0xd8, 0xff];
    const TIFF_LITTLE_ENDIAN: &[u8] = b"II\x2a\x00";
    const TIFF_BIG_ENDIAN: &[u8] = b"MM\x00\x2a";

    if bytes.starts_with(PNG_SIGNATURE) {
        return Some("PNG image data");
    }
    if bytes.starts_with(JPEG_SIGNATURE) {
        return Some("JPEG image data");
    }
    if bytes.starts_with(TIFF_LITTLE_ENDIAN) || bytes.starts_with(TIFF_BIG_ENDIAN) {
        return Some("TIFF image data");
    }

    // WebP is a RIFF container whose form type, at offset 8, is "WEBP".
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    is_webp.then_some("WebP image data")
}
947
/// Report whether the content begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    const PDF_MAGIC: &[u8] = b"%PDF-";
    bytes.get(..PDF_MAGIC.len()) == Some(PDF_MAGIC)
}
951
/// Report whether the file should be handled as RTF: either the (already lowercased)
/// extension is `rtf`, or the content begins with `{\rtf`.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    matches!(ext, Some("rtf")) || bytes.starts_with(b"{\\rtf")
}
955
/// Extract an approximation of the visible text from an RTF document.
///
/// The input is decoded lossily as UTF-8 and scanned character by character:
///
/// * group braces are dropped;
/// * `\\`, `\{` and `\}` emit the escaped character;
/// * `\'hh` emits the byte `hh` as the code point with the same value;
/// * a handful of control words are translated (`\par`/`\line` to a newline,
///   `\tab`, dashes, bullet and quote words, `\uN` to the Unicode scalar),
///   and every other control word is discarded together with its numeric
///   parameter and one delimiting space.
///
/// After `\uN` the single fallback representation that follows is skipped.
/// The fallback may be a literal character or a `\'hh` escape (the form
/// word processors commonly emit, e.g. `\u8212\'97`); both are consumed so
/// the fallback never leaks into the output next to the real character.
///
/// Finally `\r` and form feeds become newlines and trailing whitespace is
/// trimmed from every line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;

    while index < chars.len() {
        match chars[index] {
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                let Some(&escaped) = chars.get(index) else {
                    break;
                };

                match escaped {
                    '\\' | '{' | '}' => {
                        output.push(escaped);
                        index += 1;
                    }
                    '\'' => match rtf_hex_escape_byte(&chars, index) {
                        Some(value) => {
                            output.push(char::from(value));
                            index += 3;
                        }
                        // Malformed escape: drop the quote and keep scanning.
                        None => index += 1,
                    },
                    letter if letter.is_ascii_alphabetic() => {
                        let word_start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[word_start..index].iter().collect();

                        // Optional signed decimal parameter directly after the word.
                        let number_start = index;
                        if index < chars.len()
                            && (chars[index] == '-' || chars[index].is_ascii_digit())
                        {
                            index += 1;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space terminating the control word is not text.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                if let Ok(codepoint) = parameter.parse::<i32>() {
                                    // Negative parameters encode values above
                                    // 32767 as signed 16-bit numbers.
                                    let normalized = if codepoint < 0 {
                                        codepoint + 65_536
                                    } else {
                                        codepoint
                                    };
                                    if let Some(ch) =
                                        u32::try_from(normalized).ok().and_then(char::from_u32)
                                    {
                                        output.push(ch);
                                    }
                                }

                                let fallback_is_hex_escape = chars.get(index) == Some(&'\\')
                                    && chars.get(index + 1) == Some(&'\'')
                                    && rtf_hex_escape_byte(&chars, index + 1).is_some();
                                if fallback_is_hex_escape {
                                    // Skip the whole `\'hh` fallback.
                                    index += 4;
                                } else if index < chars.len()
                                    && !matches!(chars[index], '\\' | '{' | '}' | '\n' | '\r')
                                {
                                    // Skip a literal one-character fallback.
                                    index += 1;
                                }
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}

/// Parse the two hex digits that follow the `'` at `quote_index`.
///
/// Returns `None` when fewer than two characters follow the quote or they do
/// not parse as a base-16 `u8`.
fn rtf_hex_escape_byte(chars: &[char], quote_index: usize) -> Option<u8> {
    let first = *chars.get(quote_index + 1)?;
    let second = *chars.get(quote_index + 2)?;
    let hex: String = [first, second].iter().collect();
    u8::from_str_radix(&hex, 16).ok()
}
1062
/// Report whether `bytes` starts with the two-byte gzip magic `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1066
/// Report whether `bytes` starts with the `BZh` bzip2 header.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1070
/// Report whether `bytes` starts with the six-byte XZ magic `fd 37 7a 58 5a 00`.
fn looks_like_xz(bytes: &[u8]) -> bool {
    const XZ_MAGIC: [u8; 6] = [0xfd, b'7', b'z', b'X', b'Z', 0x00];

    bytes.get(..XZ_MAGIC.len()) == Some(&XZ_MAGIC[..])
}
1074
1075fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1076    lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1077}
1078
1079fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1080    lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1081}
1082
1083fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1084    lower_extension(path)
1085        .as_deref()
1086        .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1087        && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1088            || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1089}
1090
1091fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1092    if looks_like_deb(bytes, path) {
1093        "debian binary package (format 2.0)".to_string()
1094    } else if looks_like_rpm(bytes, path) {
1095        "RPM package".to_string()
1096    } else if looks_like_squashfs(bytes, path) {
1097        "Squashfs filesystem".to_string()
1098    } else if looks_like_gzip(bytes) {
1099        "gzip compressed data".to_string()
1100    } else if looks_like_bzip2(bytes) {
1101        "bzip2 compressed data".to_string()
1102    } else if looks_like_xz(bytes) {
1103        "XZ compressed data".to_string()
1104    } else if is_zip_archive(bytes) {
1105        "Zip archive data".to_string()
1106    } else if lower_extension(path).as_deref() == Some("gem") {
1107        "POSIX tar archive".to_string()
1108    } else if let Some(file_type) = format_based_file_type(detected_format) {
1109        file_type
1110    } else {
1111        "archive data".to_string()
1112    }
1113}
1114
1115fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1116    let suffix = text_executable_label(bytes);
1117
1118    match programming_language {
1119        Some("Python") => format!("python script, {suffix}"),
1120        Some("Ruby") => format!("ruby script, {suffix}"),
1121        Some("Perl") => format!("perl script, {suffix}"),
1122        Some("PHP") => format!("php script, {suffix}"),
1123        Some("Shell") => format!("shell script, {suffix}"),
1124        Some("JavaScript") => format!("javascript script, {suffix}"),
1125        Some("TypeScript") => format!("typescript script, {suffix}"),
1126        Some("PowerShell") => format!("powershell script, {suffix}"),
1127        Some("Awk") => format!("awk script, {suffix}"),
1128        _ => format!("script, {suffix}"),
1129    }
1130}
1131
1132fn text_file_type(bytes: &[u8]) -> String {
1133    text_label(bytes).to_string()
1134}
1135
/// Describe plain text by encoding and line structure.
///
/// The label says "UTF-8 Unicode text" when `bytes` is valid UTF-8 and
/// "text" otherwise; ", with no line terminators" is appended when the
/// content contains no `\n` byte.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1149
/// Describe executable text (scripts) by encoding and line structure.
///
/// Mirrors the plain-text labels with "executable" added: valid UTF-8 gets
/// the "UTF-8 Unicode" prefix, and content without any `\n` byte gets the
/// ", with no line terminators" suffix.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let single_line = !bytes.contains(&b'\n');

    match (std::str::from_utf8(bytes), single_line) {
        (Ok(_), false) => "UTF-8 Unicode text executable",
        (Ok(_), true) => "UTF-8 Unicode text executable, with no line terminators",
        (Err(_), false) => "text executable",
        (Err(_), true) => "text executable, with no line terminators",
    }
}
1163
1164fn supported_image_metadata_format(
1165    ext: Option<&str>,
1166    detected_format: FileFormat,
1167) -> Option<ImageFormat> {
1168    match ext {
1169        Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1170        Some("png") => Some(ImageFormat::Png),
1171        Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1172        Some("webp") => Some(ImageFormat::WebP),
1173        _ => match detected_format.media_type() {
1174            "image/jpeg" => Some(ImageFormat::Jpeg),
1175            "image/png" => Some(ImageFormat::Png),
1176            "image/tiff" => Some(ImageFormat::Tiff),
1177            "image/webp" => Some(ImageFormat::WebP),
1178            _ => None,
1179        },
1180    }
1181}
1182
1183fn should_skip_binary_string_extraction(
1184    path: &Path,
1185    bytes: &[u8],
1186    detected_format: FileFormat,
1187) -> bool {
1188    matches!(lower_extension(path).as_deref(), Some("pdf"))
1189        || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1190            .is_some()
1191        || (matches!(
1192            detected_format.kind(),
1193            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1194        ) && !is_textual_format(detected_format))
1195        || media_mime_from_content(bytes).is_some()
1196        || is_zip_archive(bytes)
1197        || looks_like_gzip(bytes)
1198        || looks_like_bzip2(bytes)
1199        || looks_like_xz(bytes)
1200        || looks_like_deb(bytes, path)
1201        || looks_like_rpm(bytes, path)
1202        || looks_like_squashfs(bytes, path)
1203}
1204
1205fn should_skip_large_opaque_binary_text_extraction(
1206    _path: &Path,
1207    bytes: &[u8],
1208    detected_format: FileFormat,
1209) -> bool {
1210    is_large_opaque_binary_candidate(bytes, detected_format)
1211        && !sample_has_promising_printable_strings(bytes)
1212}
1213
1214fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1215    bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1216        && !is_textual_format(detected_format)
1217        && !matches!(
1218            detected_format.kind(),
1219            FileFormatKind::Archive
1220                | FileFormatKind::Compressed
1221                | FileFormatKind::Package
1222                | FileFormatKind::Audio
1223                | FileFormatKind::Image
1224                | FileFormatKind::Video
1225        )
1226}
1227
/// Compute the `(start, end)` byte windows sampled from a buffer of `len` bytes.
///
/// Up to three windows of at most 64 KiB are returned, in this order: the
/// head of the buffer, a window centred on the midpoint (only when `len`
/// exceeds two windows), and the tail (only when `len` exceeds one window).
/// Empty and duplicate windows are omitted.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = Some((0, len.min(SAMPLE_WINDOW_BYTES)));
    let middle = (len > SAMPLE_WINDOW_BYTES * 2).then(|| {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        (start, (start + SAMPLE_WINDOW_BYTES).min(len))
    });
    let tail = (len > SAMPLE_WINDOW_BYTES).then(|| (len - SAMPLE_WINDOW_BYTES, len));

    let candidates = [head, middle, tail];
    let mut windows: Vec<(usize, usize)> = Vec::with_capacity(candidates.len());
    for &(start, end) in candidates.iter().flatten() {
        if start < end && !windows.contains(&(start, end)) {
            windows.push((start, end));
        }
    }

    windows
}
1250
1251fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1252    let mut structured_signal_seen = false;
1253    let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1254        .into_iter()
1255        .filter(|&(start, end)| {
1256            let window = &bytes[start..end];
1257            if has_strong_structured_text_signal(window) {
1258                structured_signal_seen = true;
1259            }
1260            has_license_or_notice_signal(window)
1261        })
1262        .count();
1263
1264    structured_signal_seen || promising_license_windows >= 2
1265}
1266
1267fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1268    let mut combined_lines = BTreeSet::new();
1269
1270    for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1271        let window_text = extract_printable_strings(&bytes[start..end]);
1272        for line in window_text
1273            .lines()
1274            .map(str::trim)
1275            .filter(|line| !line.is_empty())
1276        {
1277            combined_lines.insert(line.to_string());
1278        }
1279    }
1280
1281    combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1282}
1283
1284fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1285    let strings = extract_printable_strings(bytes);
1286    if strings.is_empty() {
1287        return false;
1288    }
1289
1290    let lower = strings.to_ascii_lowercase();
1291    [
1292        "copyright",
1293        "license",
1294        "licensed under",
1295        "all rights reserved",
1296        "permission is hereby granted",
1297        "redistribution and use",
1298        "spdx-license-identifier",
1299    ]
1300    .iter()
1301    .any(|marker| lower.contains(marker))
1302}
1303
1304fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1305    let strings = extract_printable_strings(bytes);
1306    if strings.is_empty() {
1307        return false;
1308    }
1309
1310    let email_markers = strings.matches('@').count();
1311    let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1312
1313    email_markers + url_markers >= 3
1314}
1315
1316fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1317    match format {
1318        ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1319        ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1320        ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1321        ImageFormat::WebP => {
1322            bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1323        }
1324        _ => false,
1325    }
1326}
1327
1328fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1329    let mut values = Vec::new();
1330    values.extend(extract_exif_metadata_values(bytes));
1331    values.extend(extract_xmp_metadata_values(bytes, format));
1332    values_to_text(values)
1333}
1334
1335fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1336    let mut cursor = BufReader::new(Cursor::new(bytes));
1337    let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1338        Ok(exif) => exif,
1339        Err(_) => return Vec::new(),
1340    };
1341
1342    let mut values = Vec::new();
1343    for field in exif.fields() {
1344        let rendered = match field.tag {
1345            exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
1346                Some(field.display_value().with_unit(&exif).to_string())
1347            }
1348            exif::Tag::Artist => Some(format!(
1349                "Author: {}",
1350                field.display_value().with_unit(&exif)
1351            )),
1352            _ => None,
1353        };
1354
1355        if let Some(rendered) = rendered {
1356            values.push(rendered);
1357        }
1358    }
1359
1360    values
1361}
1362
1363fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1364    let xmp = match extract_raw_xmp_packet(bytes, format) {
1365        Some(xmp) => xmp,
1366        None => return Vec::new(),
1367    };
1368
1369    parse_xmp_values(&xmp)
1370}
1371
1372fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1373    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1374    if let Ok(mut decoder) = reader.into_decoder()
1375        && let Ok(Some(xmp)) = decoder.xmp_metadata()
1376    {
1377        return Some(xmp);
1378    }
1379
1380    match format {
1381        ImageFormat::Png => extract_png_xmp_packet(bytes),
1382        _ => None,
1383    }
1384}
1385
1386fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1387    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1388
1389    if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1390        return None;
1391    }
1392
1393    let mut offset = PNG_SIGNATURE.len();
1394    while offset + 12 <= bytes.len() {
1395        let length = u32::from_be_bytes([
1396            bytes[offset],
1397            bytes[offset + 1],
1398            bytes[offset + 2],
1399            bytes[offset + 3],
1400        ]) as usize;
1401        let chunk_start = offset + 8;
1402        let chunk_end = chunk_start + length;
1403        if chunk_end + 4 > bytes.len() {
1404            return None;
1405        }
1406
1407        let chunk_type = &bytes[offset + 4..offset + 8];
1408        if chunk_type == b"iTXt" {
1409            let data = &bytes[chunk_start..chunk_end];
1410            if let Some(xmp) = parse_png_itxt_xmp(data) {
1411                return Some(xmp);
1412            }
1413        }
1414
1415        offset = chunk_end + 4;
1416    }
1417
1418    None
1419}
1420
1421fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1422    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1423
1424    let keyword_end = data.iter().position(|&b| b == 0)?;
1425    if &data[..keyword_end] != XMP_KEYWORD {
1426        return None;
1427    }
1428
1429    let mut cursor = keyword_end + 1;
1430    let compression_flag = *data.get(cursor)?;
1431    cursor += 1;
1432    let compression_method = *data.get(cursor)?;
1433    cursor += 1;
1434    if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1435        return None;
1436    }
1437
1438    let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1439    cursor = language_end + 1;
1440
1441    let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1442    cursor = translated_end + 1;
1443
1444    let text_bytes = &data[cursor..];
1445    if compression_flag == 1 {
1446        let mut decoder = ZlibDecoder::new(text_bytes);
1447        let mut decoded = Vec::new();
1448        decoder.read_to_end(&mut decoded).ok()?;
1449        Some(decoded)
1450    } else {
1451        Some(text_bytes.to_vec())
1452    }
1453}
1454
1455fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1456    let mut reader = XmlReader::from_reader(xmp);
1457    reader.config_mut().trim_text(true);
1458
1459    let mut buf = Vec::new();
1460    let mut stack: Vec<String> = Vec::new();
1461    let mut values = Vec::new();
1462
1463    loop {
1464        match reader.read_event_into(&mut buf) {
1465            Ok(Event::Start(e)) => {
1466                stack.push(local_xml_name(e.name().as_ref()));
1467            }
1468            Ok(Event::End(_)) => {
1469                stack.pop();
1470            }
1471            Ok(Event::Empty(_)) => {}
1472            Ok(Event::Text(text)) => {
1473                if let Some(field) = stack
1474                    .iter()
1475                    .rev()
1476                    .find_map(|name| allowed_xmp_field(name.as_str()))
1477                    && let Ok(decoded) = text.decode()
1478                {
1479                    let decoded = decoded.into_owned();
1480                    if !decoded.trim().is_empty() {
1481                        values.push(format_xmp_value(field, &decoded));
1482                    }
1483                }
1484            }
1485            Ok(Event::CData(text)) => {
1486                if let Some(field) = stack
1487                    .iter()
1488                    .rev()
1489                    .find_map(|name| allowed_xmp_field(name.as_str()))
1490                    && let Ok(decoded) = text.decode()
1491                {
1492                    let decoded = decoded.into_owned();
1493                    if !decoded.trim().is_empty() {
1494                        values.push(format_xmp_value(field, &decoded));
1495                    }
1496                }
1497            }
1498            Ok(Event::Eof) | Err(_) => break,
1499            _ => {}
1500        }
1501        buf.clear();
1502    }
1503
1504    values
1505}
1506
/// Return an XML name without its namespace prefix.
///
/// Everything up to and including the last `:` is removed. Names that are
/// not valid UTF-8 become the empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    let local = qualified
        .rsplit_once(':')
        .map_or(qualified, |(_prefix, local)| local);

    local.to_owned()
}
1511
/// Map an XMP element's local name to its internal field identifier.
///
/// Matching is exact and case-sensitive; names outside the allow-list
/// return `None`.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    const ALLOWED_FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    ALLOWED_FIELDS
        .iter()
        .find(|(xml_name, _)| *xml_name == name)
        .map(|&(_, field)| field)
}
1524
/// Render an XMP value for output.
///
/// Values of the `creator` field are prefixed with `"Author: "`; all other
/// fields are returned unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
1531
1532fn values_to_text(values: Vec<String>) -> String {
1533    let mut seen = BTreeSet::new();
1534    let mut lines = Vec::new();
1535    let mut total_bytes = 0usize;
1536
1537    for value in values {
1538        if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1539            break;
1540        }
1541
1542        let normalized = normalize_metadata_value(&value);
1543        if normalized.is_empty() || !seen.insert(normalized.clone()) {
1544            continue;
1545        }
1546
1547        let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1548        if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1549            break;
1550        }
1551
1552        total_bytes += added_bytes;
1553        lines.push(normalized);
1554    }
1555
1556    lines.join("\n")
1557}
1558
/// Normalise a metadata value to a single clean line.
///
/// NUL characters are removed first (without inserting a space), then every
/// run of whitespace collapses to one space and leading/trailing whitespace
/// is dropped.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");

    let mut collapsed = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }

    collapsed
}
1570
1571fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1572    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1573        return (String::new(), None);
1574    }
1575
1576    let mut failures = Vec::new();
1577    let mut saw_success = false;
1578
1579    let extracted = catch_unwind(AssertUnwindSafe(
1580        || -> Result<String, Box<dyn std::error::Error>> {
1581            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1582            extract_first_pdf_page_text(&mut document)
1583        },
1584    ));
1585    match extracted {
1586        Ok(Ok(text)) => {
1587            saw_success = true;
1588            if let Some(normalized) = normalize_pdf_text(text) {
1589                return (normalized, None);
1590            }
1591        }
1592        Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1593        Err(payload) => failures.push(format!(
1594            "from-bytes first-page panic: {}",
1595            panic_payload_to_string(payload.as_ref())
1596        )),
1597    }
1598
1599    let extracted = catch_unwind(AssertUnwindSafe(
1600        || -> Result<String, Box<dyn std::error::Error>> {
1601            let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1602            extract_pdf_text_from_document(&mut document)
1603        },
1604    ));
1605    match extracted {
1606        Ok(Ok(text)) => {
1607            saw_success = true;
1608            if let Some(normalized) = normalize_pdf_text(text) {
1609                return (normalized, None);
1610            }
1611        }
1612        Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
1613        Err(payload) => failures.push(format!(
1614            "open full-document panic: {}",
1615            panic_payload_to_string(payload.as_ref())
1616        )),
1617    }
1618
1619    let extracted = catch_unwind(AssertUnwindSafe(
1620        || -> Result<String, Box<dyn std::error::Error>> {
1621            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1622            extract_pdf_text_from_document(&mut document)
1623        },
1624    ));
1625    match extracted {
1626        Ok(Ok(text)) => {
1627            saw_success = true;
1628            if let Some(normalized) = normalize_pdf_text(text) {
1629                return (normalized, None);
1630            }
1631        }
1632        Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
1633        Err(payload) => failures.push(format!(
1634            "from-bytes full-document panic: {}",
1635            panic_payload_to_string(payload.as_ref())
1636        )),
1637    }
1638
1639    if saw_success || is_non_actionable_pdf_failure(&failures) {
1640        (String::new(), None)
1641    } else {
1642        (
1643            String::new(),
1644            Some(format!(
1645                "PDF text extraction failed after {} attempts: {}",
1646                failures.len(),
1647                failures.join("; ")
1648            )),
1649        )
1650    }
1651}
1652
/// Report whether every recorded PDF failure is one not worth surfacing.
///
/// Returns `true` only for a non-empty list in which each message contains
/// at least one of the known markers (password/encryption problems or an
/// invalid cross-reference table).
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(*marker))
    })
}
1663
/// Turn a caught panic payload into a printable message.
///
/// Payloads of type `&str` or `String` are returned as text; any other
/// payload type produces `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
1673
1674fn extract_first_pdf_page_text(
1675    document: &mut pdf_oxide::document::PdfDocument,
1676) -> Result<String, Box<dyn std::error::Error>> {
1677    if document.page_count()? == 0 {
1678        return Ok(String::new());
1679    }
1680
1681    let extracted_text = document.extract_text(0)?;
1682    let markdown_text =
1683        document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1684    if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1685        return Ok(extracted_text);
1686    }
1687
1688    let pipeline_text =
1689        document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1690
1691    Ok(merge_pdf_first_page_text(
1692        &extracted_text,
1693        &markdown_text,
1694        &pipeline_text,
1695    ))
1696}
1697
1698fn extract_pdf_text_from_document(
1699    document: &mut pdf_oxide::document::PdfDocument,
1700) -> Result<String, Box<dyn std::error::Error>> {
1701    Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1702}
1703
/// Normalise line breaks in extracted PDF text.
///
/// Carriage returns and form feeds are each replaced by a newline. Returns
/// `None` when the result is empty or whitespace only.
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized: String = text
        .chars()
        .map(|ch| if matches!(ch, '\r' | '\u{0c}') { '\n' } else { ch })
        .collect();

    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
1708
1709fn merge_pdf_first_page_text(
1710    _extracted_text: &str,
1711    markdown_text: &str,
1712    pipeline_text: &str,
1713) -> String {
1714    let pipeline = pipeline_text.trim();
1715    if pipeline.is_empty() {
1716        return String::new();
1717    }
1718
1719    let prefix = pdf_first_page_heading_prefix(markdown_text);
1720    let Some(prefix) = prefix else {
1721        return pipeline_text.to_string();
1722    };
1723
1724    if pdf_text_contains_heading_prefix(pipeline, &prefix) {
1725        pipeline_text.to_string()
1726    } else {
1727        format!("{prefix}\n\n{pipeline}")
1728    }
1729}
1730
1731fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
1732    normalize_pdf_heading_comparison_text(text)
1733        .contains(&normalize_pdf_heading_comparison_text(prefix))
1734}
1735
/// Normalise text for heading comparison.
///
/// Whitespace runs collapse to single spaces, leading and trailing
/// whitespace is removed, and ASCII letters are lower-cased; non-ASCII
/// characters are left as they are.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized.make_ascii_lowercase();

    normalized
}
1742
1743fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1744    let mut lines = Vec::new();
1745
1746    for line in pdf_markdown_heading_lines(markdown_text) {
1747        push_unique_line(&mut lines, line);
1748    }
1749
1750    (!lines.is_empty()).then(|| lines.join("\n"))
1751}
1752
1753fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1754    text.lines()
1755        .map(str::trim)
1756        .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1757        .map(|line| line.trim_matches('#').trim())
1758        .filter(|line| !line.is_empty())
1759        .filter(|line| !looks_like_numbered_section_heading(line))
1760        .take(4)
1761        .map(ToOwned::to_owned)
1762        .collect()
1763}
1764
/// Append `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
1770
/// Report whether a heading starts like a numbered section ("1.", "10.", "2.3").
///
/// The line must begin with one or more ASCII digits immediately followed
/// by a `.`. Section numbers with several digits are recognised, so
/// "10. Definitions" is treated the same way as "1. Definitions".
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());
    let has_leading_digits = after_digits.len() < line.len();

    has_leading_digits && after_digits.starts_with('.')
}
1783
/// Report whether `bytes` starts with a ZIP record signature.
///
/// Accepts `PK\x03\x04`, `PK\x05\x06` and `PK\x07\x08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
1789
/// Extract runs of printable ASCII from binary data, one run per line.
///
/// Two kinds of runs of at least four characters are collected:
///
/// 1. plain single-byte runs of printable ASCII (`0x20..=0x7E`);
/// 2. 16-bit runs in which every printable ASCII byte is followed by a zero
///    byte. These are scanned at both byte alignments, so strings starting
///    at even and at odd offsets are found, and a given string is reported
///    once.
///
/// Output is capped at the input length clamped to 2,000,000..=16,000,000
/// bytes. The cap is checked after each run is flushed, so the result can
/// exceed it by at most one run.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    /// Append `run` to `out` as its own line when it is long enough, then reset it.
    fn flush_run(out: &mut String, run: &mut Vec<u8>) {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            // `run` only ever holds printable ASCII, so each byte is one char.
            out.extend(run.iter().copied().map(char::from));
        }
        run.clear();
    }

    let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    // Pass 1: single-byte strings.
    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= max_output_bytes {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= max_output_bytes {
        return out;
    }

    // Pass 2: 16-bit strings, scanned once per byte alignment. The pair is
    // always read as (character, zero); shifting the start by one byte is
    // what covers the other parity. Swapping the roles as well would cancel
    // the shift and rescan the same byte positions.
    for alignment in 0..2 {
        run.clear();
        let aligned = bytes.get(alignment..).unwrap_or_default();
        for pair in aligned.chunks_exact(2) {
            let (ch, high) = (pair[0], pair[1]);
            if is_printable_ascii(ch) && high == 0 {
                run.push(ch);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= max_output_bytes {
                    return out;
                }
            }
        }
        flush_run(&mut out, &mut run);
        if out.len() >= max_output_bytes {
            return out;
        }
    }

    out
}
1854
1855#[cfg(test)]
1856mod tests {
1857    use std::path::Path;
1858
1859    use super::{
1860        ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, classify_file_info,
1861        extract_printable_strings, extract_text_for_detection,
1862        extract_text_for_detection_with_diagnostics, is_non_actionable_pdf_failure,
1863        normalize_mime_type, normalize_pdf_heading_comparison_text,
1864        windows_metadata_or_empty_result,
1865    };
1866
1867    #[test]
1868    fn test_extract_text_for_detection_skips_jar_archives() {
1869        let path = Path::new(
1870            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
1871        );
1872        let bytes = std::fs::read(path).expect("failed to read jar fixture");
1873
1874        let (text, kind) = extract_text_for_detection(path, &bytes);
1875
1876        assert!(text.is_empty());
1877        assert_eq!(kind, ExtractedTextKind::None);
1878    }
1879
1880    #[test]
1881    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
1882        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
1883        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1884
1885        let (text, kind) = extract_text_for_detection(path, &bytes);
1886
1887        assert_eq!(kind, ExtractedTextKind::Pdf);
1888        assert!(text.contains("Redistribution and use in source and binary forms"));
1889    }
1890
1891    #[test]
1892    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
1893        let path =
1894            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
1895        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1896
1897        let (text, kind) = extract_text_for_detection(path, &bytes);
1898
1899        assert_eq!(kind, ExtractedTextKind::Pdf);
1900        assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
1901        assert!(!text.contains("DISCLAIMER OF WARRANTY"));
1902    }
1903
1904    #[test]
1905    fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
1906        let path =
1907            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
1908        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1909
1910        let (text, kind) = extract_text_for_detection(path, &bytes);
1911
1912        assert_eq!(kind, ExtractedTextKind::Pdf);
1913
1914        let normalized = normalize_pdf_heading_comparison_text(&text);
1915        let heading =
1916            normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
1917        assert_eq!(normalized.matches(&heading).count(), 1);
1918    }
1919
1920    #[test]
1921    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
1922        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
1923        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1924
1925        let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
1926
1927        assert_eq!(kind, ExtractedTextKind::Pdf);
1928        assert!(text.contains("Redistribution and use in source and binary forms"));
1929    }
1930
1931    #[test]
1932    fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
1933        let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";
1934
1935        let (text, kind, scan_error) =
1936            extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);
1937
1938        assert!(text.is_empty());
1939        assert_eq!(kind, ExtractedTextKind::None);
1940        let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
1941        assert!(scan_error.contains("PDF text extraction failed after"));
1942    }
1943
1944    #[test]
1945    fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
1946        let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
1947
1948        let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
1949
1950        assert!(text.is_empty());
1951        assert_eq!(kind, ExtractedTextKind::None);
1952    }
1953
1954    #[test]
1955    fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
1956        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
1957        let text = b"Copyright 2026 Example Project!!!";
1958        bytes[..text.len()].copy_from_slice(text);
1959        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
1960        bytes[second_offset..second_offset + text.len()].copy_from_slice(text);
1961
1962        let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);
1963
1964        assert_ne!(kind, ExtractedTextKind::None);
1965        assert!(text.contains("Copyright 2026 Example Project"));
1966    }
1967
1968    #[test]
1969    fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
1970        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
1971        let noise = b"(c) $1234567890ABCDEF[]{}--==++";
1972        bytes[..noise.len()].copy_from_slice(noise);
1973        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
1974        bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);
1975
1976        let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);
1977
1978        assert!(text.is_empty());
1979        assert_eq!(kind, ExtractedTextKind::None);
1980    }
1981
1982    #[test]
1983    fn test_extract_text_for_detection_uses_windows_executable_metadata() {
1984        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
1985        let bytes = std::fs::read(path).expect("read PE fixture");
1986
1987        let (text, kind) = extract_text_for_detection(path, &bytes);
1988
1989        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
1990        assert!(text.contains("License: This program is free software"));
1991        assert!(text.contains("LegalCopyright:"));
1992    }
1993
1994    #[test]
1995    fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
1996    {
1997        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
1998        let mut bytes = std::fs::read(path).expect("read PE fixture");
1999        bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);
2000
2001        let (text, kind) = extract_text_for_detection(path, &bytes);
2002
2003        assert_ne!(kind, ExtractedTextKind::None);
2004        assert!(!text.trim().is_empty());
2005    }
2006
2007    #[test]
2008    fn test_windows_metadata_or_empty_result_preserves_metadata() {
2009        let (text, kind, scan_error) =
2010            windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));
2011
2012        assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
2013        assert_eq!(text, "LegalCopyright: Example Corp");
2014        assert!(scan_error.is_none());
2015    }
2016
2017    #[test]
2018    fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
2019        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2020        let text = b"Copyright 2026 Example Project!!!";
2021        bytes[..text.len()].copy_from_slice(text);
2022
2023        let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);
2024
2025        assert!(text.is_empty());
2026        assert_eq!(kind, ExtractedTextKind::None);
2027    }
2028
2029    #[test]
2030    fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
2031        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2032        let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
2033        bytes[..text.len()].copy_from_slice(text);
2034
2035        let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);
2036
2037        assert_ne!(kind, ExtractedTextKind::None);
2038        assert!(text.contains("asn@redhat.com"));
2039        assert!(text.contains("https://publicsuffix.org/"));
2040    }
2041
2042    #[test]
2043    fn test_non_actionable_pdf_failures_are_suppressed() {
2044        assert!(is_non_actionable_pdf_failure(&[
2045            "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
2046            "open full-document: PDF is encrypted and requires a password".to_string(),
2047        ]));
2048        assert!(is_non_actionable_pdf_failure(&[
2049            "from-bytes first-page: Invalid cross-reference table".to_string(),
2050            "open full-document: Invalid cross-reference table".to_string(),
2051        ]));
2052        assert!(is_non_actionable_pdf_failure(&[
2053            "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O".to_string(),
2054            "open full-document: Invalid PDF: security handler cannot be found".to_string(),
2055        ]));
2056        assert!(!is_non_actionable_pdf_failure(&[
2057            "from-bytes first-page: some other parser failure".to_string(),
2058        ]));
2059    }
2060
2061    #[test]
2062    fn test_extract_text_for_detection_skips_zip_like_archives() {
2063        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";
2064
2065        let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
2066        let (crate_text, crate_kind) =
2067            extract_text_for_detection(Path::new("demo.crate"), zip_bytes);
2068
2069        assert!(whl_text.is_empty());
2070        assert_eq!(whl_kind, ExtractedTextKind::None);
2071        assert!(crate_text.is_empty());
2072        assert_eq!(crate_kind, ExtractedTextKind::None);
2073    }
2074
2075    #[test]
2076    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
2077        let path =
2078            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
2079        let bytes = std::fs::read(path).expect("failed to read lib fixture");
2080
2081        let (text, kind) = extract_text_for_detection(path, &bytes);
2082
2083        assert_ne!(kind, ExtractedTextKind::None);
2084        assert!(text.contains("Copyright nexB and others (c) 2012"));
2085    }
2086
2087    #[test]
2088    fn test_extract_text_for_detection_reads_font_metadata() {
2089        let path = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
2090        let bytes = std::fs::read(path).expect("failed to read font fixture");
2091
2092        let (text, kind) = extract_text_for_detection(path, &bytes);
2093
2094        assert_eq!(kind, ExtractedTextKind::FontMetadata);
2095        assert!(text.contains("License Description:"), "{text}");
2096        assert!(
2097            text.contains("Open Font License") || text.contains("OFL"),
2098            "{text}"
2099        );
2100    }
2101
2102    #[test]
2103    fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
2104        let bytes = b"abcd\0".repeat(525_000);
2105
2106        let text = extract_printable_strings(&bytes);
2107
2108        assert!(
2109            text.len() > 2_000_000,
2110            "unexpected truncation at {}",
2111            text.len()
2112        );
2113        assert!(text.ends_with("abcd"));
2114    }
2115
2116    #[test]
2117    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
2118        let path = Path::new(
2119            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
2120        );
2121        let bytes = std::fs::read(path).expect("failed to read svg fixture");
2122
2123        let (text, kind) = extract_text_for_detection(path, &bytes);
2124
2125        assert_eq!(kind, ExtractedTextKind::Decoded);
2126        assert!(text.contains("creativecommons.org/licenses/publicdomain"));
2127    }
2128
2129    #[test]
2130    fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
2131        let path = Path::new(
2132            "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
2133        );
2134        let bytes = std::fs::read(path).expect("failed to read rtf fixture");
2135
2136        let (text, kind) = extract_text_for_detection(path, &bytes);
2137
2138        assert_eq!(kind, ExtractedTextKind::Decoded);
2139        assert!(text.contains("GNU Lesser General Public"));
2140        assert!(text.contains("version"));
2141        assert!(text.contains("2.1 of the License"));
2142    }
2143
2144    #[test]
2145    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
2146        assert_eq!(
2147            normalize_mime_type(
2148                Path::new("main.ts"),
2149                b"export const answer = 42;\n",
2150                Some("TypeScript"),
2151                "video/mp2t",
2152            ),
2153            "text/plain"
2154        );
2155    }
2156
2157    #[test]
2158    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
2159        assert_eq!(
2160            normalize_mime_type(
2161                Path::new("main.js"),
2162                b"console.log('hello');\n",
2163                Some("JavaScript"),
2164                "application/octet-stream",
2165            ),
2166            "text/plain"
2167        );
2168    }
2169
2170    #[test]
2171    fn test_normalize_mime_type_preserves_binary_video_guess() {
2172        assert_eq!(
2173            normalize_mime_type(
2174                Path::new("main.ts"),
2175                &[0, 159, 146, 150, 0, 1, 2, 3],
2176                Some("TypeScript"),
2177                "video/mp2t",
2178            ),
2179            "video/mp2t"
2180        );
2181    }
2182
2183    #[test]
2184    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
2185        assert_eq!(
2186            normalize_mime_type(
2187                Path::new("main.ts"),
2188                &[0, 159, 146, 150],
2189                Some("TypeScript"),
2190                "application/octet-stream",
2191            ),
2192            "application/octet-stream"
2193        );
2194    }
2195
2196    #[test]
2197    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
2198        let classification = classify_file_info(Path::new("test.txt"), b"");
2199
2200        assert_eq!(classification.mime_type, "inode/x-empty");
2201        assert_eq!(classification.file_type, "empty");
2202        assert!(!classification.is_binary);
2203        assert!(classification.is_text);
2204        assert!(!classification.is_source);
2205        assert_eq!(classification.programming_language, None);
2206    }
2207
2208    #[test]
2209    fn test_classify_file_info_keeps_json_out_of_programming_language() {
2210        let classification = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);
2211
2212        assert_eq!(classification.mime_type, "application/json");
2213        assert_eq!(classification.file_type, "JSON text data");
2214        assert!(classification.is_text);
2215        assert!(!classification.is_source);
2216        assert_eq!(classification.programming_language, None);
2217    }
2218
2219    #[test]
2220    fn test_classify_file_info_treats_dockerfile_as_source() {
2221        let classification = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");
2222
2223        assert_eq!(
2224            classification.programming_language.as_deref(),
2225            Some("Dockerfile")
2226        );
2227        assert!(classification.is_source);
2228        assert!(!classification.is_script);
2229        assert_eq!(classification.file_type, "UTF-8 Unicode text");
2230    }
2231
2232    #[test]
2233    fn test_classify_file_info_treats_makefile_as_text_not_source() {
2234        let classification = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");
2235
2236        assert_eq!(classification.programming_language, None);
2237        assert!(classification.is_text);
2238        assert!(!classification.is_source);
2239        assert!(!classification.is_script);
2240        assert_eq!(classification.file_type, "UTF-8 Unicode text");
2241    }
2242
2243    #[test]
2244    fn test_classify_file_info_marks_supported_package_archives() {
2245        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";
2246
2247        let egg = classify_file_info(Path::new("demo.egg"), zip_bytes);
2248        let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_bytes);
2249
2250        assert!(egg.is_archive);
2251        assert_eq!(egg.mime_type, "application/zip");
2252        assert_eq!(egg.file_type, "Zip archive data");
2253        assert!(nupkg.is_archive);
2254        assert_eq!(nupkg.mime_type, "application/zip");
2255        assert_eq!(nupkg.file_type, "Zip archive data");
2256    }
2257
2258    #[test]
2259    fn test_classify_file_info_marks_png_as_binary_media() {
2260        let png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";
2261
2262        let classification = classify_file_info(Path::new("logo.png"), png_bytes);
2263
2264        assert_eq!(classification.mime_type, "image/png");
2265        assert_eq!(classification.file_type, "PNG image data");
2266        assert!(classification.is_binary);
2267        assert!(!classification.is_text);
2268        assert!(classification.is_media);
2269        assert!(!classification.is_archive);
2270        assert!(!classification.is_source);
2271    }
2272
2273    #[test]
2274    fn test_classify_file_info_marks_pdf_as_binary_document() {
2275        let pdf_bytes = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";
2276
2277        let classification = classify_file_info(Path::new("report.pdf"), pdf_bytes);
2278
2279        assert_eq!(classification.mime_type, "application/pdf");
2280        assert_eq!(classification.file_type, "PDF document");
2281        assert!(classification.is_binary);
2282        assert!(!classification.is_text);
2283        assert!(!classification.is_archive);
2284        assert!(!classification.is_media);
2285    }
2286
2287    #[test]
2288    fn test_classify_file_info_marks_binary_blobs_as_binary() {
2289        let classification =
2290            classify_file_info(Path::new("blob.bin"), &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5]);
2291
2292        assert!(classification.is_binary);
2293        assert!(!classification.is_text);
2294        assert!(!classification.is_source);
2295        assert_eq!(classification.programming_language, None);
2296    }
2297
2298    #[test]
2299    fn test_classify_file_info_treats_yaml_as_text_not_source() {
2300        let classification = classify_file_info(Path::new("config.yaml"), b"key: value\n");
2301
2302        assert_eq!(classification.programming_language, None);
2303        assert!(classification.is_text);
2304        assert!(!classification.is_source);
2305        assert_eq!(classification.file_type, "YAML text data");
2306    }
2307
2308    #[test]
2309    fn test_classify_file_info_classifies_common_build_manifests() {
2310        let gradle = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
2311        let flake = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
2312        let gitmodules = classify_file_info(
2313            Path::new(".gitmodules"),
2314            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
2315        );
2316
2317        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
2318        assert!(gradle.is_source);
2319        assert_eq!(gradle.mime_type, "text/plain");
2320
2321        assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
2322        assert!(flake.is_source);
2323        assert_eq!(flake.mime_type, "text/plain");
2324
2325        assert_eq!(gitmodules.programming_language, None);
2326        assert!(gitmodules.is_text);
2327        assert!(!gitmodules.is_source);
2328        assert_eq!(gitmodules.file_type, "Git configuration text");
2329    }
2330
2331    #[test]
2332    fn test_classify_file_info_labels_javascript_shebang_scripts() {
2333        let classification = classify_file_info(
2334            Path::new("bin/run"),
2335            b"#!/usr/bin/env node\nconsole.log('hello');\n",
2336        );
2337
2338        assert_eq!(
2339            classification.programming_language.as_deref(),
2340            Some("JavaScript")
2341        );
2342        assert!(classification.is_script);
2343        assert_eq!(
2344            classification.file_type,
2345            "javascript script, UTF-8 Unicode text executable"
2346        );
2347    }
2348
2349    #[test]
2350    fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
2351        let classification = classify_file_info(
2352            Path::new("script.py"),
2353            b"# coding: latin-1\nprint(\"caf\xe9\")\n",
2354        );
2355
2356        assert_eq!(
2357            classification.programming_language.as_deref(),
2358            Some("Python")
2359        );
2360        assert!(classification.is_script);
2361        assert_eq!(classification.file_type, "python script, text executable");
2362    }
2363
2364    #[test]
2365    fn test_classify_file_info_treats_textual_tga_as_media() {
2366        let classification = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");
2367
2368        assert!(classification.is_media);
2369        assert!(classification.is_text);
2370        assert!(!classification.is_binary);
2371    }
2372
2373    #[test]
2374    fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
2375        let classification =
2376            classify_file_info(Path::new("main.ts"), &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85]);
2377
2378        assert!(classification.is_binary);
2379        assert!(!classification.is_text);
2380        assert!(!classification.is_source);
2381        assert_eq!(classification.programming_language, None);
2382    }
2383
2384    #[test]
2385    fn test_extract_text_for_detection_skips_unsupported_image_formats() {
2386        let gif_bytes = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";
2387
2388        let (text, kind) = extract_text_for_detection(Path::new("tiny.gif"), gif_bytes);
2389
2390        assert!(text.is_empty());
2391        assert_eq!(kind, ExtractedTextKind::None);
2392    }
2393
2394    #[test]
2395    fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
2396        let cases = [
2397            (
2398                Path::new("bin/run"),
2399                b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
2400                Some("JavaScript"),
2401                true,
2402                true,
2403            ),
2404            (
2405                Path::new("Dockerfile"),
2406                b"FROM scratch\n".as_slice(),
2407                Some("Dockerfile"),
2408                true,
2409                false,
2410            ),
2411            (
2412                Path::new("package.json"),
2413                br#"{"name":"demo"}"#.as_slice(),
2414                None,
2415                false,
2416                false,
2417            ),
2418            (
2419                Path::new("config.yaml"),
2420                b"key: value\n".as_slice(),
2421                None,
2422                false,
2423                false,
2424            ),
2425            (
2426                Path::new("Makefile"),
2427                b"all:\n\techo hi\n".as_slice(),
2428                None,
2429                false,
2430                false,
2431            ),
2432        ];
2433
2434        for (path, bytes, expected_language, expected_is_source, expected_is_script) in cases {
2435            let classification = classify_file_info(path, bytes);
2436
2437            assert_eq!(
2438                classification.programming_language.as_deref(),
2439                expected_language,
2440                "unexpected language for {}",
2441                path.display()
2442            );
2443            assert_eq!(
2444                classification.is_source,
2445                expected_is_source,
2446                "unexpected is_source for {}",
2447                path.display()
2448            );
2449            assert_eq!(
2450                classification.is_script,
2451                expected_is_script,
2452                "unexpected is_script for {}",
2453                path.display()
2454            );
2455        }
2456    }
2457}