provenant/utils/
file.rs

1use std::borrow::Cow;
2use std::collections::BTreeSet;
3use std::fs;
4use std::io::{BufReader, Cursor, Read};
5use std::panic::{AssertUnwindSafe, catch_unwind};
6use std::path::Path;
7
8use chrono::{TimeZone, Utc};
9use file_format::{FileFormat, Kind as FileFormatKind};
10use flate2::read::ZlibDecoder;
11use glob::Pattern;
12use image::{ImageDecoder, ImageFormat, ImageReader};
13use mime_guess::from_path;
14use quick_xml::events::Event;
15use quick_xml::reader::Reader as XmlReader;
16
17use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
18use crate::utils::font::extract_font_metadata_text;
19use crate::utils::language::detect_language;
20
/// Identifies which extraction strategy produced the text handed to detection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No usable text could be extracted.
    None,
    /// Text obtained by decoding the file content (also reported for RTF input).
    Decoded,
    /// Text returned by the font metadata extractor.
    FontMetadata,
    /// Text extracted from a PDF document.
    Pdf,
    /// Printable strings recovered from bytes that could not be decoded as text.
    BinaryStrings,
    /// Text taken from image metadata.
    ImageMetadata,
    /// Only Windows executable metadata text was available.
    WindowsExecutableMetadata,
}
31
/// Classification facts derived for a single file by `classify_file_info`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type chosen from the content sniffing and the path-based guess.
    pub mime_type: String,
    /// Human-readable description of the file type.
    pub file_type: String,
    /// Detected language; only populated when the file is classified as source.
    pub programming_language: Option<String>,
    /// Whether the content is treated as binary.
    pub is_binary: bool,
    /// Whether the content is treated as text (the negation of `is_binary`).
    pub is_text: bool,
    /// Whether the file looks like an archive, compressed file, or package.
    pub is_archive: bool,
    /// Whether the file looks like image, audio, or video content.
    pub is_media: bool,
    /// Whether the file is considered source code.
    pub is_source: bool,
    /// Whether the file is considered a script.
    pub is_script: bool,
}
44
// NOTE(review): the two image-metadata limits and the large-binary threshold are
// consumed outside the portion of the file reviewed here; confirm their exact
// semantics against their users before changing them.

/// Upper bound on the number of image metadata values (presumably per image).
const MAX_IMAGE_METADATA_VALUES: usize = 64;
/// Upper bound, in bytes, on image metadata text (32 KiB).
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
/// Content counts as binary when control bytes outnumber `len / this divisor`.
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
/// Size threshold (512 KiB) related to skipping large opaque binaries.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
/// Lower-case extensions of plain-text files, which are never reported as source.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
/// Lower-case extensions that force a binary classification unless the detected
/// format is textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
/// Lower-case extensions that mark a non-text file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
59
60/// Get the last modified date of a file as a `YYYY-MM-DD` string.
61pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
62    metadata.modified().ok().map(|time: std::time::SystemTime| {
63        let seconds_since_epoch = time
64            .duration_since(std::time::UNIX_EPOCH)
65            .unwrap()
66            .as_secs() as i64;
67
68        Utc.timestamp_opt(seconds_since_epoch, 0)
69            .single()
70            .unwrap_or_else(Utc::now)
71            .format("%Y-%m-%d")
72            .to_string()
73    })
74}
75
76/// Check if a path should be excluded based on a list of glob patterns.
77pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
78    let path_str = path.to_string_lossy();
79    let file_name = path
80        .file_name()
81        .map(|name| name.to_string_lossy())
82        .unwrap_or_default();
83
84    for pattern in exclude_patterns {
85        // Match against full path
86        if pattern.matches(&path_str) {
87            return true;
88        }
89
90        // Match against just the file/directory name
91        if pattern.matches(&file_name) {
92            return true;
93        }
94    }
95
96    false
97}
98
99/// Decode a byte buffer to a String, trying UTF-8 first, then Latin-1.
100///
101/// Latin-1 (ISO-8859-1) maps bytes 0x00-0xFF directly to Unicode U+0000-U+00FF,
102/// so it can decode any byte sequence. This matches Python ScanCode's use of
103/// `UnicodeDammit` which auto-detects encoding with Latin-1 as fallback.
104pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
105    match String::from_utf8(bytes.to_vec()) {
106        Ok(s) => s,
107        Err(e) => {
108            let bytes = e.into_bytes();
109            if has_binary_control_chars(&bytes) {
110                return String::new();
111            }
112            bytes.iter().map(|&b| b as char).collect()
113        }
114    }
115}
116
117pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
118    let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
119    (text, kind)
120}
121
122pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
123    let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
124        return Cow::Borrowed(text);
125    };
126    if !matches!(
127        extension.to_ascii_lowercase().as_str(),
128        "md" | "markdown" | "html" | "htm"
129    ) {
130        return Cow::Borrowed(text);
131    }
132
133    let mut hints = Vec::new();
134    if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
135        hints.push("Creative Commons Attribution 4.0 International License".to_string());
136    }
137    if text.contains("Apache License (Version 2.0)") || text.contains("Apache License, Version 2.0")
138    {
139        hints.push(
140            "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
141                .to_string(),
142        );
143    }
144
145    hints.extend(extract_shields_license_badge_hints(text));
146
147    if hints.is_empty() {
148        Cow::Borrowed(text)
149    } else {
150        let mut augmented =
151            String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
152        augmented.push_str(text);
153        augmented.push_str("\n\n");
154        for (index, hint) in hints.into_iter().enumerate() {
155            if index > 0 {
156                augmented.push('\n');
157            }
158            augmented.push_str(&hint);
159        }
160        Cow::Owned(augmented)
161    }
162}
163
164fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
165    let mut hints = Vec::new();
166    let mut rest = text;
167    let needle = "img.shields.io/badge/license-";
168
169    while let Some(index) = rest.find(needle) {
170        let start = index + needle.len();
171        let suffix = &rest[start..];
172        let end = suffix
173            .find([')', ']', '"', '\'', ' ', '\n'])
174            .unwrap_or(suffix.len());
175        let badge = &suffix[..end];
176        let Some(badge) = badge.strip_suffix(".svg") else {
177            rest = &suffix[end..];
178            continue;
179        };
180
181        let mut segments: Vec<_> = badge
182            .split('-')
183            .filter(|segment| !segment.is_empty())
184            .collect();
185        if segments.len() < 2 {
186            rest = &suffix[end..];
187            continue;
188        }
189        segments.pop();
190        let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
191        if !candidate.is_empty() {
192            hints.push(canonical_shields_license_hint(&candidate));
193        }
194
195        rest = &suffix[end..];
196    }
197
198    hints.sort();
199    hints.dedup();
200    hints
201}
202
/// Turn a badge license label into a hint sentence.
///
/// The label is trimmed first. `MIT` and the two Apache 2.0 spellings map to
/// fixed phrases; any other label gets ` License` appended.
fn canonical_shields_license_hint(candidate: &str) -> String {
    let label = candidate.trim();

    if label == "MIT" {
        return String::from("The MIT License");
    }
    if matches!(label, "Apache-2.0" | "Apache 2.0") {
        return String::from("Apache License 2.0");
    }

    format!("{label} License")
}
210
211pub(crate) fn extract_text_for_detection_with_diagnostics(
212    path: &Path,
213    bytes: &[u8],
214) -> (String, ExtractedTextKind, Option<String>) {
215    let ext = path
216        .extension()
217        .and_then(|e| e.to_str())
218        .map(|s| s.to_ascii_lowercase());
219    let detected_format = detect_file_format(bytes);
220
221    if looks_like_rtf(bytes, ext.as_deref()) {
222        let text = extract_rtf_text(bytes);
223        return if text.trim().is_empty() {
224            (String::new(), ExtractedTextKind::None, None)
225        } else {
226            (text, ExtractedTextKind::Decoded, None)
227        };
228    }
229
230    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
231        let (text, scan_error) = extract_pdf_text(path, bytes);
232        return if text.is_empty() {
233            (String::new(), ExtractedTextKind::None, scan_error)
234        } else {
235            (text, ExtractedTextKind::Pdf, None)
236        };
237    }
238
239    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
240        let text = extract_image_metadata_text(bytes, format);
241        return if text.is_empty() {
242            if is_supported_image_container(bytes, format) {
243                (String::new(), ExtractedTextKind::None, None)
244            } else {
245                let decoded = decode_bytes_to_string(bytes);
246                if decoded.is_empty() {
247                    (String::new(), ExtractedTextKind::None, None)
248                } else {
249                    (decoded, ExtractedTextKind::Decoded, None)
250                }
251            }
252        } else {
253            (text, ExtractedTextKind::ImageMetadata, None)
254        };
255    }
256
257    if let Some(text) = extract_font_metadata_text(path, bytes) {
258        return (text, ExtractedTextKind::FontMetadata, None);
259    }
260
261    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
262
263    if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
264        return (String::new(), ExtractedTextKind::None, None);
265    }
266
267    if should_skip_binary_string_extraction(path, bytes, detected_format) {
268        return (String::new(), ExtractedTextKind::None, None);
269    }
270
271    let decoded = decode_bytes_to_string(bytes);
272    if !decoded.is_empty() {
273        let combined = combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
274        return (combined, ExtractedTextKind::Decoded, None);
275    }
276
277    let text = extract_printable_strings(bytes);
278    if text.is_empty() {
279        if let Some(metadata_text) = windows_executable_metadata_text {
280            (
281                metadata_text,
282                ExtractedTextKind::WindowsExecutableMetadata,
283                None,
284            )
285        } else {
286            (String::new(), ExtractedTextKind::None, None)
287        }
288    } else {
289        (
290            combine_extracted_text_fragments(windows_executable_metadata_text, text),
291            ExtractedTextKind::BinaryStrings,
292            None,
293        )
294    }
295}
296
/// Join an optional prefix and a suffix with a newline.
///
/// A missing or empty prefix yields the suffix unchanged; an empty suffix
/// yields the prefix alone, so no stray newline is ever produced.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(head) = prefix.filter(|candidate| !candidate.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        head
    } else {
        format!("{head}\n{suffix}")
    }
}
304
305pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
306    let detected_format = detect_file_format(bytes);
307    let detected_language = detect_language(path, bytes);
308    let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
309    let is_text = !is_binary;
310    let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
311    let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
312    let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
313    let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
314    let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
315    let programming_language = is_source.then(|| detected_language.clone()).flatten();
316    let file_type = detect_file_type(
317        path,
318        bytes,
319        detected_format,
320        &mime_type,
321        programming_language.as_deref(),
322        is_binary,
323        is_text,
324        is_archive,
325        is_media,
326        is_script,
327    );
328
329    FileInfoClassification {
330        mime_type,
331        file_type,
332        programming_language,
333        is_binary,
334        is_text,
335        is_archive,
336        is_media,
337        is_source,
338        is_script,
339    }
340}
341
342fn detect_file_format(bytes: &[u8]) -> FileFormat {
343    FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
344}
345
/// Return `true` when `bytes` form valid UTF-8 (an empty slice qualifies).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
349
350fn has_binary_control_chars(bytes: &[u8]) -> bool {
351    let control_count = bytes
352        .iter()
353        .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
354        .count();
355    control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
356}
357
358fn has_decodable_text(bytes: &[u8]) -> bool {
359    bytes.is_empty() || is_utf8_text(bytes) || !has_binary_control_chars(bytes)
360}
361
362fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
363    if bytes.is_empty() || is_utf8_text(bytes) {
364        return true;
365    }
366
367    let printable_count = bytes
368        .iter()
369        .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
370        .count();
371    printable_count * 2 >= bytes.len()
372}
373
/// Return `true` for media types that denote textual content.
///
/// That covers everything under `text/`, `application/json`,
/// `application/xml`, and any type with a `+json` or `+xml` suffix.
fn is_textual_media_type(media_type: &str) -> bool {
    // `text/xml` needs no separate entry: the prefix test already covers it.
    if media_type.starts_with("text/") {
        return true;
    }
    if media_type == "application/json" || media_type == "application/xml" {
        return true;
    }

    ["+json", "+xml"]
        .iter()
        .any(|suffix| media_type.ends_with(*suffix))
}
383
384fn is_textual_format(detected_format: FileFormat) -> bool {
385    matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
386        || is_textual_media_type(detected_format.media_type())
387}
388
389fn is_known_binary_format(detected_format: FileFormat) -> bool {
390    !matches!(detected_format, FileFormat::ArbitraryBinaryData)
391        && !is_textual_format(detected_format)
392}
393
394pub fn detect_mime_type(
395    path: &Path,
396    bytes: &[u8],
397    detected_format: FileFormat,
398    programming_language: Option<&str>,
399) -> String {
400    if bytes.is_empty() {
401        return "inode/x-empty".to_string();
402    }
403
404    if is_zip_archive(bytes) {
405        return detect_zip_like_mime(path);
406    }
407
408    if looks_like_deb(bytes, path) {
409        return "application/vnd.debian.binary-package".to_string();
410    }
411
412    if looks_like_rpm(bytes, path) {
413        return "application/x-rpm".to_string();
414    }
415
416    let guessed_mime = from_path(path)
417        .first_or_octet_stream()
418        .essence_str()
419        .to_string();
420
421    let mime_type = match detected_format {
422        FileFormat::Empty => "inode/x-empty".to_string(),
423        FileFormat::PlainText => {
424            if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
425                "text/plain".to_string()
426            } else {
427                guessed_mime.clone()
428            }
429        }
430        _ => {
431            let detected_mime = detected_format.media_type();
432            if detected_mime == "application/octet-stream"
433                && guessed_mime != "application/octet-stream"
434            {
435                guessed_mime.clone()
436            } else {
437                detected_mime.to_string()
438            }
439        }
440    };
441
442    normalize_mime_type(path, bytes, programming_language, &mime_type)
443}
444
445fn normalize_mime_type(
446    path: &Path,
447    bytes: &[u8],
448    programming_language: Option<&str>,
449    mime_type: &str,
450) -> String {
451    if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
452        return "text/plain".to_string();
453    }
454
455    mime_type.to_string()
456}
457
458fn should_prefer_text_mime(
459    path: &Path,
460    bytes: &[u8],
461    programming_language: Option<&str>,
462    mime_type: &str,
463) -> bool {
464    has_decodable_text(bytes)
465        && looks_like_textual_bytes(bytes)
466        && is_textual_source_candidate(path, programming_language)
467        && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
468}
469
470fn detect_is_binary(
471    path: &Path,
472    bytes: &[u8],
473    detected_format: FileFormat,
474    programming_language: Option<&str>,
475) -> bool {
476    if is_textual_format(detected_format) {
477        return false;
478    }
479
480    if lower_extension(path)
481        .as_deref()
482        .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
483    {
484        return true;
485    }
486
487    if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
488        return false;
489    }
490
491    has_binary_control_chars(bytes)
492        || is_known_binary_format(detected_format)
493        || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
494            && !looks_like_textual_bytes(bytes))
495}
496
497fn should_treat_binary_bytes_as_text(
498    path: &Path,
499    bytes: &[u8],
500    programming_language: Option<&str>,
501) -> bool {
502    has_decodable_text(bytes)
503        && looks_like_textual_bytes(bytes)
504        && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
505}
506
507fn detect_is_archive(
508    path: &Path,
509    bytes: &[u8],
510    mime_type: &str,
511    is_text: bool,
512    detected_format: FileFormat,
513) -> bool {
514    if is_text {
515        return false;
516    }
517
518    lower_extension(path)
519        .as_deref()
520        .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
521        || matches!(
522            detected_format.kind(),
523            FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
524        )
525        || is_zip_archive(bytes)
526        || looks_like_gzip(bytes)
527        || looks_like_bzip2(bytes)
528        || looks_like_xz(bytes)
529        || looks_like_deb(bytes, path)
530        || looks_like_rpm(bytes, path)
531        || looks_like_squashfs(bytes, path)
532        || mime_type.contains("zip")
533        || mime_type.contains("compressed")
534        || mime_type.contains("tar")
535        || mime_type.contains("x-rpm")
536        || mime_type.contains("debian")
537}
538
539fn detect_is_media(
540    path: &Path,
541    bytes: &[u8],
542    mime_type: &str,
543    detected_format: FileFormat,
544) -> bool {
545    media_mime_from_content(bytes).is_some()
546        || matches!(
547            detected_format.kind(),
548            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
549        )
550        || mime_type.starts_with("image/")
551        || mime_type.starts_with("audio/")
552        || mime_type.starts_with("video/")
553        || (mime_type == "application/octet-stream"
554            && lower_extension(path).as_deref() == Some("tga")
555            && !has_binary_control_chars(bytes))
556}
557
558fn detect_is_script(
559    path: &Path,
560    bytes: &[u8],
561    programming_language: Option<&str>,
562    is_text: bool,
563) -> bool {
564    if !is_text || is_makefile(path) {
565        return false;
566    }
567
568    bytes.starts_with(b"#!")
569        || lower_extension(path).as_deref().is_some_and(|ext| {
570            matches!(
571                ext,
572                "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
573            )
574        })
575        || matches!(
576            programming_language,
577            Some("Shell" | "Python" | "Ruby" | "Perl" | "PHP" | "PowerShell" | "Awk")
578        )
579}
580
581fn detect_is_source(
582    path: &Path,
583    programming_language: Option<&str>,
584    is_text: bool,
585    is_script: bool,
586) -> bool {
587    if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
588        return false;
589    }
590
591    if is_c_like_source(path) || is_java_like_source(path) {
592        return true;
593    }
594
595    programming_language.is_some() || is_script
596}
597
598#[allow(clippy::too_many_arguments)]
599fn detect_file_type(
600    path: &Path,
601    bytes: &[u8],
602    detected_format: FileFormat,
603    mime_type: &str,
604    programming_language: Option<&str>,
605    is_binary: bool,
606    is_text: bool,
607    is_archive: bool,
608    is_media: bool,
609    is_script: bool,
610) -> String {
611    if bytes.is_empty() {
612        return "empty".to_string();
613    }
614
615    if looks_like_pdf(bytes) {
616        return "PDF document".to_string();
617    }
618
619    if let Some(file_type) = media_file_type_from_content(bytes) {
620        return file_type.to_string();
621    }
622
623    if is_archive {
624        return archive_file_type(path, bytes, detected_format);
625    }
626
627    if is_script {
628        return script_file_type(programming_language, bytes);
629    }
630
631    if is_text {
632        if lower_extension(path).as_deref() == Some("json") {
633            return "JSON text data".to_string();
634        }
635        if lower_extension(path).as_deref() == Some("xml") {
636            return "XML text data".to_string();
637        }
638        if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
639            return "YAML text data".to_string();
640        }
641        if lower_extension(path).as_deref() == Some("toml") {
642            return "TOML text data".to_string();
643        }
644        if matches!(
645            lower_extension(path).as_deref(),
646            Some("ini" | "cfg" | "conf")
647        ) {
648            return "INI text data".to_string();
649        }
650        if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
651            return "Git configuration text".to_string();
652        }
653        if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
654            return text_file_type(bytes);
655        }
656        if programming_language.is_some() && !is_media {
657            return text_file_type(bytes);
658        }
659        return text_file_type(bytes);
660    }
661
662    if let Some(file_type) = format_based_file_type(detected_format) {
663        return file_type;
664    }
665
666    if is_binary && mime_type == "application/octet-stream" {
667        return "data".to_string();
668    }
669
670    mime_type.to_string()
671}
672
673fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
674    if matches!(programming_language, Some(language) if is_source_like_language(language)) {
675        return true;
676    }
677
678    if matches!(
679        lower_file_name(path).as_str(),
680        "dockerfile"
681            | "containerfile"
682            | "containerfile.core"
683            | "apkbuild"
684            | "podfile"
685            | "meson.build"
686            | "build"
687            | "workspace"
688            | "buck"
689            | "default.nix"
690            | "flake.nix"
691            | "shell.nix"
692    ) {
693        return true;
694    }
695
696    path.extension()
697        .and_then(|ext| ext.to_str())
698        .is_some_and(|ext| {
699            matches!(
700                ext.to_ascii_lowercase().as_str(),
701                "rs" | "py"
702                    | "js"
703                    | "mjs"
704                    | "cjs"
705                    | "jsx"
706                    | "ts"
707                    | "mts"
708                    | "cts"
709                    | "tsx"
710                    | "c"
711                    | "cpp"
712                    | "cc"
713                    | "cxx"
714                    | "h"
715                    | "hpp"
716                    | "m"
717                    | "mm"
718                    | "s"
719                    | "asm"
720                    | "java"
721                    | "go"
722                    | "rb"
723                    | "php"
724                    | "pl"
725                    | "swift"
726                    | "sh"
727                    | "bash"
728                    | "zsh"
729                    | "fish"
730                    | "ksh"
731                    | "ps1"
732                    | "psm1"
733                    | "psd1"
734                    | "awk"
735                    | "kt"
736                    | "kts"
737                    | "dart"
738                    | "scala"
739                    | "groovy"
740                    | "gradle"
741                    | "gvy"
742                    | "gy"
743                    | "gsh"
744                    | "cs"
745                    | "fs"
746                    | "fsx"
747                    | "r"
748                    | "lua"
749                    | "jl"
750                    | "ex"
751                    | "exs"
752                    | "clj"
753                    | "cljs"
754                    | "cljc"
755                    | "hs"
756                    | "erl"
757                    | "nix"
758                    | "zig"
759                    | "bzl"
760                    | "bazel"
761                    | "star"
762                    | "sky"
763                    | "ml"
764                    | "mli"
765                    | "tex"
766            )
767        })
768}
769
/// Return `true` when `language` is one of the language names treated as
/// source-like. The comparison is exact and case-sensitive.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
815
/// Return the path's extension as UTF-8 text, or `None` when there is no
/// extension or it is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    path.extension()?.to_str()
}
819
820fn lower_extension(path: &Path) -> Option<String> {
821    extension(path).map(|ext| ext.to_ascii_lowercase())
822}
823
/// Return the final path component in ASCII lower case.
///
/// Yields an empty string when the path has no final component or the name is
/// not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
830
831fn is_plain_text(path: &Path) -> bool {
832    lower_extension(path)
833        .as_deref()
834        .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
835}
836
837fn is_makefile(path: &Path) -> bool {
838    matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
839}
840
/// Return `true` when the path ends in `.js.map` or `.css.map`
/// (case-insensitive).
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();

    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
845
846fn is_c_like_source(path: &Path) -> bool {
847    lower_extension(path).as_deref().is_some_and(|ext| {
848        matches!(
849            ext,
850            "c" | "cc"
851                | "cp"
852                | "cpp"
853                | "cxx"
854                | "c++"
855                | "h"
856                | "hh"
857                | "hpp"
858                | "hxx"
859                | "h++"
860                | "i"
861                | "ii"
862                | "m"
863                | "s"
864                | "asm"
865        )
866    })
867}
868
869fn is_java_like_source(path: &Path) -> bool {
870    lower_extension(path)
871        .as_deref()
872        .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
873}
874
875fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
876    match detected_format {
877        FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
878        format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
879        format => Some(match format.kind() {
880            FileFormatKind::Image => short_name_or_name(&format, "image data"),
881            FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
882            FileFormatKind::Video => short_name_or_name(&format, "video data"),
883            _ => format.name().to_string(),
884        }),
885    }
886}
887
888fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
889    format
890        .short_name()
891        .map(|short_name| format!("{short_name} {suffix}"))
892        .unwrap_or_else(|| format!("{} {suffix}", format.name()))
893}
894
895fn detect_zip_like_mime(path: &Path) -> String {
896    match extension(path).map(|ext| ext.to_ascii_lowercase()) {
897        Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
898        Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
899            "application/java-archive".to_string()
900        }
901        _ => "application/zip".to_string(),
902    }
903}
904
/// Identify PNG, JPEG, TIFF, and WebP content by signature and return its
/// MIME type, or `None` for anything else.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("image/png");
    }
    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("image/jpeg");
    }
    // TIFF comes in little-endian ("II") and big-endian ("MM") flavours.
    if bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a") {
        return Some("image/tiff");
    }

    // WebP is a RIFF container whose form type sits at bytes 8..12.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    if is_webp {
        Some("image/webp")
    } else {
        None
    }
}
918
/// Identify PNG, JPEG, TIFF, and WebP content by signature and return a
/// human-readable description, or `None` for anything else.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    // Leading byte signatures and the description each one implies.
    const SIGNATURES: [(&[u8], &str); 4] = [
        (b"\x89PNG\r\n\x1a\n", "PNG image data"),
        (&[0xff, 0xd8, 0xff], "JPEG image data"),
        (b"II\x2a\x00", "TIFF image data"),
        (b"MM\x00\x2a", "TIFF image data"),
    ];

    for (magic, label) in SIGNATURES.iter() {
        if bytes.starts_with(magic) {
            return Some(*label);
        }
    }

    // WebP is a RIFF container whose form type sits at bytes 8..12.
    if bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]) {
        return Some("WebP image data");
    }

    None
}
932
/// Return `true` when the content starts with the `%PDF-` signature.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
936
/// Return `true` when the file is RTF, judged either by an `rtf` extension
/// (`ext` is compared as given, so callers pass it lower-cased) or by the
/// `{\rtf` signature at the start of the content.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if ext == Some("rtf") {
        return true;
    }

    bytes.starts_with(b"{\\rtf")
}
940
/// Convert RTF markup into an approximate plain-text rendering.
///
/// The input is decoded lossily as UTF-8 and scanned once:
/// - group braces are dropped, `\\`, `\{` and `\}` yield the escaped character;
/// - `\'hh` yields the byte `hh` mapped directly to the code point of the same
///   value (the document code page is not consulted);
/// - `\par`/`\line`, `\tab` and a handful of punctuation control words are
///   mapped to their text equivalent, `\uN` to the Unicode scalar `N`
///   (negative values are offset by 65536);
/// - every other control word is discarded together with its numeric
///   parameter and one delimiting space.
///
/// Finally `\r` and form feeds become newlines and trailing whitespace is
/// trimmed from each line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    /// Decode the two hex digits of a `\'hh` escape whose apostrophe is at
    /// `quote_index`. Returns `None` when the apostrophe or a hex digit is
    /// missing, so signs such as `+` are never accepted as digits.
    fn hex_escape_byte(chars: &[char], quote_index: usize) -> Option<u8> {
        if chars.get(quote_index) != Some(&'\'') {
            return None;
        }
        let high = chars.get(quote_index + 1)?.to_digit(16)?;
        let low = chars.get(quote_index + 2)?.to_digit(16)?;
        u8::try_from(high * 16 + low).ok()
    }

    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let is_digit_at = |position: usize| chars.get(position).is_some_and(char::is_ascii_digit);

    let mut output = String::new();
    let mut index = 0usize;

    while index < chars.len() {
        match chars[index] {
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                if index >= chars.len() {
                    break;
                }

                match chars[index] {
                    '\\' | '{' | '}' => {
                        output.push(chars[index]);
                        index += 1;
                    }
                    '\'' => match hex_escape_byte(&chars, index) {
                        Some(value) => {
                            output.push(char::from(value));
                            index += 3;
                        }
                        // Malformed escape: drop the apostrophe and rescan what follows as text.
                        None => index += 1,
                    },
                    control if control.is_ascii_alphabetic() => {
                        let start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[start..index].iter().collect();

                        // A '-' belongs to the parameter only when a digit follows it;
                        // otherwise it is document text and must not be swallowed.
                        let number_start = index;
                        if chars.get(index) == Some(&'-') && is_digit_at(index + 1) {
                            index += 1;
                        }
                        while is_digit_at(index) {
                            index += 1;
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space after a control word is a delimiter, not text.
                        if chars.get(index) == Some(&' ') {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                if let Ok(codepoint) = parameter.parse::<i32>() {
                                    // RTF stores code points above 32767 as negative 16-bit values.
                                    let normalized = if codepoint < 0 {
                                        codepoint + 65_536
                                    } else {
                                        codepoint
                                    };
                                    if let Some(ch) =
                                        u32::try_from(normalized).ok().and_then(char::from_u32)
                                    {
                                        output.push(ch);
                                    }
                                }

                                // Skip the single fallback character that follows `\uN`.
                                // The fallback may itself be written as `\'hh`; any other
                                // control sequence, group delimiter or line break is kept.
                                match chars.get(index).copied() {
                                    Some('\\') if hex_escape_byte(&chars, index + 1).is_some() => {
                                        index += 4;
                                    }
                                    Some('\\' | '{' | '}' | '\n' | '\r') | None => {}
                                    Some(_) => index += 1,
                                }
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}
1047
/// Return `true` when `bytes` starts with the two-byte `1f 8b` signature.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1051
/// Return `true` when `bytes` starts with the ASCII marker `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1055
/// Return `true` when `bytes` starts with the six-byte `fd 37 7a 58 5a 00`
/// (`\xfd7zXZ\0`) signature.
fn looks_like_xz(bytes: &[u8]) -> bool {
    const XZ_MAGIC: &[u8] = b"\xfd7zXZ\x00";

    bytes.get(..XZ_MAGIC.len()) == Some(XZ_MAGIC)
}
1059
1060fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1061    lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1062}
1063
1064fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1065    lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1066}
1067
1068fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1069    lower_extension(path)
1070        .as_deref()
1071        .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1072        && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1073            || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1074}
1075
1076fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1077    if looks_like_deb(bytes, path) {
1078        "debian binary package (format 2.0)".to_string()
1079    } else if looks_like_rpm(bytes, path) {
1080        "RPM package".to_string()
1081    } else if looks_like_squashfs(bytes, path) {
1082        "Squashfs filesystem".to_string()
1083    } else if looks_like_gzip(bytes) {
1084        "gzip compressed data".to_string()
1085    } else if looks_like_bzip2(bytes) {
1086        "bzip2 compressed data".to_string()
1087    } else if looks_like_xz(bytes) {
1088        "XZ compressed data".to_string()
1089    } else if is_zip_archive(bytes) {
1090        "Zip archive data".to_string()
1091    } else if lower_extension(path).as_deref() == Some("gem") {
1092        "POSIX tar archive".to_string()
1093    } else if let Some(file_type) = format_based_file_type(detected_format) {
1094        file_type
1095    } else {
1096        "archive data".to_string()
1097    }
1098}
1099
1100fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1101    let suffix = text_executable_label(bytes);
1102
1103    match programming_language {
1104        Some("Python") => format!("python script, {suffix}"),
1105        Some("Ruby") => format!("ruby script, {suffix}"),
1106        Some("Perl") => format!("perl script, {suffix}"),
1107        Some("PHP") => format!("php script, {suffix}"),
1108        Some("Shell") => format!("shell script, {suffix}"),
1109        Some("JavaScript") => format!("javascript script, {suffix}"),
1110        Some("TypeScript") => format!("typescript script, {suffix}"),
1111        Some("PowerShell") => format!("powershell script, {suffix}"),
1112        Some("Awk") => format!("awk script, {suffix}"),
1113        _ => format!("script, {suffix}"),
1114    }
1115}
1116
1117fn text_file_type(bytes: &[u8]) -> String {
1118    text_label(bytes).to_string()
1119}
1120
/// Describe textual content by two properties: whether it is valid UTF-8 and
/// whether it contains at least one `\n` byte.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1134
/// Describe executable text (scripts) by two properties: whether the content
/// is valid UTF-8 and whether it contains at least one `\n` byte.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1148
1149fn supported_image_metadata_format(
1150    ext: Option<&str>,
1151    detected_format: FileFormat,
1152) -> Option<ImageFormat> {
1153    match ext {
1154        Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1155        Some("png") => Some(ImageFormat::Png),
1156        Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1157        Some("webp") => Some(ImageFormat::WebP),
1158        _ => match detected_format.media_type() {
1159            "image/jpeg" => Some(ImageFormat::Jpeg),
1160            "image/png" => Some(ImageFormat::Png),
1161            "image/tiff" => Some(ImageFormat::Tiff),
1162            "image/webp" => Some(ImageFormat::WebP),
1163            _ => None,
1164        },
1165    }
1166}
1167
1168fn should_skip_binary_string_extraction(
1169    path: &Path,
1170    bytes: &[u8],
1171    detected_format: FileFormat,
1172) -> bool {
1173    matches!(lower_extension(path).as_deref(), Some("pdf"))
1174        || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1175            .is_some()
1176        || (matches!(
1177            detected_format.kind(),
1178            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1179        ) && !is_textual_format(detected_format))
1180        || media_mime_from_content(bytes).is_some()
1181        || is_zip_archive(bytes)
1182        || looks_like_gzip(bytes)
1183        || looks_like_bzip2(bytes)
1184        || looks_like_xz(bytes)
1185        || looks_like_deb(bytes, path)
1186        || looks_like_rpm(bytes, path)
1187        || looks_like_squashfs(bytes, path)
1188}
1189
1190fn should_skip_large_opaque_binary_text_extraction(
1191    _path: &Path,
1192    bytes: &[u8],
1193    detected_format: FileFormat,
1194) -> bool {
1195    if bytes.len() < LARGE_OPAQUE_BINARY_SKIP_BYTES {
1196        return false;
1197    }
1198
1199    if !matches!(detected_format, FileFormat::ArbitraryBinaryData) {
1200        return false;
1201    }
1202
1203    if !has_binary_control_chars(bytes) {
1204        return false;
1205    }
1206
1207    !sample_has_promising_printable_strings(bytes)
1208}
1209
1210fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1211    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;
1212    const MIN_PROMISING_RUN: usize = 16;
1213    const MIN_PROMISING_WINDOWS: usize = 2;
1214
1215    let len = bytes.len();
1216    let mut windows = Vec::new();
1217    windows.push(&bytes[..bytes.len().min(SAMPLE_WINDOW_BYTES)]);
1218    if len > SAMPLE_WINDOW_BYTES * 2 {
1219        let mid_start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
1220        let mid_end = (mid_start + SAMPLE_WINDOW_BYTES).min(len);
1221        windows.push(&bytes[mid_start..mid_end]);
1222    }
1223    if len > SAMPLE_WINDOW_BYTES {
1224        windows.push(&bytes[len - SAMPLE_WINDOW_BYTES..]);
1225    }
1226
1227    let promising_windows = windows
1228        .iter()
1229        .filter(|window| has_promising_printable_run(window, MIN_PROMISING_RUN))
1230        .count();
1231
1232    promising_windows >= MIN_PROMISING_WINDOWS
1233        || windows
1234            .iter()
1235            .any(|window| has_strong_structured_text_signal(window))
1236}
1237
1238fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1239    let strings = extract_printable_strings(bytes);
1240    if strings.is_empty() {
1241        return false;
1242    }
1243
1244    let email_markers = strings.matches('@').count();
1245    let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1246
1247    email_markers + url_markers >= 3
1248}
1249
1250fn has_promising_printable_run(bytes: &[u8], min_run: usize) -> bool {
1251    longest_printable_ascii_run(bytes) >= min_run
1252        || longest_utf16le_printable_ascii_run(bytes) >= min_run
1253        || longest_utf16be_printable_ascii_run(bytes) >= min_run
1254}
1255
/// Length of the longest run of consecutive bytes in `0x20..=0x7E`
/// (printable ASCII including space); `0` when there is none.
fn longest_printable_ascii_run(bytes: &[u8]) -> usize {
    bytes
        .split(|&byte| !matches!(byte, 0x20..=0x7E))
        .map(|run| run.len())
        .max()
        .unwrap_or(0)
}
1269
1270fn longest_utf16le_printable_ascii_run(bytes: &[u8]) -> usize {
1271    longest_utf16_printable_ascii_run(bytes, true)
1272}
1273
1274fn longest_utf16be_printable_ascii_run(bytes: &[u8]) -> usize {
1275    longest_utf16_printable_ascii_run(bytes, false)
1276}
1277
/// Length, in characters, of the longest run of printable ASCII characters
/// encoded as UTF-16 code units in `bytes`.
///
/// The input is read as consecutive two-byte units starting at offset 0. A
/// unit counts when one byte is in `0x20..=0x7E` and the other is zero: the
/// character byte comes first for little-endian, second for big-endian. A
/// trailing odd byte is ignored.
///
/// Both byte orders are scanned from offset 0. Starting the big-endian scan
/// at offset 1 would miss ordinary even-aligned UTF-16BE text and mostly
/// re-measure UTF-16LE text shifted by one byte.
fn longest_utf16_printable_ascii_run(bytes: &[u8], little_endian: bool) -> usize {
    let mut best = 0usize;
    let mut current = 0usize;

    for unit in bytes.chunks_exact(2) {
        let (ch, zero) = if little_endian {
            (unit[0], unit[1])
        } else {
            (unit[1], unit[0])
        };

        if zero == 0 && matches!(ch, 0x20..=0x7E) {
            current += 1;
            best = best.max(current);
        } else {
            current = 0;
        }
    }

    best
}
1299
1300fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1301    match format {
1302        ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1303        ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1304        ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1305        ImageFormat::WebP => {
1306            bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1307        }
1308        _ => false,
1309    }
1310}
1311
1312fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1313    let mut values = Vec::new();
1314    values.extend(extract_exif_metadata_values(bytes));
1315    values.extend(extract_xmp_metadata_values(bytes, format));
1316    values_to_text(values)
1317}
1318
1319fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1320    let mut cursor = BufReader::new(Cursor::new(bytes));
1321    let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1322        Ok(exif) => exif,
1323        Err(_) => return Vec::new(),
1324    };
1325
1326    let mut values = Vec::new();
1327    for field in exif.fields() {
1328        let rendered = match field.tag {
1329            exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
1330                Some(field.display_value().with_unit(&exif).to_string())
1331            }
1332            exif::Tag::Artist => Some(format!(
1333                "Author: {}",
1334                field.display_value().with_unit(&exif)
1335            )),
1336            _ => None,
1337        };
1338
1339        if let Some(rendered) = rendered {
1340            values.push(rendered);
1341        }
1342    }
1343
1344    values
1345}
1346
1347fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1348    let xmp = match extract_raw_xmp_packet(bytes, format) {
1349        Some(xmp) => xmp,
1350        None => return Vec::new(),
1351    };
1352
1353    parse_xmp_values(&xmp)
1354}
1355
1356fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1357    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1358    if let Ok(mut decoder) = reader.into_decoder()
1359        && let Ok(Some(xmp)) = decoder.xmp_metadata()
1360    {
1361        return Some(xmp);
1362    }
1363
1364    match format {
1365        ImageFormat::Png => extract_png_xmp_packet(bytes),
1366        _ => None,
1367    }
1368}
1369
1370fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1371    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1372
1373    if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1374        return None;
1375    }
1376
1377    let mut offset = PNG_SIGNATURE.len();
1378    while offset + 12 <= bytes.len() {
1379        let length = u32::from_be_bytes([
1380            bytes[offset],
1381            bytes[offset + 1],
1382            bytes[offset + 2],
1383            bytes[offset + 3],
1384        ]) as usize;
1385        let chunk_start = offset + 8;
1386        let chunk_end = chunk_start + length;
1387        if chunk_end + 4 > bytes.len() {
1388            return None;
1389        }
1390
1391        let chunk_type = &bytes[offset + 4..offset + 8];
1392        if chunk_type == b"iTXt" {
1393            let data = &bytes[chunk_start..chunk_end];
1394            if let Some(xmp) = parse_png_itxt_xmp(data) {
1395                return Some(xmp);
1396            }
1397        }
1398
1399        offset = chunk_end + 4;
1400    }
1401
1402    None
1403}
1404
1405fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1406    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1407
1408    let keyword_end = data.iter().position(|&b| b == 0)?;
1409    if &data[..keyword_end] != XMP_KEYWORD {
1410        return None;
1411    }
1412
1413    let mut cursor = keyword_end + 1;
1414    let compression_flag = *data.get(cursor)?;
1415    cursor += 1;
1416    let compression_method = *data.get(cursor)?;
1417    cursor += 1;
1418    if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1419        return None;
1420    }
1421
1422    let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1423    cursor = language_end + 1;
1424
1425    let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1426    cursor = translated_end + 1;
1427
1428    let text_bytes = &data[cursor..];
1429    if compression_flag == 1 {
1430        let mut decoder = ZlibDecoder::new(text_bytes);
1431        let mut decoded = Vec::new();
1432        decoder.read_to_end(&mut decoded).ok()?;
1433        Some(decoded)
1434    } else {
1435        Some(text_bytes.to_vec())
1436    }
1437}
1438
1439fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1440    let mut reader = XmlReader::from_reader(xmp);
1441    reader.config_mut().trim_text(true);
1442
1443    let mut buf = Vec::new();
1444    let mut stack: Vec<String> = Vec::new();
1445    let mut values = Vec::new();
1446
1447    loop {
1448        match reader.read_event_into(&mut buf) {
1449            Ok(Event::Start(e)) => {
1450                stack.push(local_xml_name(e.name().as_ref()));
1451            }
1452            Ok(Event::End(_)) => {
1453                stack.pop();
1454            }
1455            Ok(Event::Empty(_)) => {}
1456            Ok(Event::Text(text)) => {
1457                if let Some(field) = stack
1458                    .iter()
1459                    .rev()
1460                    .find_map(|name| allowed_xmp_field(name.as_str()))
1461                    && let Ok(decoded) = text.decode()
1462                {
1463                    let decoded = decoded.into_owned();
1464                    if !decoded.trim().is_empty() {
1465                        values.push(format_xmp_value(field, &decoded));
1466                    }
1467                }
1468            }
1469            Ok(Event::CData(text)) => {
1470                if let Some(field) = stack
1471                    .iter()
1472                    .rev()
1473                    .find_map(|name| allowed_xmp_field(name.as_str()))
1474                    && let Ok(decoded) = text.decode()
1475                {
1476                    let decoded = decoded.into_owned();
1477                    if !decoded.trim().is_empty() {
1478                        values.push(format_xmp_value(field, &decoded));
1479                    }
1480                }
1481            }
1482            Ok(Event::Eof) | Err(_) => break,
1483            _ => {}
1484        }
1485        buf.clear();
1486    }
1487
1488    values
1489}
1490
/// Return the part of a qualified XML name after its last `:` (the whole
/// name when there is no `:`). Names that are not valid UTF-8 become an
/// empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    let local = match qualified.rsplit_once(':') {
        Some((_prefix, local)) => local,
        None => qualified,
    };

    local.to_owned()
}
1495
/// Map an XML element local name to the internal field identifier for the
/// XMP fields that are collected; `None` for every other name. Matching is
/// case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    // (element local name, internal field identifier)
    const ALLOWED_FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    ALLOWED_FIELDS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, field)| field)
}
1508
/// Render an XMP value for output: `creator` values get an `Author: `
/// prefix, every other field is returned unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
1515
1516fn values_to_text(values: Vec<String>) -> String {
1517    let mut seen = BTreeSet::new();
1518    let mut lines = Vec::new();
1519    let mut total_bytes = 0usize;
1520
1521    for value in values {
1522        if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1523            break;
1524        }
1525
1526        let normalized = normalize_metadata_value(&value);
1527        if normalized.is_empty() || !seen.insert(normalized.clone()) {
1528            continue;
1529        }
1530
1531        let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1532        if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1533            break;
1534        }
1535
1536        total_bytes += added_bytes;
1537        lines.push(normalized);
1538    }
1539
1540    lines.join("\n")
1541}
1542
1543fn normalize_metadata_value(value: &str) -> String {
1544    value
1545        .chars()
1546        .filter(|&ch| ch != '\0')
1547        .collect::<String>()
1548        .split_whitespace()
1549        .collect::<Vec<_>>()
1550        .join(" ")
1551        .trim()
1552        .to_string()
1553}
1554
1555fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1556    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1557        return (String::new(), None);
1558    }
1559
1560    let mut failures = Vec::new();
1561    let mut saw_success = false;
1562
1563    let extracted = catch_unwind(AssertUnwindSafe(
1564        || -> Result<String, Box<dyn std::error::Error>> {
1565            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1566            extract_first_pdf_page_text(&mut document)
1567        },
1568    ));
1569    match extracted {
1570        Ok(Ok(text)) => {
1571            saw_success = true;
1572            if let Some(normalized) = normalize_pdf_text(text) {
1573                return (normalized, None);
1574            }
1575        }
1576        Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1577        Err(payload) => failures.push(format!(
1578            "from-bytes first-page panic: {}",
1579            panic_payload_to_string(payload.as_ref())
1580        )),
1581    }
1582
1583    let extracted = catch_unwind(AssertUnwindSafe(
1584        || -> Result<String, Box<dyn std::error::Error>> {
1585            let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1586            extract_pdf_text_from_document(&mut document)
1587        },
1588    ));
1589    match extracted {
1590        Ok(Ok(text)) => {
1591            saw_success = true;
1592            if let Some(normalized) = normalize_pdf_text(text) {
1593                return (normalized, None);
1594            }
1595        }
1596        Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
1597        Err(payload) => failures.push(format!(
1598            "open full-document panic: {}",
1599            panic_payload_to_string(payload.as_ref())
1600        )),
1601    }
1602
1603    let extracted = catch_unwind(AssertUnwindSafe(
1604        || -> Result<String, Box<dyn std::error::Error>> {
1605            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1606            extract_pdf_text_from_document(&mut document)
1607        },
1608    ));
1609    match extracted {
1610        Ok(Ok(text)) => {
1611            saw_success = true;
1612            if let Some(normalized) = normalize_pdf_text(text) {
1613                return (normalized, None);
1614            }
1615        }
1616        Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
1617        Err(payload) => failures.push(format!(
1618            "from-bytes full-document panic: {}",
1619            panic_payload_to_string(payload.as_ref())
1620        )),
1621    }
1622
1623    if saw_success || is_non_actionable_pdf_failure(&failures) {
1624        (String::new(), None)
1625    } else {
1626        (
1627            String::new(),
1628            Some(format!(
1629                "PDF text extraction failed after {} attempts: {}",
1630                failures.len(),
1631                failures.join("; ")
1632            )),
1633        )
1634    }
1635}
1636
/// Return `true` when there is at least one failure and every failure
/// message mentions either a password requirement or an invalid
/// cross-reference table.
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 2] =
        ["requires a password", "Invalid cross-reference table"];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|&marker| failure.contains(marker))
    })
}
1644
/// Turn a panic payload into a message: `&str` and `String` payloads are
/// returned as text, anything else becomes `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
1654
1655fn extract_first_pdf_page_text(
1656    document: &mut pdf_oxide::document::PdfDocument,
1657) -> Result<String, Box<dyn std::error::Error>> {
1658    if document.page_count()? == 0 {
1659        return Ok(String::new());
1660    }
1661
1662    let extracted_text = document.extract_text(0)?;
1663    let markdown_text =
1664        document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1665    if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1666        return Ok(extracted_text);
1667    }
1668
1669    let pipeline_text =
1670        document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1671
1672    Ok(merge_pdf_first_page_text(
1673        &extracted_text,
1674        &markdown_text,
1675        &pipeline_text,
1676    ))
1677}
1678
1679fn extract_pdf_text_from_document(
1680    document: &mut pdf_oxide::document::PdfDocument,
1681) -> Result<String, Box<dyn std::error::Error>> {
1682    Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1683}
1684
/// Replace carriage returns and form feeds with newlines; return `None` when
/// the result contains nothing but whitespace.
fn normalize_pdf_text(text: String) -> Option<String> {
    let unified = text.replace(['\r', '\u{0c}'], "\n");

    if unified.trim().is_empty() {
        None
    } else {
        Some(unified)
    }
}
1689
1690fn merge_pdf_first_page_text(
1691    _extracted_text: &str,
1692    markdown_text: &str,
1693    pipeline_text: &str,
1694) -> String {
1695    let pipeline = pipeline_text.trim();
1696    if pipeline.is_empty() {
1697        return String::new();
1698    }
1699
1700    let prefix = pdf_first_page_heading_prefix(markdown_text);
1701    let Some(prefix) = prefix else {
1702        return pipeline_text.to_string();
1703    };
1704
1705    if pdf_text_contains_heading_prefix(pipeline, &prefix) {
1706        pipeline_text.to_string()
1707    } else {
1708        format!("{prefix}\n\n{pipeline}")
1709    }
1710}
1711
1712fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
1713    normalize_pdf_heading_comparison_text(text)
1714        .contains(&normalize_pdf_heading_comparison_text(prefix))
1715}
1716
/// Produce a comparison key for heading text: whitespace runs collapse to a
/// single space, surrounding whitespace is dropped, and ASCII letters are
/// lower-cased (non-ASCII characters are left untouched).
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }

    normalized.make_ascii_lowercase();
    normalized
}
1723
1724fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1725    let mut lines = Vec::new();
1726
1727    for line in pdf_markdown_heading_lines(markdown_text) {
1728        push_unique_line(&mut lines, line);
1729    }
1730
1731    (!lines.is_empty()).then(|| lines.join("\n"))
1732}
1733
1734fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1735    text.lines()
1736        .map(str::trim)
1737        .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1738        .map(|line| line.trim_matches('#').trim())
1739        .filter(|line| !line.is_empty())
1740        .filter(|line| !looks_like_numbered_section_heading(line))
1741        .take(4)
1742        .map(ToOwned::to_owned)
1743        .collect()
1744}
1745
/// Append `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }

    lines.push(line);
}
1751
/// Return `true` when `line` starts with one or more ASCII digits
/// immediately followed by a `.` (for example `3. Grant`, `10. Termination`
/// or `1.2 Scope`).
///
/// Any number of leading digits is accepted, so section numbers of 10 and
/// above are recognized as well as single-digit ones.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // At least one digit must have been consumed before the dot.
    let has_leading_digits = after_digits.len() < line.len();

    has_leading_digits && after_digits.starts_with('.')
}
1764
/// Return `true` when `bytes` starts with `PK` followed by one of the byte
/// pairs `03 04`, `05 06` or `07 08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
1770
/// Extract printable strings from binary content, one per line.
///
/// Three passes run over `bytes`, each appending to the same output:
/// 1. runs of printable ASCII bytes (`0x20..=0x7E`);
/// 2. runs of UTF-16LE code units holding a printable ASCII character;
/// 3. runs of UTF-16BE code units holding a printable ASCII character.
///
/// Runs shorter than four characters are dropped. Both UTF-16 passes read
/// two-byte units starting at offset 0. Scanning the big-endian pass from
/// offset 1 would emit shifted duplicates of little-endian strings and miss
/// ordinary even-aligned big-endian ones.
///
/// Output is capped at the input length clamped to 2,000,000..=16,000,000
/// bytes. The cap is checked after a run has been flushed, so the result can
/// exceed it by at most the length of the last run.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    /// Append `run` to `out` as a new line when it is long enough, then reset it.
    fn flush_run(out: &mut String, run: &mut Vec<u8>) {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            // `run` only ever holds printable ASCII, so byte-to-char is exact.
            out.extend(run.iter().copied().map(char::from));
        }
        run.clear();
    }

    let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= max_output_bytes {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= max_output_bytes {
        return out;
    }

    for &little_endian in &[true, false] {
        run.clear();
        for unit in bytes.chunks_exact(2) {
            let (ch, zero) = if little_endian {
                (unit[0], unit[1])
            } else {
                (unit[1], unit[0])
            };
            if zero == 0 && is_printable_ascii(ch) {
                run.push(ch);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= max_output_bytes {
                    return out;
                }
            }
        }
        flush_run(&mut out, &mut run);
        if out.len() >= max_output_bytes {
            return out;
        }
    }

    out
}
1835
#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::{
        ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, classify_file_info,
        extract_printable_strings, extract_text_for_detection,
        extract_text_for_detection_with_diagnostics, is_non_actionable_pdf_failure,
        normalize_mime_type, normalize_pdf_heading_comparison_text,
    };

    /// PDF fixture whose text includes the BSD redistribution clause.
    const BSD_NEW_PDF: &str = "testdata/license-golden/datadriven/lic2/bsd-new_156.pdf";

    /// PDF fixture headed "SUN INDUSTRY STANDARDS SOURCE LICENSE".
    const SUN_LICENSE_PDF: &str =
        "testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf";

    /// Printable run planted inside otherwise zero-filled blobs.
    const COPYRIGHT_MARKER: &[u8] = b"Copyright 2026 Example Project!!!";

    /// Reads a fixture from disk, panicking with `failure_message` when it cannot be read.
    ///
    /// `#[track_caller]` keeps the reported panic location on the calling test.
    #[track_caller]
    fn read_fixture(path: &Path, failure_message: &str) -> Vec<u8> {
        std::fs::read(path).expect(failure_message)
    }

    /// Builds a zero-filled buffer 8 bytes longer than `LARGE_OPAQUE_BINARY_SKIP_BYTES`
    /// with `COPYRIGHT_MARKER` copied in at each of `offsets`.
    fn oversized_blob_with_marker(offsets: &[usize]) -> Vec<u8> {
        let mut blob = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
        for &offset in offsets {
            blob[offset..offset + COPYRIGHT_MARKER.len()].copy_from_slice(COPYRIGHT_MARKER);
        }
        blob
    }

    /// A JAR fixture yields no text and the `None` kind.
    #[test]
    fn test_extract_text_for_detection_skips_jar_archives() {
        let jar = Path::new(
            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
        );
        let contents = read_fixture(jar, "failed to read jar fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(jar, &contents);

        assert!(extracted.is_empty());
        assert_eq!(extracted_kind, ExtractedTextKind::None);
    }

    /// A PDF fixture is reported as `Pdf` and exposes its license text.
    #[test]
    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
        let pdf = Path::new(BSD_NEW_PDF);
        let contents = read_fixture(pdf, "failed to read pdf fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(pdf, &contents);

        assert_eq!(extracted_kind, ExtractedTextKind::Pdf);
        assert!(extracted.contains("Redistribution and use in source and binary forms"));
    }

    /// The extracted text carries the license heading but not the
    /// `DISCLAIMER OF WARRANTY` text (presumably located on a later page of the fixture).
    #[test]
    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
        let pdf = Path::new(SUN_LICENSE_PDF);
        let contents = read_fixture(pdf, "failed to read pdf fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(pdf, &contents);

        assert_eq!(extracted_kind, ExtractedTextKind::Pdf);
        assert!(extracted.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
        assert!(!extracted.contains("DISCLAIMER OF WARRANTY"));
    }

    /// After heading normalization, the license heading appears exactly once.
    #[test]
    fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
        let pdf = Path::new(SUN_LICENSE_PDF);
        let contents = read_fixture(pdf, "failed to read pdf fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(pdf, &contents);

        assert_eq!(extracted_kind, ExtractedTextKind::Pdf);

        let normalized_text = normalize_pdf_heading_comparison_text(&extracted);
        let normalized_heading =
            normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
        let occurrences = normalized_text.matches(&normalized_heading).count();
        assert_eq!(occurrences, 1);
    }

    /// PDF bytes are still extracted as `Pdf` when the path has a `.bin` extension.
    #[test]
    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
        let contents = read_fixture(Path::new(BSD_NEW_PDF), "failed to read pdf fixture");

        let (extracted, extracted_kind) =
            extract_text_for_detection(Path::new("renamed.bin"), &contents);

        assert_eq!(extracted_kind, ExtractedTextKind::Pdf);
        assert!(extracted.contains("Redistribution and use in source and binary forms"));
    }

    /// A malformed PDF produces no text and surfaces a scan error.
    #[test]
    fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
        let broken_pdf = b"%PDF-1.7\nthis is not a valid pdf object graph\n";

        let (extracted, extracted_kind, diagnostic) =
            extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), broken_pdf);

        assert!(extracted.is_empty());
        assert_eq!(extracted_kind, ExtractedTextKind::None);
        let message = diagnostic.expect("terminal pdf failure should be surfaced");
        assert!(message.contains("PDF text extraction failed after"));
    }

    /// An all-zero buffer above the skip threshold yields nothing.
    #[test]
    fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
        let blob = oversized_blob_with_marker(&[]);

        let (extracted, extracted_kind) = extract_text_for_detection(Path::new("model.bin"), &blob);

        assert!(extracted.is_empty());
        assert_eq!(extracted_kind, ExtractedTextKind::None);
    }

    /// A large buffer with the marker at two separate offsets is still extracted.
    #[test]
    fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
        let blob = oversized_blob_with_marker(&[0, LARGE_OPAQUE_BINARY_SKIP_BYTES / 2]);

        let (extracted, extracted_kind) =
            extract_text_for_detection(Path::new("weights.bin"), &blob);

        assert_ne!(extracted_kind, ExtractedTextKind::None);
        assert!(extracted.contains("Copyright 2026 Example Project"));
    }

    /// The PE fixture's extracted text includes both license text and a `LegalCopyright:` entry.
    #[test]
    fn test_extract_text_for_detection_uses_windows_executable_metadata() {
        let dll = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
        let contents = read_fixture(dll, "read PE fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(dll, &contents);

        assert_eq!(extracted_kind, ExtractedTextKind::BinaryStrings);
        assert!(extracted.contains("License: This program is free software"));
        assert!(extracted.contains("LegalCopyright:"));
    }

    /// A large buffer with only one marker run at the start is skipped.
    #[test]
    fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
        let blob = oversized_blob_with_marker(&[0]);

        let (extracted, extracted_kind) =
            extract_text_for_detection(Path::new("opaque.bin"), &blob);

        assert!(extracted.is_empty());
        assert_eq!(extracted_kind, ExtractedTextKind::None);
    }

    /// Encrypted-PDF and invalid-xref failures are non-actionable; other failures are not.
    #[test]
    fn test_non_actionable_pdf_failures_are_suppressed() {
        let encrypted = [
            "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
            "open full-document: PDF is encrypted and requires a password".to_string(),
        ];
        let invalid_xref = [
            "from-bytes first-page: Invalid cross-reference table".to_string(),
            "open full-document: Invalid cross-reference table".to_string(),
        ];
        let unrelated = ["from-bytes first-page: some other parser failure".to_string()];

        assert!(is_non_actionable_pdf_failure(&encrypted));
        assert!(is_non_actionable_pdf_failure(&invalid_xref));
        assert!(!is_non_actionable_pdf_failure(&unrelated));
    }

    /// Zip-signature payloads named `.whl` and `.crate` yield nothing.
    #[test]
    fn test_extract_text_for_detection_skips_zip_like_archives() {
        let zip_payload = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";

        for archive_name in ["demo.whl", "demo.crate"] {
            let (extracted, extracted_kind) =
                extract_text_for_detection(Path::new(archive_name), zip_payload);

            assert!(extracted.is_empty());
            assert_eq!(extracted_kind, ExtractedTextKind::None);
        }
    }

    /// A `.lib` fixture still exposes its embedded copyright string.
    #[test]
    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
        let lib =
            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
        let contents = read_fixture(lib, "failed to read lib fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(lib, &contents);

        assert_ne!(extracted_kind, ExtractedTextKind::None);
        assert!(extracted.contains("Copyright nexB and others (c) 2012"));
    }

    /// A TTF fixture is reported as `FontMetadata` and mentions its license.
    #[test]
    fn test_extract_text_for_detection_reads_font_metadata() {
        let font = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
        let contents = read_fixture(font, "failed to read font fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(font, &contents);

        assert_eq!(extracted_kind, ExtractedTextKind::FontMetadata);
        assert!(extracted.contains("License Description:"), "{extracted}");
        assert!(
            extracted.contains("Open Font License") || extracted.contains("OFL"),
            "{extracted}"
        );
    }

    /// 525,000 repetitions of `abcd\0` must yield more than 2,000,000 bytes of output
    /// that still ends with the final run.
    #[test]
    fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
        let input = b"abcd\0".repeat(525_000);

        let strings = extract_printable_strings(&input);

        assert!(
            strings.len() > 2_000_000,
            "unexpected truncation at {}",
            strings.len()
        );
        assert!(strings.ends_with("abcd"));
    }

    /// An SVG fixture is decoded as text and keeps its license URL.
    #[test]
    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
        let svg = Path::new(
            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
        );
        let contents = read_fixture(svg, "failed to read svg fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(svg, &contents);

        assert_eq!(extracted_kind, ExtractedTextKind::Decoded);
        assert!(extracted.contains("creativecommons.org/licenses/publicdomain"));
    }

    /// An RTF fixture is decoded as text containing the LGPL wording.
    #[test]
    fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
        let rtf = Path::new(
            "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
        );
        let contents = read_fixture(rtf, "failed to read rtf fixture");

        let (extracted, extracted_kind) = extract_text_for_detection(rtf, &contents);

        assert_eq!(extracted_kind, ExtractedTextKind::Decoded);
        assert!(extracted.contains("GNU Lesser General Public"));
        assert!(extracted.contains("version"));
        assert!(extracted.contains("2.1 of the License"));
    }

    /// Textual `.ts` content guessed as `video/mp2t` is normalized to `text/plain`.
    #[test]
    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
        let normalized = normalize_mime_type(
            Path::new("main.ts"),
            b"export const answer = 42;\n",
            Some("TypeScript"),
            "video/mp2t",
        );

        assert_eq!(normalized, "text/plain");
    }

    /// Textual `.js` content guessed as octet-stream is normalized to `text/plain`.
    #[test]
    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
        let normalized = normalize_mime_type(
            Path::new("main.js"),
            b"console.log('hello');\n",
            Some("JavaScript"),
            "application/octet-stream",
        );

        assert_eq!(normalized, "text/plain");
    }

    /// Binary `.ts` content keeps the `video/mp2t` guess.
    #[test]
    fn test_normalize_mime_type_preserves_binary_video_guess() {
        let normalized = normalize_mime_type(
            Path::new("main.ts"),
            &[0, 159, 146, 150, 0, 1, 2, 3],
            Some("TypeScript"),
            "video/mp2t",
        );

        assert_eq!(normalized, "video/mp2t");
    }

    /// A four-byte binary payload keeps the octet-stream guess.
    #[test]
    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
        let normalized = normalize_mime_type(
            Path::new("main.ts"),
            &[0, 159, 146, 150],
            Some("TypeScript"),
            "application/octet-stream",
        );

        assert_eq!(normalized, "application/octet-stream");
    }

    /// An empty `.txt` file is text, not binary, not source, and has no language.
    #[test]
    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
        let info = classify_file_info(Path::new("test.txt"), b"");

        assert_eq!(info.mime_type, "inode/x-empty");
        assert_eq!(info.file_type, "empty");
        assert!(!info.is_binary);
        assert!(info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.programming_language, None);
    }

    /// JSON is text data with no programming language and is not source.
    #[test]
    fn test_classify_file_info_keeps_json_out_of_programming_language() {
        let info = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);

        assert_eq!(info.mime_type, "application/json");
        assert_eq!(info.file_type, "JSON text data");
        assert!(info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.programming_language, None);
    }

    /// A `Dockerfile` is source in the `Dockerfile` language but not a script.
    #[test]
    fn test_classify_file_info_treats_dockerfile_as_source() {
        let info = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");

        assert_eq!(info.programming_language.as_deref(), Some("Dockerfile"));
        assert!(info.is_source);
        assert!(!info.is_script);
        assert_eq!(info.file_type, "UTF-8 Unicode text");
    }

    /// A `Makefile` is plain text: no language, not source, not a script.
    #[test]
    fn test_classify_file_info_treats_makefile_as_text_not_source() {
        let info = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");

        assert_eq!(info.programming_language, None);
        assert!(info.is_text);
        assert!(!info.is_source);
        assert!(!info.is_script);
        assert_eq!(info.file_type, "UTF-8 Unicode text");
    }

    /// `.egg` and `.nupkg` files with a zip signature are classified as zip archives.
    #[test]
    fn test_classify_file_info_marks_supported_package_archives() {
        let zip_header = b"PK\x03\x04\x14\x00\x00\x00";

        for package_name in ["demo.egg", "demo.nupkg"] {
            let info = classify_file_info(Path::new(package_name), zip_header);

            assert!(info.is_archive);
            assert_eq!(info.mime_type, "application/zip");
            assert_eq!(info.file_type, "Zip archive data");
        }
    }

    /// A PNG header is binary media, and neither text, archive, nor source.
    #[test]
    fn test_classify_file_info_marks_png_as_binary_media() {
        let png_header = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";

        let info = classify_file_info(Path::new("logo.png"), png_header);

        assert_eq!(info.mime_type, "image/png");
        assert_eq!(info.file_type, "PNG image data");
        assert!(info.is_binary);
        assert!(!info.is_text);
        assert!(info.is_media);
        assert!(!info.is_archive);
        assert!(!info.is_source);
    }

    /// A PDF header is a binary document, and neither text, archive, nor media.
    #[test]
    fn test_classify_file_info_marks_pdf_as_binary_document() {
        let pdf_header = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";

        let info = classify_file_info(Path::new("report.pdf"), pdf_header);

        assert_eq!(info.mime_type, "application/pdf");
        assert_eq!(info.file_type, "PDF document");
        assert!(info.is_binary);
        assert!(!info.is_text);
        assert!(!info.is_archive);
        assert!(!info.is_media);
    }

    /// An opaque byte blob is binary with no language and is not source.
    #[test]
    fn test_classify_file_info_marks_binary_blobs_as_binary() {
        let info = classify_file_info(Path::new("blob.bin"), &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5]);

        assert!(info.is_binary);
        assert!(!info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.programming_language, None);
    }

    /// YAML is text data with no programming language and is not source.
    #[test]
    fn test_classify_file_info_treats_yaml_as_text_not_source() {
        let info = classify_file_info(Path::new("config.yaml"), b"key: value\n");

        assert_eq!(info.programming_language, None);
        assert!(info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.file_type, "YAML text data");
    }

    /// Gradle and Nix manifests are source; `.gitmodules` is Git configuration text.
    #[test]
    fn test_classify_file_info_classifies_common_build_manifests() {
        let gradle_info = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
        let nix_info = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
        let gitmodules_info = classify_file_info(
            Path::new(".gitmodules"),
            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
        );

        assert_eq!(gradle_info.programming_language.as_deref(), Some("Groovy"));
        assert!(gradle_info.is_source);
        assert_eq!(gradle_info.mime_type, "text/plain");

        assert_eq!(nix_info.programming_language.as_deref(), Some("Nix"));
        assert!(nix_info.is_source);
        assert_eq!(nix_info.mime_type, "text/plain");

        assert_eq!(gitmodules_info.programming_language, None);
        assert!(gitmodules_info.is_text);
        assert!(!gitmodules_info.is_source);
        assert_eq!(gitmodules_info.file_type, "Git configuration text");
    }

    /// An extensionless file with a `node` shebang is a JavaScript script.
    #[test]
    fn test_classify_file_info_labels_javascript_shebang_scripts() {
        let info = classify_file_info(
            Path::new("bin/run"),
            b"#!/usr/bin/env node\nconsole.log('hello');\n",
        );

        assert_eq!(info.programming_language.as_deref(), Some("JavaScript"));
        assert!(info.is_script);
        assert_eq!(
            info.file_type,
            "javascript script, UTF-8 Unicode text executable"
        );
    }

    /// A Python script containing a non-UTF-8 byte gets the label without "UTF-8 Unicode".
    #[test]
    fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
        let info = classify_file_info(
            Path::new("script.py"),
            b"# coding: latin-1\nprint(\"caf\xe9\")\n",
        );

        assert_eq!(info.programming_language.as_deref(), Some("Python"));
        assert!(info.is_script);
        assert_eq!(info.file_type, "python script, text executable");
    }

    /// A `.tga` file with textual content is media and text, not binary.
    #[test]
    fn test_classify_file_info_treats_textual_tga_as_media() {
        let info = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");

        assert!(info.is_media);
        assert!(info.is_text);
        assert!(!info.is_binary);
    }

    /// High-bit bytes under a `.ts` name are binary, with no language and not source.
    #[test]
    fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
        let info = classify_file_info(Path::new("main.ts"), &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85]);

        assert!(info.is_binary);
        assert!(!info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.programming_language, None);
    }

    /// A tiny GIF yields no text and the `None` kind.
    #[test]
    fn test_extract_text_for_detection_skips_unsupported_image_formats() {
        let gif = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";

        let (extracted, extracted_kind) = extract_text_for_detection(Path::new("tiny.gif"), gif);

        assert!(extracted.is_empty());
        assert_eq!(extracted_kind, ExtractedTextKind::None);
    }

    /// Checks language, `is_source` and `is_script` together for a set of representative files.
    #[test]
    fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
        /// One row of the matrix: an input file and the classification it must receive.
        struct Expectation {
            path: &'static str,
            contents: &'static [u8],
            language: Option<&'static str>,
            is_source: bool,
            is_script: bool,
        }

        let matrix = [
            Expectation {
                path: "bin/run",
                contents: b"#!/usr/bin/env node\nconsole.log('hello');\n",
                language: Some("JavaScript"),
                is_source: true,
                is_script: true,
            },
            Expectation {
                path: "Dockerfile",
                contents: b"FROM scratch\n",
                language: Some("Dockerfile"),
                is_source: true,
                is_script: false,
            },
            Expectation {
                path: "package.json",
                contents: br#"{"name":"demo"}"#,
                language: None,
                is_source: false,
                is_script: false,
            },
            Expectation {
                path: "config.yaml",
                contents: b"key: value\n",
                language: None,
                is_source: false,
                is_script: false,
            },
            Expectation {
                path: "Makefile",
                contents: b"all:\n\techo hi\n",
                language: None,
                is_source: false,
                is_script: false,
            },
        ];

        for expected in &matrix {
            let path = Path::new(expected.path);
            let info = classify_file_info(path, expected.contents);

            assert_eq!(
                info.programming_language.as_deref(),
                expected.language,
                "unexpected language for {}",
                path.display()
            );
            assert_eq!(
                info.is_source,
                expected.is_source,
                "unexpected is_source for {}",
                path.display()
            );
            assert_eq!(
                info.is_script,
                expected.is_script,
                "unexpected is_script for {}",
                path.display()
            );
        }
    }
}
provenant/utils/file.rs

provenant/utils/
file.rs