Skip to main content

provenant/utils/
file.rs

1use std::collections::BTreeSet;
2use std::fs;
3use std::io::{BufReader, Cursor, Read};
4use std::panic::{AssertUnwindSafe, catch_unwind};
5use std::path::Path;
6
7use chrono::{TimeZone, Utc};
8use content_inspector::{ContentType, inspect};
9use file_format::{FileFormat, Kind as FileFormatKind};
10use flate2::read::ZlibDecoder;
11use glob::Pattern;
12use image::{ImageDecoder, ImageFormat, ImageReader};
13use mime_guess::from_path;
14use quick_xml::events::Event;
15use quick_xml::reader::Reader as XmlReader;
16
17use crate::utils::language::detect_language;
18
/// Identifies which extraction strategy produced the text returned by
/// `extract_text_for_detection`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ExtractedTextKind {
    /// No usable text was produced.
    None,
    /// The raw bytes were decoded directly into a string.
    Decoded,
    /// The text was extracted from a PDF document.
    Pdf,
    /// The text consists of printable strings harvested from binary content.
    BinaryStrings,
    /// The text was assembled from image metadata.
    ImageMetadata,
}
27
/// Result of `classify_file_info`: the detected type information and the
/// boolean classification flags for a single file.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct FileInfoClassification {
    /// MIME type string resolved for the file.
    pub mime_type: String,
    /// Human-readable description of the file type.
    pub file_type: String,
    /// Detected programming language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content is considered binary.
    pub is_binary: bool,
    /// Whether the content is considered text (the negation of `is_binary`).
    pub is_text: bool,
    /// Whether the file is an archive, compressed file or package.
    pub is_archive: bool,
    /// Whether the file is image, audio or video content.
    pub is_media: bool,
    /// Whether the file is source code.
    pub is_source: bool,
    /// Whether the file is a script.
    pub is_script: bool,
}
40
// NOTE(review): the two image-metadata limits are not referenced in the part
// of the file visible here; presumably they cap the number of collected
// metadata values and the size of the rendered text — confirm at the use site.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;

/// Lowercase extensions of files that are never reported as source code.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];

/// Lowercase extensions that force a file to be classified as binary.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];

/// Lowercase extensions that mark a non-text file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
53
54/// Get the last modified date of a file as a `YYYY-MM-DD` string.
55pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
56    metadata.modified().ok().map(|time: std::time::SystemTime| {
57        let seconds_since_epoch = time
58            .duration_since(std::time::UNIX_EPOCH)
59            .unwrap()
60            .as_secs() as i64;
61
62        Utc.timestamp_opt(seconds_since_epoch, 0)
63            .single()
64            .unwrap_or_else(Utc::now)
65            .format("%Y-%m-%d")
66            .to_string()
67    })
68}
69
70/// Check if a path should be excluded based on a list of glob patterns.
71pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
72    let path_str = path.to_string_lossy();
73    let file_name = path
74        .file_name()
75        .map(|name| name.to_string_lossy())
76        .unwrap_or_default();
77
78    for pattern in exclude_patterns {
79        // Match against full path
80        if pattern.matches(&path_str) {
81            return true;
82        }
83
84        // Match against just the file/directory name
85        if pattern.matches(&file_name) {
86            return true;
87        }
88    }
89
90    false
91}
92
/// Decode a byte buffer to a String, trying UTF-8 first, then Latin-1.
///
/// Valid UTF-8 is returned unchanged. Otherwise the buffer is treated as
/// binary, and an empty string is returned, when the number of control bytes
/// (0x00-0x08 and 0x0E-0x1F) exceeds a tenth of the buffer length. Anything
/// else is decoded as Latin-1 (ISO-8859-1), which maps every byte 0x00-0xFF
/// directly to U+0000-U+00FF and therefore never fails. This matches Python
/// ScanCode's use of `UnicodeDammit` with Latin-1 as the fallback.
pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
    if let Ok(text) = std::str::from_utf8(bytes) {
        return text.to_owned();
    }

    let control_bytes = bytes
        .iter()
        .filter(|&&byte| matches!(byte, 0x00..=0x08 | 0x0E..=0x1F))
        .count();

    if control_bytes > bytes.len() / 10 {
        String::new()
    } else {
        bytes.iter().map(|&byte| char::from(byte)).collect()
    }
}
115
116pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
117    let ext = path
118        .extension()
119        .and_then(|e| e.to_str())
120        .map(|s| s.to_ascii_lowercase());
121    let detected_format = detect_file_format(bytes);
122
123    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
124        let text = extract_pdf_text(path, bytes);
125        return if text.is_empty() {
126            (String::new(), ExtractedTextKind::None)
127        } else {
128            (text, ExtractedTextKind::Pdf)
129        };
130    }
131
132    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
133        let text = extract_image_metadata_text(bytes, format);
134        return if text.is_empty() {
135            if is_supported_image_container(bytes, format) {
136                (String::new(), ExtractedTextKind::None)
137            } else {
138                let decoded = decode_bytes_to_string(bytes);
139                if decoded.is_empty() {
140                    (String::new(), ExtractedTextKind::None)
141                } else {
142                    (decoded, ExtractedTextKind::Decoded)
143                }
144            }
145        } else {
146            (text, ExtractedTextKind::ImageMetadata)
147        };
148    }
149
150    if should_skip_binary_string_extraction(path, bytes, detected_format) {
151        return (String::new(), ExtractedTextKind::None);
152    }
153
154    let decoded = decode_bytes_to_string(bytes);
155    if !decoded.is_empty() {
156        return (decoded, ExtractedTextKind::Decoded);
157    }
158
159    let text = extract_printable_strings(bytes);
160    if text.is_empty() {
161        (String::new(), ExtractedTextKind::None)
162    } else {
163        (text, ExtractedTextKind::BinaryStrings)
164    }
165}
166
167pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
168    let detected_format = detect_file_format(bytes);
169    let detected_language = detect_language(path, bytes);
170    let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
171    let is_text = !is_binary;
172    let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
173    let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
174    let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
175    let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
176    let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
177    let programming_language = is_source.then(|| detected_language.clone()).flatten();
178    let file_type = detect_file_type(
179        path,
180        bytes,
181        detected_format,
182        &mime_type,
183        programming_language.as_deref(),
184        is_binary,
185        is_text,
186        is_archive,
187        is_media,
188        is_script,
189    );
190
191    FileInfoClassification {
192        mime_type,
193        file_type,
194        programming_language,
195        is_binary,
196        is_text,
197        is_archive,
198        is_media,
199        is_source,
200        is_script,
201    }
202}
203
204fn detect_file_format(bytes: &[u8]) -> FileFormat {
205    FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
206}
207
208pub fn detect_mime_type(
209    path: &Path,
210    bytes: &[u8],
211    detected_format: FileFormat,
212    programming_language: Option<&str>,
213) -> String {
214    if bytes.is_empty() {
215        return "inode/x-empty".to_string();
216    }
217
218    if is_zip_archive(bytes) {
219        return detect_zip_like_mime(path);
220    }
221
222    if looks_like_deb(bytes, path) {
223        return "application/vnd.debian.binary-package".to_string();
224    }
225
226    if looks_like_rpm(bytes, path) {
227        return "application/x-rpm".to_string();
228    }
229
230    let guessed_mime = from_path(path)
231        .first_or_octet_stream()
232        .essence_str()
233        .to_string();
234
235    let mime_type = match detected_format {
236        FileFormat::Empty => "inode/x-empty".to_string(),
237        FileFormat::PlainText => {
238            if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
239                "text/plain".to_string()
240            } else {
241                guessed_mime.clone()
242            }
243        }
244        _ => {
245            let detected_mime = detected_format.media_type();
246            if detected_mime == "application/octet-stream"
247                && guessed_mime != "application/octet-stream"
248            {
249                guessed_mime.clone()
250            } else {
251                detected_mime.to_string()
252            }
253        }
254    };
255
256    normalize_mime_type(path, bytes, programming_language, &mime_type)
257}
258
259fn is_utf8_text(content_type: ContentType) -> bool {
260    matches!(content_type, ContentType::UTF_8 | ContentType::UTF_8_BOM)
261}
262
263fn normalize_mime_type(
264    path: &Path,
265    bytes: &[u8],
266    programming_language: Option<&str>,
267    mime_type: &str,
268) -> String {
269    if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
270        return "text/plain".to_string();
271    }
272
273    mime_type.to_string()
274}
275
276fn should_prefer_text_mime(
277    path: &Path,
278    bytes: &[u8],
279    programming_language: Option<&str>,
280    mime_type: &str,
281) -> bool {
282    (is_utf8_text(inspect(bytes)) || !decode_bytes_to_string(bytes).is_empty())
283        && is_textual_source_candidate(path, programming_language)
284        && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
285}
286
287fn detect_is_binary(
288    path: &Path,
289    bytes: &[u8],
290    detected_format: FileFormat,
291    programming_language: Option<&str>,
292) -> bool {
293    if matches!(detected_format, FileFormat::Empty | FileFormat::PlainText) {
294        return false;
295    }
296
297    lower_extension(path)
298        .as_deref()
299        .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
300        || (!bytes.is_empty()
301            && matches!(inspect(bytes), ContentType::BINARY)
302            && !should_treat_binary_bytes_as_text(path, bytes, programming_language))
303}
304
305fn should_treat_binary_bytes_as_text(
306    path: &Path,
307    bytes: &[u8],
308    programming_language: Option<&str>,
309) -> bool {
310    !decode_bytes_to_string(bytes).is_empty()
311        && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
312}
313
314fn detect_is_archive(
315    path: &Path,
316    bytes: &[u8],
317    mime_type: &str,
318    is_text: bool,
319    detected_format: FileFormat,
320) -> bool {
321    if is_text {
322        return false;
323    }
324
325    lower_extension(path)
326        .as_deref()
327        .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
328        || matches!(
329            detected_format.kind(),
330            FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
331        )
332        || is_zip_archive(bytes)
333        || looks_like_gzip(bytes)
334        || looks_like_bzip2(bytes)
335        || looks_like_xz(bytes)
336        || looks_like_deb(bytes, path)
337        || looks_like_rpm(bytes, path)
338        || looks_like_squashfs(bytes, path)
339        || mime_type.contains("zip")
340        || mime_type.contains("compressed")
341        || mime_type.contains("tar")
342        || mime_type.contains("x-rpm")
343        || mime_type.contains("debian")
344}
345
346fn detect_is_media(
347    path: &Path,
348    bytes: &[u8],
349    mime_type: &str,
350    detected_format: FileFormat,
351) -> bool {
352    media_mime_from_content(bytes).is_some()
353        || matches!(
354            detected_format.kind(),
355            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
356        )
357        || mime_type.starts_with("image/")
358        || mime_type.starts_with("audio/")
359        || mime_type.starts_with("video/")
360        || (mime_type == "application/octet-stream"
361            && lower_extension(path).as_deref() == Some("tga")
362            && !matches!(inspect(bytes), ContentType::BINARY))
363}
364
365fn detect_is_script(
366    path: &Path,
367    bytes: &[u8],
368    programming_language: Option<&str>,
369    is_text: bool,
370) -> bool {
371    if !is_text || is_makefile(path) {
372        return false;
373    }
374
375    bytes.starts_with(b"#!")
376        || lower_extension(path).as_deref().is_some_and(|ext| {
377            matches!(
378                ext,
379                "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
380            )
381        })
382        || matches!(
383            programming_language,
384            Some("Shell" | "Python" | "Ruby" | "Perl" | "PHP" | "PowerShell" | "Awk")
385        )
386}
387
388fn detect_is_source(
389    path: &Path,
390    programming_language: Option<&str>,
391    is_text: bool,
392    is_script: bool,
393) -> bool {
394    if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
395        return false;
396    }
397
398    if is_c_like_source(path) || is_java_like_source(path) {
399        return true;
400    }
401
402    programming_language.is_some() || is_script
403}
404
405#[allow(clippy::too_many_arguments)]
406fn detect_file_type(
407    path: &Path,
408    bytes: &[u8],
409    detected_format: FileFormat,
410    mime_type: &str,
411    programming_language: Option<&str>,
412    is_binary: bool,
413    is_text: bool,
414    is_archive: bool,
415    is_media: bool,
416    is_script: bool,
417) -> String {
418    if bytes.is_empty() {
419        return "empty".to_string();
420    }
421
422    if looks_like_pdf(bytes) {
423        return "PDF document".to_string();
424    }
425
426    if let Some(file_type) = media_file_type_from_content(bytes) {
427        return file_type.to_string();
428    }
429
430    if is_archive {
431        return archive_file_type(path, bytes, detected_format);
432    }
433
434    if is_script {
435        return script_file_type(programming_language, bytes);
436    }
437
438    if is_text {
439        if lower_extension(path).as_deref() == Some("json") {
440            return "JSON text data".to_string();
441        }
442        if lower_extension(path).as_deref() == Some("xml") {
443            return "XML text data".to_string();
444        }
445        if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
446            return "YAML text data".to_string();
447        }
448        if lower_extension(path).as_deref() == Some("toml") {
449            return "TOML text data".to_string();
450        }
451        if matches!(
452            lower_extension(path).as_deref(),
453            Some("ini" | "cfg" | "conf")
454        ) {
455            return "INI text data".to_string();
456        }
457        if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
458            return "Git configuration text".to_string();
459        }
460        if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
461            return text_file_type(bytes);
462        }
463        if programming_language.is_some() && !is_media {
464            return text_file_type(bytes);
465        }
466        return text_file_type(bytes);
467    }
468
469    if let Some(file_type) = format_based_file_type(detected_format) {
470        return file_type;
471    }
472
473    if is_binary && mime_type == "application/octet-stream" {
474        return "data".to_string();
475    }
476
477    mime_type.to_string()
478}
479
480fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
481    if matches!(programming_language, Some(language) if is_source_like_language(language)) {
482        return true;
483    }
484
485    if matches!(
486        lower_file_name(path).as_str(),
487        "dockerfile"
488            | "containerfile"
489            | "containerfile.core"
490            | "apkbuild"
491            | "podfile"
492            | "meson.build"
493            | "build"
494            | "workspace"
495            | "buck"
496            | "default.nix"
497            | "flake.nix"
498            | "shell.nix"
499    ) {
500        return true;
501    }
502
503    path.extension()
504        .and_then(|ext| ext.to_str())
505        .is_some_and(|ext| {
506            matches!(
507                ext.to_ascii_lowercase().as_str(),
508                "rs" | "py"
509                    | "js"
510                    | "mjs"
511                    | "cjs"
512                    | "jsx"
513                    | "ts"
514                    | "mts"
515                    | "cts"
516                    | "tsx"
517                    | "c"
518                    | "cpp"
519                    | "cc"
520                    | "cxx"
521                    | "h"
522                    | "hpp"
523                    | "m"
524                    | "mm"
525                    | "s"
526                    | "asm"
527                    | "java"
528                    | "go"
529                    | "rb"
530                    | "php"
531                    | "pl"
532                    | "swift"
533                    | "sh"
534                    | "bash"
535                    | "zsh"
536                    | "fish"
537                    | "ksh"
538                    | "ps1"
539                    | "psm1"
540                    | "psd1"
541                    | "awk"
542                    | "kt"
543                    | "kts"
544                    | "dart"
545                    | "scala"
546                    | "groovy"
547                    | "gradle"
548                    | "gvy"
549                    | "gy"
550                    | "gsh"
551                    | "cs"
552                    | "fs"
553                    | "fsx"
554                    | "r"
555                    | "lua"
556                    | "jl"
557                    | "ex"
558                    | "exs"
559                    | "clj"
560                    | "cljs"
561                    | "cljc"
562                    | "hs"
563                    | "erl"
564                    | "nix"
565                    | "zig"
566                    | "bzl"
567                    | "bazel"
568                    | "star"
569                    | "sky"
570                    | "ml"
571                    | "mli"
572                    | "tex"
573            )
574        })
575}
576
/// Whether a detected language name (matched case-sensitively) denotes
/// source code.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
622
/// The path's extension as UTF-8 text, or `None` when the path has no
/// extension or the extension is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    let raw = path.extension()?;
    raw.to_str()
}
626
627fn lower_extension(path: &Path) -> Option<String> {
628    extension(path).map(|ext| ext.to_ascii_lowercase())
629}
630
/// The final path component, ASCII-lowercased; empty when the path has no
/// final component or it is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
637
638fn is_plain_text(path: &Path) -> bool {
639    lower_extension(path)
640        .as_deref()
641        .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
642}
643
644fn is_makefile(path: &Path) -> bool {
645    matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
646}
647
/// Whether the path ends in `.js.map` or `.css.map`, ignoring ASCII case.
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
652
653fn is_c_like_source(path: &Path) -> bool {
654    lower_extension(path).as_deref().is_some_and(|ext| {
655        matches!(
656            ext,
657            "c" | "cc"
658                | "cp"
659                | "cpp"
660                | "cxx"
661                | "c++"
662                | "h"
663                | "hh"
664                | "hpp"
665                | "hxx"
666                | "h++"
667                | "i"
668                | "ii"
669                | "m"
670                | "s"
671                | "asm"
672        )
673    })
674}
675
676fn is_java_like_source(path: &Path) -> bool {
677    lower_extension(path)
678        .as_deref()
679        .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
680}
681
682fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
683    match detected_format {
684        FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
685        format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
686        format => Some(match format.kind() {
687            FileFormatKind::Image => short_name_or_name(&format, "image data"),
688            FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
689            FileFormatKind::Video => short_name_or_name(&format, "video data"),
690            _ => format.name().to_string(),
691        }),
692    }
693}
694
695fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
696    format
697        .short_name()
698        .map(|short_name| format!("{short_name} {suffix}"))
699        .unwrap_or_else(|| format!("{} {suffix}", format.name()))
700}
701
702fn detect_zip_like_mime(path: &Path) -> String {
703    match extension(path).map(|ext| ext.to_ascii_lowercase()) {
704        Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
705        Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
706            "application/java-archive".to_string()
707        }
708        _ => "application/zip".to_string(),
709    }
710}
711
/// Identify PNG, JPEG, TIFF or WebP content by its leading magic bytes and
/// return the matching MIME type, or `None` for anything else.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("image/png"),
        [0xff, 0xd8, 0xff, ..] => Some("image/jpeg"),
        // Little-endian ("II") and big-endian ("MM") TIFF headers.
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("image/tiff"),
        // RIFF container whose form type at offset 8 is "WEBP".
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => Some("image/webp"),
        _ => None,
    }
}
725
/// Identify PNG, JPEG, TIFF or WebP content by its leading magic bytes and
/// return the matching file type description, or `None` for anything else.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("PNG image data"),
        [0xff, 0xd8, 0xff, ..] => Some("JPEG image data"),
        // Little-endian ("II") and big-endian ("MM") TIFF headers.
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("TIFF image data"),
        // RIFF container whose form type at offset 8 is "WEBP".
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => {
            Some("WebP image data")
        }
        _ => None,
    }
}
739
/// Whether the content starts with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
743
/// Whether the content starts with the gzip magic bytes `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
747
/// Whether the content starts with the bzip2 header `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
751
/// Whether the content starts with the XZ magic bytes `fd 37 7a 58 5a 00`.
fn looks_like_xz(bytes: &[u8]) -> bool {
    matches!(bytes, [0xfd, b'7', b'z', b'X', b'Z', 0x00, ..])
}
755
756fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
757    lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
758}
759
760fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
761    lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
762}
763
764fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
765    lower_extension(path)
766        .as_deref()
767        .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
768        && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
769            || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
770}
771
772fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
773    if looks_like_deb(bytes, path) {
774        "debian binary package (format 2.0)".to_string()
775    } else if looks_like_rpm(bytes, path) {
776        "RPM package".to_string()
777    } else if looks_like_squashfs(bytes, path) {
778        "Squashfs filesystem".to_string()
779    } else if looks_like_gzip(bytes) {
780        "gzip compressed data".to_string()
781    } else if looks_like_bzip2(bytes) {
782        "bzip2 compressed data".to_string()
783    } else if looks_like_xz(bytes) {
784        "XZ compressed data".to_string()
785    } else if is_zip_archive(bytes) {
786        "Zip archive data".to_string()
787    } else if lower_extension(path).as_deref() == Some("gem") {
788        "POSIX tar archive".to_string()
789    } else if let Some(file_type) = format_based_file_type(detected_format) {
790        file_type
791    } else {
792        "archive data".to_string()
793    }
794}
795
796fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
797    let suffix = text_executable_label(bytes);
798
799    match programming_language {
800        Some("Python") => format!("python script, {suffix}"),
801        Some("Ruby") => format!("ruby script, {suffix}"),
802        Some("Perl") => format!("perl script, {suffix}"),
803        Some("PHP") => format!("php script, {suffix}"),
804        Some("Shell") => format!("shell script, {suffix}"),
805        Some("JavaScript") => format!("javascript script, {suffix}"),
806        Some("TypeScript") => format!("typescript script, {suffix}"),
807        Some("PowerShell") => format!("powershell script, {suffix}"),
808        Some("Awk") => format!("awk script, {suffix}"),
809        _ => format!("script, {suffix}"),
810    }
811}
812
813fn text_file_type(bytes: &[u8]) -> String {
814    text_label(bytes).to_string()
815}
816
/// Describe text content by whether it is valid UTF-8 and whether it
/// contains at least one `\n` line terminator.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
830
/// Describe executable text content by whether it is valid UTF-8 and whether
/// it contains at least one `\n` line terminator.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
844
845fn supported_image_metadata_format(
846    ext: Option<&str>,
847    detected_format: FileFormat,
848) -> Option<ImageFormat> {
849    match ext {
850        Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
851        Some("png") => Some(ImageFormat::Png),
852        Some("tif" | "tiff") => Some(ImageFormat::Tiff),
853        Some("webp") => Some(ImageFormat::WebP),
854        _ => match detected_format.media_type() {
855            "image/jpeg" => Some(ImageFormat::Jpeg),
856            "image/png" => Some(ImageFormat::Png),
857            "image/tiff" => Some(ImageFormat::Tiff),
858            "image/webp" => Some(ImageFormat::WebP),
859            _ => None,
860        },
861    }
862}
863
864fn should_skip_binary_string_extraction(
865    path: &Path,
866    bytes: &[u8],
867    detected_format: FileFormat,
868) -> bool {
869    matches!(lower_extension(path).as_deref(), Some("pdf"))
870        || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
871            .is_some()
872        || media_mime_from_content(bytes).is_some()
873        || is_zip_archive(bytes)
874        || looks_like_gzip(bytes)
875        || looks_like_bzip2(bytes)
876        || looks_like_xz(bytes)
877        || looks_like_deb(bytes, path)
878        || looks_like_rpm(bytes, path)
879        || looks_like_squashfs(bytes, path)
880}
881
882fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
883    match format {
884        ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
885        ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
886        ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
887        ImageFormat::WebP => {
888            bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
889        }
890        _ => false,
891    }
892}
893
894fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
895    let mut values = Vec::new();
896    values.extend(extract_exif_metadata_values(bytes));
897    values.extend(extract_xmp_metadata_values(bytes, format));
898    values_to_text(values)
899}
900
901fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
902    let mut cursor = BufReader::new(Cursor::new(bytes));
903    let exif = match exif::Reader::new().read_from_container(&mut cursor) {
904        Ok(exif) => exif,
905        Err(_) => return Vec::new(),
906    };
907
908    let mut values = Vec::new();
909    for field in exif.fields() {
910        let rendered = match field.tag {
911            exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
912                Some(field.display_value().with_unit(&exif).to_string())
913            }
914            exif::Tag::Artist => Some(format!(
915                "Author: {}",
916                field.display_value().with_unit(&exif)
917            )),
918            _ => None,
919        };
920
921        if let Some(rendered) = rendered {
922            values.push(rendered);
923        }
924    }
925
926    values
927}
928
929fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
930    let xmp = match extract_raw_xmp_packet(bytes, format) {
931        Some(xmp) => xmp,
932        None => return Vec::new(),
933    };
934
935    parse_xmp_values(&xmp)
936}
937
938fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
939    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
940    if let Ok(mut decoder) = reader.into_decoder()
941        && let Ok(Some(xmp)) = decoder.xmp_metadata()
942    {
943        return Some(xmp);
944    }
945
946    match format {
947        ImageFormat::Png => extract_png_xmp_packet(bytes),
948        _ => None,
949    }
950}
951
952fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
953    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
954
955    if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
956        return None;
957    }
958
959    let mut offset = PNG_SIGNATURE.len();
960    while offset + 12 <= bytes.len() {
961        let length = u32::from_be_bytes([
962            bytes[offset],
963            bytes[offset + 1],
964            bytes[offset + 2],
965            bytes[offset + 3],
966        ]) as usize;
967        let chunk_start = offset + 8;
968        let chunk_end = chunk_start + length;
969        if chunk_end + 4 > bytes.len() {
970            return None;
971        }
972
973        let chunk_type = &bytes[offset + 4..offset + 8];
974        if chunk_type == b"iTXt" {
975            let data = &bytes[chunk_start..chunk_end];
976            if let Some(xmp) = parse_png_itxt_xmp(data) {
977                return Some(xmp);
978            }
979        }
980
981        offset = chunk_end + 4;
982    }
983
984    None
985}
986
987fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
988    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
989
990    let keyword_end = data.iter().position(|&b| b == 0)?;
991    if &data[..keyword_end] != XMP_KEYWORD {
992        return None;
993    }
994
995    let mut cursor = keyword_end + 1;
996    let compression_flag = *data.get(cursor)?;
997    cursor += 1;
998    let compression_method = *data.get(cursor)?;
999    cursor += 1;
1000    if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1001        return None;
1002    }
1003
1004    let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1005    cursor = language_end + 1;
1006
1007    let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1008    cursor = translated_end + 1;
1009
1010    let text_bytes = &data[cursor..];
1011    if compression_flag == 1 {
1012        let mut decoder = ZlibDecoder::new(text_bytes);
1013        let mut decoded = Vec::new();
1014        decoder.read_to_end(&mut decoded).ok()?;
1015        Some(decoded)
1016    } else {
1017        Some(text_bytes.to_vec())
1018    }
1019}
1020
1021fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1022    let mut reader = XmlReader::from_reader(xmp);
1023    reader.config_mut().trim_text(true);
1024
1025    let mut buf = Vec::new();
1026    let mut stack: Vec<String> = Vec::new();
1027    let mut values = Vec::new();
1028
1029    loop {
1030        match reader.read_event_into(&mut buf) {
1031            Ok(Event::Start(e)) => {
1032                stack.push(local_xml_name(e.name().as_ref()));
1033            }
1034            Ok(Event::End(_)) => {
1035                stack.pop();
1036            }
1037            Ok(Event::Empty(_)) => {}
1038            Ok(Event::Text(text)) => {
1039                if let Some(field) = stack
1040                    .iter()
1041                    .rev()
1042                    .find_map(|name| allowed_xmp_field(name.as_str()))
1043                    && let Ok(decoded) = text.decode()
1044                {
1045                    let decoded = decoded.into_owned();
1046                    if !decoded.trim().is_empty() {
1047                        values.push(format_xmp_value(field, &decoded));
1048                    }
1049                }
1050            }
1051            Ok(Event::CData(text)) => {
1052                if let Some(field) = stack
1053                    .iter()
1054                    .rev()
1055                    .find_map(|name| allowed_xmp_field(name.as_str()))
1056                    && let Ok(decoded) = text.decode()
1057                {
1058                    let decoded = decoded.into_owned();
1059                    if !decoded.trim().is_empty() {
1060                        values.push(format_xmp_value(field, &decoded));
1061                    }
1062                }
1063            }
1064            Ok(Event::Eof) | Err(_) => break,
1065            _ => {}
1066        }
1067        buf.clear();
1068    }
1069
1070    values
1071}
1072
/// Returns the part of an XML qualified name after its last `:` (the whole
/// name when there is no prefix). Names that are not valid UTF-8 become an
/// empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or("");
    match qualified.rfind(':') {
        Some(colon) => qualified[colon + 1..].to_owned(),
        None => qualified.to_owned(),
    }
}
1077
/// Maps the local name of an XMP element to the canonical field key used by
/// this module, or `None` when the element is not on the allow-list. Matching
/// is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    const FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELDS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, canonical)| canonical)
}
1090
/// Renders an XMP value for output: `creator` values gain an `Author: `
/// prefix, every other field is passed through unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
1097
1098fn values_to_text(values: Vec<String>) -> String {
1099    let mut seen = BTreeSet::new();
1100    let mut lines = Vec::new();
1101    let mut total_bytes = 0usize;
1102
1103    for value in values {
1104        if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1105            break;
1106        }
1107
1108        let normalized = normalize_metadata_value(&value);
1109        if normalized.is_empty() || !seen.insert(normalized.clone()) {
1110            continue;
1111        }
1112
1113        let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1114        if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1115            break;
1116        }
1117
1118        total_bytes += added_bytes;
1119        lines.push(normalized);
1120    }
1121
1122    lines.join("\n")
1123}
1124
/// Normalizes a metadata value: NUL characters are removed (not replaced),
/// every run of whitespace collapses to a single space, and leading/trailing
/// whitespace is dropped.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");
    let mut normalized = String::with_capacity(without_nuls.len());

    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }

    normalized
}
1136
1137fn extract_pdf_text(path: &Path, bytes: &[u8]) -> String {
1138    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1139        return String::new();
1140    }
1141
1142    let extracted = catch_unwind(AssertUnwindSafe(
1143        || -> Result<String, Box<dyn std::error::Error>> {
1144            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1145            extract_first_pdf_page_text(&mut document)
1146        },
1147    ));
1148    if let Ok(Ok(text)) = extracted
1149        && let Some(normalized) = normalize_pdf_text(text)
1150    {
1151        return normalized;
1152    }
1153
1154    let extracted = catch_unwind(AssertUnwindSafe(
1155        || -> Result<String, Box<dyn std::error::Error>> {
1156            let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1157            extract_pdf_text_from_document(&mut document)
1158        },
1159    ));
1160    if let Ok(Ok(text)) = extracted
1161        && let Some(normalized) = normalize_pdf_text(text)
1162    {
1163        return normalized;
1164    }
1165
1166    let extracted = catch_unwind(AssertUnwindSafe(
1167        || -> Result<String, Box<dyn std::error::Error>> {
1168            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1169            extract_pdf_text_from_document(&mut document)
1170        },
1171    ));
1172    if let Ok(Ok(text)) = extracted
1173        && let Some(normalized) = normalize_pdf_text(text)
1174    {
1175        return normalized;
1176    }
1177
1178    String::new()
1179}
1180
1181fn extract_first_pdf_page_text(
1182    document: &mut pdf_oxide::document::PdfDocument,
1183) -> Result<String, Box<dyn std::error::Error>> {
1184    if document.page_count()? == 0 {
1185        return Ok(String::new());
1186    }
1187
1188    let extracted_text = document.extract_text(0)?;
1189    let markdown_text =
1190        document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1191    if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1192        return Ok(extracted_text);
1193    }
1194
1195    let pipeline_text =
1196        document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1197
1198    Ok(merge_pdf_first_page_text(
1199        &extracted_text,
1200        &markdown_text,
1201        &pipeline_text,
1202    ))
1203}
1204
1205fn extract_pdf_text_from_document(
1206    document: &mut pdf_oxide::document::PdfDocument,
1207) -> Result<String, Box<dyn std::error::Error>> {
1208    Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1209}
1210
/// Turns carriage returns and form feeds into newlines. Returns `None` when
/// the result contains nothing but whitespace.
fn normalize_pdf_text(text: String) -> Option<String> {
    let unified: String = text
        .chars()
        .map(|ch| match ch {
            '\r' | '\u{0c}' => '\n',
            other => other,
        })
        .collect();

    if unified.trim().is_empty() {
        None
    } else {
        Some(unified)
    }
}
1215
1216fn merge_pdf_first_page_text(
1217    _extracted_text: &str,
1218    markdown_text: &str,
1219    pipeline_text: &str,
1220) -> String {
1221    let pipeline = pipeline_text.trim();
1222    if pipeline.is_empty() {
1223        return String::new();
1224    }
1225
1226    let prefix = pdf_first_page_heading_prefix(markdown_text);
1227    let Some(prefix) = prefix else {
1228        return pipeline_text.to_string();
1229    };
1230
1231    if pipeline.contains(&prefix) {
1232        pipeline_text.to_string()
1233    } else {
1234        format!("{prefix}\n\n{pipeline}")
1235    }
1236}
1237
1238fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1239    let mut lines = Vec::new();
1240
1241    for line in pdf_markdown_heading_lines(markdown_text) {
1242        push_unique_line(&mut lines, line);
1243    }
1244
1245    (!lines.is_empty()).then(|| lines.join("\n"))
1246}
1247
1248fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1249    text.lines()
1250        .map(str::trim)
1251        .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1252        .map(|line| line.trim_matches('#').trim())
1253        .filter(|line| !line.is_empty())
1254        .filter(|line| !looks_like_numbered_section_heading(line))
1255        .take(4)
1256        .map(ToOwned::to_owned)
1257        .collect()
1258}
1259
/// Appends `line` to `lines` unless an identical entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
1265
/// Reports whether `line` starts like a numbered section heading: one or more
/// ASCII digits immediately followed by a `.` (for example `1. Scope`,
/// `3.2 Terms`, or `10. Disclaimer`).
///
/// Multi-digit section numbers are recognised, so headings from section 10
/// onwards are treated the same way as single-digit ones.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());
    let has_leading_digits = after_digits.len() < line.len();

    has_leading_digits && after_digits.starts_with('.')
}
1278
/// Reports whether `bytes` begin with one of the three ZIP magic numbers:
/// `PK\x03\x04`, `PK\x05\x06`, or `PK\x07\x08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
1284
/// Extracts printable strings from binary data, one string per line.
///
/// Two kinds of runs are collected, each needing at least four characters:
///
/// 1. plain runs of printable ASCII bytes (`0x20..=0x7E`);
/// 2. "wide" runs in which every printable ASCII byte is followed by a NUL
///    byte (UTF-16LE text restricted to ASCII), scanned at both even and odd
///    byte offsets so that unaligned strings are found as well.
///
/// Scanning the two alignments with the same (character, NUL) pairing keeps
/// the wide passes disjoint: a string is reported by the pass matching its
/// alignment only, instead of being reported twice with its first character
/// missing the second time.
///
/// The output is capped softly: the size is checked after every completed
/// run, so the result may exceed the cap by at most one run.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MAX_OUTPUT_BYTES: usize = 2_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    /// Appends `run` as a new line when it is long enough, then resets it.
    /// Returns `true` once the output has reached the size cap.
    fn flush_run(out: &mut String, run: &mut Vec<u8>) -> bool {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            out.push_str(&String::from_utf8_lossy(run));
        }
        run.clear();
        out.len() >= MAX_OUTPUT_BYTES
    }

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    // Pass 1: single-byte ASCII strings.
    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else if flush_run(&mut out, &mut run) {
            return out;
        }
    }
    if flush_run(&mut out, &mut run) {
        return out;
    }

    // Pass 2: wide strings, once per byte alignment.
    for start in 0..=1 {
        let aligned = bytes.get(start..).unwrap_or_default();
        for pair in aligned.chunks_exact(2) {
            if is_printable_ascii(pair[0]) && pair[1] == 0 {
                run.push(pair[0]);
            } else if flush_run(&mut out, &mut run) {
                return out;
            }
        }
        if flush_run(&mut out, &mut run) {
            return out;
        }
    }

    out
}
1346
// Unit tests for text extraction, MIME normalization, and file classification.
// Several tests read fixture files through paths relative to `testdata/`.
#[cfg(test)]
mod tests {
    use std::path::Path;

    use super::{
        ExtractedTextKind, classify_file_info, extract_text_for_detection, normalize_mime_type,
    };

    // A JAR fixture must produce no text at all.
    #[test]
    fn test_extract_text_for_detection_skips_jar_archives() {
        let path = Path::new(
            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
        );
        let bytes = std::fs::read(path).expect("failed to read jar fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }

    // A PDF fixture is reported as `Pdf` and exposes its license wording.
    #[test]
    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);
        assert!(text.contains("Redistribution and use in source and binary forms"));
    }

    // The extracted text must contain the license title and must not contain
    // the "DISCLAIMER OF WARRANTY" wording.
    #[test]
    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
        let path =
            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);
        assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
        assert!(!text.contains("DISCLAIMER OF WARRANTY"));
    }

    // PDF bytes are still handled as PDF when the path carries a `.bin` extension.
    #[test]
    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);
        assert!(text.contains("Redistribution and use in source and binary forms"));
    }

    // ZIP-magic bytes named `.whl` or `.crate` produce no text.
    #[test]
    fn test_extract_text_for_detection_skips_zip_like_archives() {
        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";

        let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
        let (crate_text, crate_kind) =
            extract_text_for_detection(Path::new("demo.crate"), zip_bytes);

        assert!(whl_text.is_empty());
        assert_eq!(whl_kind, ExtractedTextKind::None);
        assert!(crate_text.is_empty());
        assert_eq!(crate_kind, ExtractedTextKind::None);
    }

    // A `.lib` fixture must still surface the copyright statement it embeds.
    #[test]
    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
        let path =
            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
        let bytes = std::fs::read(path).expect("failed to read lib fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_ne!(kind, ExtractedTextKind::None);
        assert!(text.contains("Copyright nexB and others (c) 2012"));
    }

    // An SVG fixture is treated as decoded text.
    #[test]
    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
        let path = Path::new(
            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
        );
        let bytes = std::fs::read(path).expect("failed to read svg fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Decoded);
        assert!(text.contains("creativecommons.org/licenses/publicdomain"));
    }

    // Textual `.ts` content with a `video/mp2t` guess is normalized to `text/plain`.
    #[test]
    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
        assert_eq!(
            normalize_mime_type(
                Path::new("main.ts"),
                b"export const answer = 42;\n",
                Some("TypeScript"),
                "video/mp2t",
            ),
            "text/plain"
        );
    }

    // Textual source with an `application/octet-stream` guess is normalized to `text/plain`.
    #[test]
    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
        assert_eq!(
            normalize_mime_type(
                Path::new("main.js"),
                b"console.log('hello');\n",
                Some("JavaScript"),
                "application/octet-stream",
            ),
            "text/plain"
        );
    }

    // Binary `.ts` content keeps its `video/mp2t` guess.
    #[test]
    fn test_normalize_mime_type_preserves_binary_video_guess() {
        assert_eq!(
            normalize_mime_type(
                Path::new("main.ts"),
                &[0, 159, 146, 150, 0, 1, 2, 3],
                Some("TypeScript"),
                "video/mp2t",
            ),
            "video/mp2t"
        );
    }

    // A four-byte binary payload keeps its `application/octet-stream` guess.
    #[test]
    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
        assert_eq!(
            normalize_mime_type(
                Path::new("main.ts"),
                &[0, 159, 146, 150],
                Some("TypeScript"),
                "application/octet-stream",
            ),
            "application/octet-stream"
        );
    }

    // Empty input is classified as `inode/x-empty`: text, not binary, not source.
    #[test]
    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
        let classification = classify_file_info(Path::new("test.txt"), b"");

        assert_eq!(classification.mime_type, "inode/x-empty");
        assert_eq!(classification.file_type, "empty");
        assert!(!classification.is_binary);
        assert!(classification.is_text);
        assert!(!classification.is_source);
        assert_eq!(classification.programming_language, None);
    }

    // JSON is text data without a programming language.
    #[test]
    fn test_classify_file_info_keeps_json_out_of_programming_language() {
        let classification = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);

        assert_eq!(classification.mime_type, "application/json");
        assert_eq!(classification.file_type, "JSON text data");
        assert!(classification.is_text);
        assert!(!classification.is_source);
        assert_eq!(classification.programming_language, None);
    }

    // A Dockerfile counts as source but not as a script.
    #[test]
    fn test_classify_file_info_treats_dockerfile_as_source() {
        let classification = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");

        assert_eq!(
            classification.programming_language.as_deref(),
            Some("Dockerfile")
        );
        assert!(classification.is_source);
        assert!(!classification.is_script);
        assert_eq!(classification.file_type, "UTF-8 Unicode text");
    }

    // A Makefile is plain text: no language, not source, not a script.
    #[test]
    fn test_classify_file_info_treats_makefile_as_text_not_source() {
        let classification = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");

        assert_eq!(classification.programming_language, None);
        assert!(classification.is_text);
        assert!(!classification.is_source);
        assert!(!classification.is_script);
        assert_eq!(classification.file_type, "UTF-8 Unicode text");
    }

    // `.egg` and `.nupkg` files with ZIP magic are reported as ZIP archives.
    #[test]
    fn test_classify_file_info_marks_supported_package_archives() {
        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";

        let egg = classify_file_info(Path::new("demo.egg"), zip_bytes);
        let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_bytes);

        assert!(egg.is_archive);
        assert_eq!(egg.mime_type, "application/zip");
        assert_eq!(egg.file_type, "Zip archive data");
        assert!(nupkg.is_archive);
        assert_eq!(nupkg.mime_type, "application/zip");
        assert_eq!(nupkg.file_type, "Zip archive data");
    }

    // A PNG header is binary media, neither archive nor source.
    #[test]
    fn test_classify_file_info_marks_png_as_binary_media() {
        let png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";

        let classification = classify_file_info(Path::new("logo.png"), png_bytes);

        assert_eq!(classification.mime_type, "image/png");
        assert_eq!(classification.file_type, "PNG image data");
        assert!(classification.is_binary);
        assert!(!classification.is_text);
        assert!(classification.is_media);
        assert!(!classification.is_archive);
        assert!(!classification.is_source);
    }

    // Arbitrary non-text bytes are binary with no language.
    #[test]
    fn test_classify_file_info_marks_binary_blobs_as_binary() {
        let classification =
            classify_file_info(Path::new("blob.bin"), &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5]);

        assert!(classification.is_binary);
        assert!(!classification.is_text);
        assert!(!classification.is_source);
        assert_eq!(classification.programming_language, None);
    }

    // YAML is text data without a programming language.
    #[test]
    fn test_classify_file_info_treats_yaml_as_text_not_source() {
        let classification = classify_file_info(Path::new("config.yaml"), b"key: value\n");

        assert_eq!(classification.programming_language, None);
        assert!(classification.is_text);
        assert!(!classification.is_source);
        assert_eq!(classification.file_type, "YAML text data");
    }

    // Gradle and Nix manifests are source; `.gitmodules` is configuration text.
    #[test]
    fn test_classify_file_info_classifies_common_build_manifests() {
        let gradle = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
        let flake = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
        let gitmodules = classify_file_info(
            Path::new(".gitmodules"),
            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
        );

        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
        assert!(gradle.is_source);
        assert_eq!(gradle.mime_type, "text/plain");

        assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
        assert!(flake.is_source);
        assert_eq!(flake.mime_type, "text/plain");

        assert_eq!(gitmodules.programming_language, None);
        assert!(gitmodules.is_text);
        assert!(!gitmodules.is_source);
        assert_eq!(gitmodules.file_type, "Git configuration text");
    }

    // An extension-less file with a `node` shebang is labelled as a JavaScript script.
    #[test]
    fn test_classify_file_info_labels_javascript_shebang_scripts() {
        let classification = classify_file_info(
            Path::new("bin/run"),
            b"#!/usr/bin/env node\nconsole.log('hello');\n",
        );

        assert_eq!(
            classification.programming_language.as_deref(),
            Some("JavaScript")
        );
        assert!(classification.is_script);
        assert_eq!(
            classification.file_type,
            "javascript script, UTF-8 Unicode text executable"
        );
    }

    // A script containing a non-UTF-8 byte (`\xe9`) gets a label without "UTF-8 Unicode".
    #[test]
    fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
        let classification = classify_file_info(
            Path::new("script.py"),
            b"# coding: latin-1\nprint(\"caf\xe9\")\n",
        );

        assert_eq!(
            classification.programming_language.as_deref(),
            Some("Python")
        );
        assert!(classification.is_script);
        assert_eq!(classification.file_type, "python script, text executable");
    }

    // Table-driven check of language, `is_source`, and `is_script` per input.
    #[test]
    fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
        // Each case: (path, bytes, expected language, expected is_source, expected is_script).
        let cases = [
            (
                Path::new("bin/run"),
                b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
                Some("JavaScript"),
                true,
                true,
            ),
            (
                Path::new("Dockerfile"),
                b"FROM scratch\n".as_slice(),
                Some("Dockerfile"),
                true,
                false,
            ),
            (
                Path::new("package.json"),
                br#"{"name":"demo"}"#.as_slice(),
                None,
                false,
                false,
            ),
            (
                Path::new("config.yaml"),
                b"key: value\n".as_slice(),
                None,
                false,
                false,
            ),
            (
                Path::new("Makefile"),
                b"all:\n\techo hi\n".as_slice(),
                None,
                false,
                false,
            ),
        ];

        for (path, bytes, expected_language, expected_is_source, expected_is_script) in cases {
            let classification = classify_file_info(path, bytes);

            assert_eq!(
                classification.programming_language.as_deref(),
                expected_language,
                "unexpected language for {}",
                path.display()
            );
            assert_eq!(
                classification.is_source,
                expected_is_source,
                "unexpected is_source for {}",
                path.display()
            );
            assert_eq!(
                classification.is_script,
                expected_is_script,
                "unexpected is_script for {}",
                path.display()
            );
        }
    }
}