provenant/utils/
file.rs

1use std::collections::BTreeSet;
2use std::fs;
3use std::io::{BufReader, Cursor, Read};
4use std::panic::{AssertUnwindSafe, catch_unwind};
5use std::path::Path;
6
7use chrono::{TimeZone, Utc};
8use file_format::{FileFormat, Kind as FileFormatKind};
9use flate2::read::ZlibDecoder;
10use glob::Pattern;
11use image::{ImageDecoder, ImageFormat, ImageReader};
12use mime_guess::from_path;
13use quick_xml::events::Event;
14use quick_xml::reader::Reader as XmlReader;
15
16use crate::utils::language::detect_language;
17
/// Describes how the text returned by `extract_text_for_detection` was obtained.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No usable text was produced; the accompanying string is empty.
    None,
    /// Text came from decoding the bytes directly (also used for RTF extraction).
    Decoded,
    /// Text came from the PDF extraction path.
    Pdf,
    /// Text came from scanning binary content for printable strings.
    BinaryStrings,
    /// Text came from image metadata.
    ImageMetadata,
}
26
/// Result of `classify_file_info`: the MIME type, a human-readable file type
/// label, and a set of boolean classification flags for one file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type as computed by `detect_mime_type`.
    pub mime_type: String,
    /// Human-readable type label as computed by `detect_file_type`.
    pub file_type: String,
    /// Detected language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// True when the content was classified as binary.
    pub is_binary: bool,
    /// Always the negation of `is_binary`.
    pub is_text: bool,
    /// True when the file was classified as an archive/compressed/package file.
    pub is_archive: bool,
    /// True when the file was classified as image, audio or video content.
    pub is_media: bool,
    /// True when the file was classified as source code.
    pub is_source: bool,
    /// True when the file was classified as a script.
    pub is_script: bool,
}
39
// NOTE(review): the two image-metadata limits below are not referenced in the
// visible part of this file; presumably they cap the number of metadata values
// and the total extracted text size — confirm against the extraction code.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
/// A buffer counts as binary when its control-byte count exceeds
/// `len / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR` (see `has_binary_control_chars`).
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
/// Lowercase extensions that `is_plain_text` accepts; such files are never
/// reported as source by `detect_is_source`.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
/// Lowercase extensions that `detect_is_binary` treats as binary unless the
/// detected format is textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
/// Lowercase extensions that `detect_is_archive` accepts for non-text files.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
53
54/// Get the last modified date of a file as a `YYYY-MM-DD` string.
55pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
56    metadata.modified().ok().map(|time: std::time::SystemTime| {
57        let seconds_since_epoch = time
58            .duration_since(std::time::UNIX_EPOCH)
59            .unwrap()
60            .as_secs() as i64;
61
62        Utc.timestamp_opt(seconds_since_epoch, 0)
63            .single()
64            .unwrap_or_else(Utc::now)
65            .format("%Y-%m-%d")
66            .to_string()
67    })
68}
69
70/// Check if a path should be excluded based on a list of glob patterns.
71pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
72    let path_str = path.to_string_lossy();
73    let file_name = path
74        .file_name()
75        .map(|name| name.to_string_lossy())
76        .unwrap_or_default();
77
78    for pattern in exclude_patterns {
79        // Match against full path
80        if pattern.matches(&path_str) {
81            return true;
82        }
83
84        // Match against just the file/directory name
85        if pattern.matches(&file_name) {
86            return true;
87        }
88    }
89
90    false
91}
92
93/// Decode a byte buffer to a String, trying UTF-8 first, then Latin-1.
94///
95/// Latin-1 (ISO-8859-1) maps bytes 0x00-0xFF directly to Unicode U+0000-U+00FF,
96/// so it can decode any byte sequence. This matches Python ScanCode's use of
97/// `UnicodeDammit` which auto-detects encoding with Latin-1 as fallback.
98pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
99    match String::from_utf8(bytes.to_vec()) {
100        Ok(s) => s,
101        Err(e) => {
102            let bytes = e.into_bytes();
103            if has_binary_control_chars(&bytes) {
104                return String::new();
105            }
106            bytes.iter().map(|&b| b as char).collect()
107        }
108    }
109}
110
111pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
112    let ext = path
113        .extension()
114        .and_then(|e| e.to_str())
115        .map(|s| s.to_ascii_lowercase());
116    let detected_format = detect_file_format(bytes);
117
118    if looks_like_rtf(bytes, ext.as_deref()) {
119        let text = extract_rtf_text(bytes);
120        return if text.trim().is_empty() {
121            (String::new(), ExtractedTextKind::None)
122        } else {
123            (text, ExtractedTextKind::Decoded)
124        };
125    }
126
127    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
128        let text = extract_pdf_text(path, bytes);
129        return if text.is_empty() {
130            (String::new(), ExtractedTextKind::None)
131        } else {
132            (text, ExtractedTextKind::Pdf)
133        };
134    }
135
136    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
137        let text = extract_image_metadata_text(bytes, format);
138        return if text.is_empty() {
139            if is_supported_image_container(bytes, format) {
140                (String::new(), ExtractedTextKind::None)
141            } else {
142                let decoded = decode_bytes_to_string(bytes);
143                if decoded.is_empty() {
144                    (String::new(), ExtractedTextKind::None)
145                } else {
146                    (decoded, ExtractedTextKind::Decoded)
147                }
148            }
149        } else {
150            (text, ExtractedTextKind::ImageMetadata)
151        };
152    }
153
154    if should_skip_binary_string_extraction(path, bytes, detected_format) {
155        return (String::new(), ExtractedTextKind::None);
156    }
157
158    let decoded = decode_bytes_to_string(bytes);
159    if !decoded.is_empty() {
160        return (decoded, ExtractedTextKind::Decoded);
161    }
162
163    let text = extract_printable_strings(bytes);
164    if text.is_empty() {
165        (String::new(), ExtractedTextKind::None)
166    } else {
167        (text, ExtractedTextKind::BinaryStrings)
168    }
169}
170
171pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
172    let detected_format = detect_file_format(bytes);
173    let detected_language = detect_language(path, bytes);
174    let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
175    let is_text = !is_binary;
176    let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
177    let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
178    let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
179    let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
180    let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
181    let programming_language = is_source.then(|| detected_language.clone()).flatten();
182    let file_type = detect_file_type(
183        path,
184        bytes,
185        detected_format,
186        &mime_type,
187        programming_language.as_deref(),
188        is_binary,
189        is_text,
190        is_archive,
191        is_media,
192        is_script,
193    );
194
195    FileInfoClassification {
196        mime_type,
197        file_type,
198        programming_language,
199        is_binary,
200        is_text,
201        is_archive,
202        is_media,
203        is_source,
204        is_script,
205    }
206}
207
208fn detect_file_format(bytes: &[u8]) -> FileFormat {
209    FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
210}
211
/// Return true when `bytes` is a well-formed UTF-8 sequence (empty input included).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
215
216fn has_binary_control_chars(bytes: &[u8]) -> bool {
217    let control_count = bytes
218        .iter()
219        .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
220        .count();
221    control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
222}
223
224fn has_decodable_text(bytes: &[u8]) -> bool {
225    bytes.is_empty() || is_utf8_text(bytes) || !has_binary_control_chars(bytes)
226}
227
228fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
229    if bytes.is_empty() || is_utf8_text(bytes) {
230        return true;
231    }
232
233    let printable_count = bytes
234        .iter()
235        .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
236        .count();
237    printable_count * 2 >= bytes.len()
238}
239
/// Return true for media types that denote textual content: anything under
/// `text/`, JSON and XML, and types with a `+json` or `+xml` suffix.
fn is_textual_media_type(media_type: &str) -> bool {
    const EXACT_MATCHES: [&str; 3] = ["application/json", "application/xml", "text/xml"];
    const STRUCTURED_SUFFIXES: [&str; 2] = ["+json", "+xml"];

    if media_type.starts_with("text/") || EXACT_MATCHES.contains(&media_type) {
        return true;
    }

    STRUCTURED_SUFFIXES
        .iter()
        .any(|&suffix| media_type.ends_with(suffix))
}
249
250fn is_textual_format(detected_format: FileFormat) -> bool {
251    matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
252        || is_textual_media_type(detected_format.media_type())
253}
254
255fn is_known_binary_format(detected_format: FileFormat) -> bool {
256    !matches!(detected_format, FileFormat::ArbitraryBinaryData)
257        && !is_textual_format(detected_format)
258}
259
260pub fn detect_mime_type(
261    path: &Path,
262    bytes: &[u8],
263    detected_format: FileFormat,
264    programming_language: Option<&str>,
265) -> String {
266    if bytes.is_empty() {
267        return "inode/x-empty".to_string();
268    }
269
270    if is_zip_archive(bytes) {
271        return detect_zip_like_mime(path);
272    }
273
274    if looks_like_deb(bytes, path) {
275        return "application/vnd.debian.binary-package".to_string();
276    }
277
278    if looks_like_rpm(bytes, path) {
279        return "application/x-rpm".to_string();
280    }
281
282    let guessed_mime = from_path(path)
283        .first_or_octet_stream()
284        .essence_str()
285        .to_string();
286
287    let mime_type = match detected_format {
288        FileFormat::Empty => "inode/x-empty".to_string(),
289        FileFormat::PlainText => {
290            if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
291                "text/plain".to_string()
292            } else {
293                guessed_mime.clone()
294            }
295        }
296        _ => {
297            let detected_mime = detected_format.media_type();
298            if detected_mime == "application/octet-stream"
299                && guessed_mime != "application/octet-stream"
300            {
301                guessed_mime.clone()
302            } else {
303                detected_mime.to_string()
304            }
305        }
306    };
307
308    normalize_mime_type(path, bytes, programming_language, &mime_type)
309}
310
311fn normalize_mime_type(
312    path: &Path,
313    bytes: &[u8],
314    programming_language: Option<&str>,
315    mime_type: &str,
316) -> String {
317    if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
318        return "text/plain".to_string();
319    }
320
321    mime_type.to_string()
322}
323
324fn should_prefer_text_mime(
325    path: &Path,
326    bytes: &[u8],
327    programming_language: Option<&str>,
328    mime_type: &str,
329) -> bool {
330    has_decodable_text(bytes)
331        && looks_like_textual_bytes(bytes)
332        && is_textual_source_candidate(path, programming_language)
333        && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
334}
335
336fn detect_is_binary(
337    path: &Path,
338    bytes: &[u8],
339    detected_format: FileFormat,
340    programming_language: Option<&str>,
341) -> bool {
342    if is_textual_format(detected_format) {
343        return false;
344    }
345
346    if lower_extension(path)
347        .as_deref()
348        .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
349    {
350        return true;
351    }
352
353    if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
354        return false;
355    }
356
357    has_binary_control_chars(bytes)
358        || is_known_binary_format(detected_format)
359        || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
360            && !looks_like_textual_bytes(bytes))
361}
362
363fn should_treat_binary_bytes_as_text(
364    path: &Path,
365    bytes: &[u8],
366    programming_language: Option<&str>,
367) -> bool {
368    has_decodable_text(bytes)
369        && looks_like_textual_bytes(bytes)
370        && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
371}
372
373fn detect_is_archive(
374    path: &Path,
375    bytes: &[u8],
376    mime_type: &str,
377    is_text: bool,
378    detected_format: FileFormat,
379) -> bool {
380    if is_text {
381        return false;
382    }
383
384    lower_extension(path)
385        .as_deref()
386        .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
387        || matches!(
388            detected_format.kind(),
389            FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
390        )
391        || is_zip_archive(bytes)
392        || looks_like_gzip(bytes)
393        || looks_like_bzip2(bytes)
394        || looks_like_xz(bytes)
395        || looks_like_deb(bytes, path)
396        || looks_like_rpm(bytes, path)
397        || looks_like_squashfs(bytes, path)
398        || mime_type.contains("zip")
399        || mime_type.contains("compressed")
400        || mime_type.contains("tar")
401        || mime_type.contains("x-rpm")
402        || mime_type.contains("debian")
403}
404
405fn detect_is_media(
406    path: &Path,
407    bytes: &[u8],
408    mime_type: &str,
409    detected_format: FileFormat,
410) -> bool {
411    media_mime_from_content(bytes).is_some()
412        || matches!(
413            detected_format.kind(),
414            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
415        )
416        || mime_type.starts_with("image/")
417        || mime_type.starts_with("audio/")
418        || mime_type.starts_with("video/")
419        || (mime_type == "application/octet-stream"
420            && lower_extension(path).as_deref() == Some("tga")
421            && !has_binary_control_chars(bytes))
422}
423
424fn detect_is_script(
425    path: &Path,
426    bytes: &[u8],
427    programming_language: Option<&str>,
428    is_text: bool,
429) -> bool {
430    if !is_text || is_makefile(path) {
431        return false;
432    }
433
434    bytes.starts_with(b"#!")
435        || lower_extension(path).as_deref().is_some_and(|ext| {
436            matches!(
437                ext,
438                "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
439            )
440        })
441        || matches!(
442            programming_language,
443            Some("Shell" | "Python" | "Ruby" | "Perl" | "PHP" | "PowerShell" | "Awk")
444        )
445}
446
447fn detect_is_source(
448    path: &Path,
449    programming_language: Option<&str>,
450    is_text: bool,
451    is_script: bool,
452) -> bool {
453    if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
454        return false;
455    }
456
457    if is_c_like_source(path) || is_java_like_source(path) {
458        return true;
459    }
460
461    programming_language.is_some() || is_script
462}
463
464#[allow(clippy::too_many_arguments)]
465fn detect_file_type(
466    path: &Path,
467    bytes: &[u8],
468    detected_format: FileFormat,
469    mime_type: &str,
470    programming_language: Option<&str>,
471    is_binary: bool,
472    is_text: bool,
473    is_archive: bool,
474    is_media: bool,
475    is_script: bool,
476) -> String {
477    if bytes.is_empty() {
478        return "empty".to_string();
479    }
480
481    if looks_like_pdf(bytes) {
482        return "PDF document".to_string();
483    }
484
485    if let Some(file_type) = media_file_type_from_content(bytes) {
486        return file_type.to_string();
487    }
488
489    if is_archive {
490        return archive_file_type(path, bytes, detected_format);
491    }
492
493    if is_script {
494        return script_file_type(programming_language, bytes);
495    }
496
497    if is_text {
498        if lower_extension(path).as_deref() == Some("json") {
499            return "JSON text data".to_string();
500        }
501        if lower_extension(path).as_deref() == Some("xml") {
502            return "XML text data".to_string();
503        }
504        if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
505            return "YAML text data".to_string();
506        }
507        if lower_extension(path).as_deref() == Some("toml") {
508            return "TOML text data".to_string();
509        }
510        if matches!(
511            lower_extension(path).as_deref(),
512            Some("ini" | "cfg" | "conf")
513        ) {
514            return "INI text data".to_string();
515        }
516        if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
517            return "Git configuration text".to_string();
518        }
519        if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
520            return text_file_type(bytes);
521        }
522        if programming_language.is_some() && !is_media {
523            return text_file_type(bytes);
524        }
525        return text_file_type(bytes);
526    }
527
528    if let Some(file_type) = format_based_file_type(detected_format) {
529        return file_type;
530    }
531
532    if is_binary && mime_type == "application/octet-stream" {
533        return "data".to_string();
534    }
535
536    mime_type.to_string()
537}
538
539fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
540    if matches!(programming_language, Some(language) if is_source_like_language(language)) {
541        return true;
542    }
543
544    if matches!(
545        lower_file_name(path).as_str(),
546        "dockerfile"
547            | "containerfile"
548            | "containerfile.core"
549            | "apkbuild"
550            | "podfile"
551            | "meson.build"
552            | "build"
553            | "workspace"
554            | "buck"
555            | "default.nix"
556            | "flake.nix"
557            | "shell.nix"
558    ) {
559        return true;
560    }
561
562    path.extension()
563        .and_then(|ext| ext.to_str())
564        .is_some_and(|ext| {
565            matches!(
566                ext.to_ascii_lowercase().as_str(),
567                "rs" | "py"
568                    | "js"
569                    | "mjs"
570                    | "cjs"
571                    | "jsx"
572                    | "ts"
573                    | "mts"
574                    | "cts"
575                    | "tsx"
576                    | "c"
577                    | "cpp"
578                    | "cc"
579                    | "cxx"
580                    | "h"
581                    | "hpp"
582                    | "m"
583                    | "mm"
584                    | "s"
585                    | "asm"
586                    | "java"
587                    | "go"
588                    | "rb"
589                    | "php"
590                    | "pl"
591                    | "swift"
592                    | "sh"
593                    | "bash"
594                    | "zsh"
595                    | "fish"
596                    | "ksh"
597                    | "ps1"
598                    | "psm1"
599                    | "psd1"
600                    | "awk"
601                    | "kt"
602                    | "kts"
603                    | "dart"
604                    | "scala"
605                    | "groovy"
606                    | "gradle"
607                    | "gvy"
608                    | "gy"
609                    | "gsh"
610                    | "cs"
611                    | "fs"
612                    | "fsx"
613                    | "r"
614                    | "lua"
615                    | "jl"
616                    | "ex"
617                    | "exs"
618                    | "clj"
619                    | "cljs"
620                    | "cljc"
621                    | "hs"
622                    | "erl"
623                    | "nix"
624                    | "zig"
625                    | "bzl"
626                    | "bazel"
627                    | "star"
628                    | "sky"
629                    | "ml"
630                    | "mli"
631                    | "tex"
632            )
633        })
634}
635
/// Return true when `language` is one of the language names this module
/// treats as source code. The comparison is exact and case-sensitive.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
    ];

    SOURCE_LANGUAGES.contains(&language)
}
681
/// Return the path's extension as `&str`, or `None` when there is no
/// extension or it is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    path.extension()?.to_str()
}
685
686fn lower_extension(path: &Path) -> Option<String> {
687    extension(path).map(|ext| ext.to_ascii_lowercase())
688}
689
/// Return the final path component lowercased (ASCII only), or an empty
/// string when the path has no file name or it is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
696
697fn is_plain_text(path: &Path) -> bool {
698    lower_extension(path)
699        .as_deref()
700        .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
701}
702
703fn is_makefile(path: &Path) -> bool {
704    matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
705}
706
/// Return true when the path ends in `.js.map` or `.css.map`, ignoring
/// ASCII case.
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|&suffix| lowered.ends_with(suffix))
}
711
712fn is_c_like_source(path: &Path) -> bool {
713    lower_extension(path).as_deref().is_some_and(|ext| {
714        matches!(
715            ext,
716            "c" | "cc"
717                | "cp"
718                | "cpp"
719                | "cxx"
720                | "c++"
721                | "h"
722                | "hh"
723                | "hpp"
724                | "hxx"
725                | "h++"
726                | "i"
727                | "ii"
728                | "m"
729                | "s"
730                | "asm"
731        )
732    })
733}
734
735fn is_java_like_source(path: &Path) -> bool {
736    lower_extension(path)
737        .as_deref()
738        .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
739}
740
741fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
742    match detected_format {
743        FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
744        format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
745        format => Some(match format.kind() {
746            FileFormatKind::Image => short_name_or_name(&format, "image data"),
747            FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
748            FileFormatKind::Video => short_name_or_name(&format, "video data"),
749            _ => format.name().to_string(),
750        }),
751    }
752}
753
754fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
755    format
756        .short_name()
757        .map(|short_name| format!("{short_name} {suffix}"))
758        .unwrap_or_else(|| format!("{} {suffix}", format.name()))
759}
760
761fn detect_zip_like_mime(path: &Path) -> String {
762    match extension(path).map(|ext| ext.to_ascii_lowercase()) {
763        Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
764        Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
765            "application/java-archive".to_string()
766        }
767        _ => "application/zip".to_string(),
768    }
769}
770
/// Identify PNG, JPEG, TIFF and WebP content by magic bytes and return the
/// matching MIME type, or `None` when no signature matches.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PNG_MAGIC: &[u8] = b"\x89PNG\r\n\x1a\n";
    const JPEG_MAGIC: &[u8] = &[0xff, 0xd8, 0xff];
    const TIFF_LITTLE_ENDIAN: &[u8] = b"II\x2a\x00";
    const TIFF_BIG_ENDIAN: &[u8] = b"MM\x00\x2a";

    if bytes.starts_with(PNG_MAGIC) {
        return Some("image/png");
    }
    if bytes.starts_with(JPEG_MAGIC) {
        return Some("image/jpeg");
    }
    if bytes.starts_with(TIFF_LITTLE_ENDIAN) || bytes.starts_with(TIFF_BIG_ENDIAN) {
        return Some("image/tiff");
    }

    // WebP is a RIFF container whose form type at offset 8 is "WEBP".
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    if is_webp {
        Some("image/webp")
    } else {
        None
    }
}
784
/// Identify PNG, JPEG, TIFF and WebP content by magic bytes and return a
/// human-readable label, or `None` when no signature matches.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("PNG image data"),
        [0xff, 0xd8, 0xff, ..] => Some("JPEG image data"),
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("TIFF image data"),
        // RIFF container whose form type at offset 8 is "WEBP".
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => {
            Some("WebP image data")
        }
        _ => None,
    }
}
798
/// Return true when the content begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    const PDF_HEADER: &[u8] = b"%PDF-";
    bytes.get(..PDF_HEADER.len()) == Some(PDF_HEADER)
}
802
/// Return true when the (already lowercased) extension is `rtf` or the
/// content begins with the `{\rtf` header.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if matches!(ext, Some("rtf")) {
        return true;
    }
    bytes.starts_with(br"{\rtf")
}
806
807fn extract_rtf_text(bytes: &[u8]) -> String {
808    let text = String::from_utf8_lossy(bytes);
809    let chars: Vec<char> = text.chars().collect();
810    let mut output = String::new();
811    let mut index = 0usize;
812
813    while index < chars.len() {
814        match chars[index] {
815            '{' | '}' => {
816                index += 1;
817            }
818            '\\' => {
819                index += 1;
820                if index >= chars.len() {
821                    break;
822                }
823
824                match chars[index] {
825                    '\\' | '{' | '}' => {
826                        output.push(chars[index]);
827                        index += 1;
828                    }
829                    '\'' => {
830                        if index + 2 < chars.len() {
831                            let hex = [chars[index + 1], chars[index + 2]];
832                            let hex: String = hex.iter().collect();
833                            if let Ok(value) = u8::from_str_radix(&hex, 16) {
834                                output.push(value as char);
835                                index += 3;
836                                continue;
837                            }
838                        }
839                        index += 1;
840                    }
841                    control if control.is_ascii_alphabetic() => {
842                        let start = index;
843                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
844                            index += 1;
845                        }
846                        let control_word: String = chars[start..index].iter().collect();
847
848                        let number_start = index;
849                        if index < chars.len()
850                            && (chars[index] == '-' || chars[index].is_ascii_digit())
851                        {
852                            index += 1;
853                            while index < chars.len() && chars[index].is_ascii_digit() {
854                                index += 1;
855                            }
856                        }
857                        let parameter: String = chars[number_start..index].iter().collect();
858
859                        if index < chars.len() && chars[index] == ' ' {
860                            index += 1;
861                        }
862
863                        match control_word.as_str() {
864                            "par" | "line" => output.push('\n'),
865                            "tab" => output.push('\t'),
866                            "emdash" => output.push('—'),
867                            "endash" => output.push('–'),
868                            "bullet" => output.push('•'),
869                            "lquote" | "rquote" => output.push('\''),
870                            "ldblquote" | "rdblquote" => output.push('"'),
871                            "u" => {
872                                if let Ok(codepoint) = parameter.parse::<i32>() {
873                                    let normalized = if codepoint < 0 {
874                                        codepoint + 65_536
875                                    } else {
876                                        codepoint
877                                    };
878                                    if let Ok(normalized) = u32::try_from(normalized)
879                                        && let Some(ch) = char::from_u32(normalized)
880                                    {
881                                        output.push(ch);
882                                    }
883                                }
884
885                                if index < chars.len()
886                                    && !matches!(chars[index], '\\' | '{' | '}' | '\n' | '\r')
887                                {
888                                    index += 1;
889                                }
890                            }
891                            _ => {}
892                        }
893                    }
894                    _ => {
895                        index += 1;
896                    }
897                }
898            }
899            ch => {
900                output.push(ch);
901                index += 1;
902            }
903        }
904    }
905
906    output
907        .replace(['\r', '\u{0c}'], "\n")
908        .lines()
909        .map(str::trim_end)
910        .collect::<Vec<_>>()
911        .join("\n")
912}
913
/// Return true when the content begins with the two-byte gzip magic `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
917
/// Return true when the content begins with the bzip2 magic `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
921
/// Return true when the content begins with the six-byte XZ magic
/// `fd 37 7a 58 5a 00`.
fn looks_like_xz(bytes: &[u8]) -> bool {
    const XZ_MAGIC: &[u8] = b"\xfd7zXZ\x00";
    bytes.get(..XZ_MAGIC.len()) == Some(XZ_MAGIC)
}
925
926fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
927    lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
928}
929
930fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
931    lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
932}
933
934fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
935    lower_extension(path)
936        .as_deref()
937        .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
938        && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
939            || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
940}
941
942fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
943    if looks_like_deb(bytes, path) {
944        "debian binary package (format 2.0)".to_string()
945    } else if looks_like_rpm(bytes, path) {
946        "RPM package".to_string()
947    } else if looks_like_squashfs(bytes, path) {
948        "Squashfs filesystem".to_string()
949    } else if looks_like_gzip(bytes) {
950        "gzip compressed data".to_string()
951    } else if looks_like_bzip2(bytes) {
952        "bzip2 compressed data".to_string()
953    } else if looks_like_xz(bytes) {
954        "XZ compressed data".to_string()
955    } else if is_zip_archive(bytes) {
956        "Zip archive data".to_string()
957    } else if lower_extension(path).as_deref() == Some("gem") {
958        "POSIX tar archive".to_string()
959    } else if let Some(file_type) = format_based_file_type(detected_format) {
960        file_type
961    } else {
962        "archive data".to_string()
963    }
964}
965
966fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
967    let suffix = text_executable_label(bytes);
968
969    match programming_language {
970        Some("Python") => format!("python script, {suffix}"),
971        Some("Ruby") => format!("ruby script, {suffix}"),
972        Some("Perl") => format!("perl script, {suffix}"),
973        Some("PHP") => format!("php script, {suffix}"),
974        Some("Shell") => format!("shell script, {suffix}"),
975        Some("JavaScript") => format!("javascript script, {suffix}"),
976        Some("TypeScript") => format!("typescript script, {suffix}"),
977        Some("PowerShell") => format!("powershell script, {suffix}"),
978        Some("Awk") => format!("awk script, {suffix}"),
979        _ => format!("script, {suffix}"),
980    }
981}
982
983fn text_file_type(bytes: &[u8]) -> String {
984    text_label(bytes).to_string()
985}
986
/// Describes plain text content by two properties: whether `bytes` is valid UTF-8 and
/// whether it contains at least one `\n`.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1000
/// Describes script content by two properties: whether `bytes` is valid UTF-8 and
/// whether it contains at least one `\n`.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1014
1015fn supported_image_metadata_format(
1016    ext: Option<&str>,
1017    detected_format: FileFormat,
1018) -> Option<ImageFormat> {
1019    match ext {
1020        Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1021        Some("png") => Some(ImageFormat::Png),
1022        Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1023        Some("webp") => Some(ImageFormat::WebP),
1024        _ => match detected_format.media_type() {
1025            "image/jpeg" => Some(ImageFormat::Jpeg),
1026            "image/png" => Some(ImageFormat::Png),
1027            "image/tiff" => Some(ImageFormat::Tiff),
1028            "image/webp" => Some(ImageFormat::WebP),
1029            _ => None,
1030        },
1031    }
1032}
1033
1034fn should_skip_binary_string_extraction(
1035    path: &Path,
1036    bytes: &[u8],
1037    detected_format: FileFormat,
1038) -> bool {
1039    matches!(lower_extension(path).as_deref(), Some("pdf"))
1040        || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1041            .is_some()
1042        || (matches!(
1043            detected_format.kind(),
1044            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1045        ) && !is_textual_format(detected_format))
1046        || media_mime_from_content(bytes).is_some()
1047        || is_zip_archive(bytes)
1048        || looks_like_gzip(bytes)
1049        || looks_like_bzip2(bytes)
1050        || looks_like_xz(bytes)
1051        || looks_like_deb(bytes, path)
1052        || looks_like_rpm(bytes, path)
1053        || looks_like_squashfs(bytes, path)
1054}
1055
1056fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1057    match format {
1058        ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1059        ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1060        ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1061        ImageFormat::WebP => {
1062            bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1063        }
1064        _ => false,
1065    }
1066}
1067
1068fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1069    let mut values = Vec::new();
1070    values.extend(extract_exif_metadata_values(bytes));
1071    values.extend(extract_xmp_metadata_values(bytes, format));
1072    values_to_text(values)
1073}
1074
1075fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1076    let mut cursor = BufReader::new(Cursor::new(bytes));
1077    let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1078        Ok(exif) => exif,
1079        Err(_) => return Vec::new(),
1080    };
1081
1082    let mut values = Vec::new();
1083    for field in exif.fields() {
1084        let rendered = match field.tag {
1085            exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
1086                Some(field.display_value().with_unit(&exif).to_string())
1087            }
1088            exif::Tag::Artist => Some(format!(
1089                "Author: {}",
1090                field.display_value().with_unit(&exif)
1091            )),
1092            _ => None,
1093        };
1094
1095        if let Some(rendered) = rendered {
1096            values.push(rendered);
1097        }
1098    }
1099
1100    values
1101}
1102
1103fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1104    let xmp = match extract_raw_xmp_packet(bytes, format) {
1105        Some(xmp) => xmp,
1106        None => return Vec::new(),
1107    };
1108
1109    parse_xmp_values(&xmp)
1110}
1111
1112fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1113    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1114    if let Ok(mut decoder) = reader.into_decoder()
1115        && let Ok(Some(xmp)) = decoder.xmp_metadata()
1116    {
1117        return Some(xmp);
1118    }
1119
1120    match format {
1121        ImageFormat::Png => extract_png_xmp_packet(bytes),
1122        _ => None,
1123    }
1124}
1125
1126fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1127    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1128
1129    if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1130        return None;
1131    }
1132
1133    let mut offset = PNG_SIGNATURE.len();
1134    while offset + 12 <= bytes.len() {
1135        let length = u32::from_be_bytes([
1136            bytes[offset],
1137            bytes[offset + 1],
1138            bytes[offset + 2],
1139            bytes[offset + 3],
1140        ]) as usize;
1141        let chunk_start = offset + 8;
1142        let chunk_end = chunk_start + length;
1143        if chunk_end + 4 > bytes.len() {
1144            return None;
1145        }
1146
1147        let chunk_type = &bytes[offset + 4..offset + 8];
1148        if chunk_type == b"iTXt" {
1149            let data = &bytes[chunk_start..chunk_end];
1150            if let Some(xmp) = parse_png_itxt_xmp(data) {
1151                return Some(xmp);
1152            }
1153        }
1154
1155        offset = chunk_end + 4;
1156    }
1157
1158    None
1159}
1160
1161fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1162    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1163
1164    let keyword_end = data.iter().position(|&b| b == 0)?;
1165    if &data[..keyword_end] != XMP_KEYWORD {
1166        return None;
1167    }
1168
1169    let mut cursor = keyword_end + 1;
1170    let compression_flag = *data.get(cursor)?;
1171    cursor += 1;
1172    let compression_method = *data.get(cursor)?;
1173    cursor += 1;
1174    if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1175        return None;
1176    }
1177
1178    let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1179    cursor = language_end + 1;
1180
1181    let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1182    cursor = translated_end + 1;
1183
1184    let text_bytes = &data[cursor..];
1185    if compression_flag == 1 {
1186        let mut decoder = ZlibDecoder::new(text_bytes);
1187        let mut decoded = Vec::new();
1188        decoder.read_to_end(&mut decoded).ok()?;
1189        Some(decoded)
1190    } else {
1191        Some(text_bytes.to_vec())
1192    }
1193}
1194
1195fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1196    let mut reader = XmlReader::from_reader(xmp);
1197    reader.config_mut().trim_text(true);
1198
1199    let mut buf = Vec::new();
1200    let mut stack: Vec<String> = Vec::new();
1201    let mut values = Vec::new();
1202
1203    loop {
1204        match reader.read_event_into(&mut buf) {
1205            Ok(Event::Start(e)) => {
1206                stack.push(local_xml_name(e.name().as_ref()));
1207            }
1208            Ok(Event::End(_)) => {
1209                stack.pop();
1210            }
1211            Ok(Event::Empty(_)) => {}
1212            Ok(Event::Text(text)) => {
1213                if let Some(field) = stack
1214                    .iter()
1215                    .rev()
1216                    .find_map(|name| allowed_xmp_field(name.as_str()))
1217                    && let Ok(decoded) = text.decode()
1218                {
1219                    let decoded = decoded.into_owned();
1220                    if !decoded.trim().is_empty() {
1221                        values.push(format_xmp_value(field, &decoded));
1222                    }
1223                }
1224            }
1225            Ok(Event::CData(text)) => {
1226                if let Some(field) = stack
1227                    .iter()
1228                    .rev()
1229                    .find_map(|name| allowed_xmp_field(name.as_str()))
1230                    && let Ok(decoded) = text.decode()
1231                {
1232                    let decoded = decoded.into_owned();
1233                    if !decoded.trim().is_empty() {
1234                        values.push(format_xmp_value(field, &decoded));
1235                    }
1236                }
1237            }
1238            Ok(Event::Eof) | Err(_) => break,
1239            _ => {}
1240        }
1241        buf.clear();
1242    }
1243
1244    values
1245}
1246
/// Returns the part of a qualified XML name after its last `:` (the whole name when
/// there is no `:`). Names that are not valid UTF-8 become the empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();

    match qualified.rsplit_once(':') {
        Some((_prefix, local)) => local.to_owned(),
        None => qualified.to_owned(),
    }
}
1251
/// Maps an XMP element local name to the internal field key it is collected under, or
/// `None` when the element is not on the allow-list. Matching is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    // (element local name, internal field key)
    const ALLOWED_FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    ALLOWED_FIELDS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, field)| field)
}
1264
/// Renders an XMP value for output: `creator` values get an `Author: ` prefix, every
/// other field is passed through unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
1271
1272fn values_to_text(values: Vec<String>) -> String {
1273    let mut seen = BTreeSet::new();
1274    let mut lines = Vec::new();
1275    let mut total_bytes = 0usize;
1276
1277    for value in values {
1278        if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1279            break;
1280        }
1281
1282        let normalized = normalize_metadata_value(&value);
1283        if normalized.is_empty() || !seen.insert(normalized.clone()) {
1284            continue;
1285        }
1286
1287        let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1288        if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1289            break;
1290        }
1291
1292        total_bytes += added_bytes;
1293        lines.push(normalized);
1294    }
1295
1296    lines.join("\n")
1297}
1298
/// Removes NUL characters, then collapses every whitespace run to a single space and
/// strips leading/trailing whitespace.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
1310
1311fn extract_pdf_text(path: &Path, bytes: &[u8]) -> String {
1312    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1313        return String::new();
1314    }
1315
1316    let extracted = catch_unwind(AssertUnwindSafe(
1317        || -> Result<String, Box<dyn std::error::Error>> {
1318            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1319            extract_first_pdf_page_text(&mut document)
1320        },
1321    ));
1322    if let Ok(Ok(text)) = extracted
1323        && let Some(normalized) = normalize_pdf_text(text)
1324    {
1325        return normalized;
1326    }
1327
1328    let extracted = catch_unwind(AssertUnwindSafe(
1329        || -> Result<String, Box<dyn std::error::Error>> {
1330            let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1331            extract_pdf_text_from_document(&mut document)
1332        },
1333    ));
1334    if let Ok(Ok(text)) = extracted
1335        && let Some(normalized) = normalize_pdf_text(text)
1336    {
1337        return normalized;
1338    }
1339
1340    let extracted = catch_unwind(AssertUnwindSafe(
1341        || -> Result<String, Box<dyn std::error::Error>> {
1342            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1343            extract_pdf_text_from_document(&mut document)
1344        },
1345    ));
1346    if let Ok(Ok(text)) = extracted
1347        && let Some(normalized) = normalize_pdf_text(text)
1348    {
1349        return normalized;
1350    }
1351
1352    String::new()
1353}
1354
1355fn extract_first_pdf_page_text(
1356    document: &mut pdf_oxide::document::PdfDocument,
1357) -> Result<String, Box<dyn std::error::Error>> {
1358    if document.page_count()? == 0 {
1359        return Ok(String::new());
1360    }
1361
1362    let extracted_text = document.extract_text(0)?;
1363    let markdown_text =
1364        document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1365    if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1366        return Ok(extracted_text);
1367    }
1368
1369    let pipeline_text =
1370        document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1371
1372    Ok(merge_pdf_first_page_text(
1373        &extracted_text,
1374        &markdown_text,
1375        &pipeline_text,
1376    ))
1377}
1378
1379fn extract_pdf_text_from_document(
1380    document: &mut pdf_oxide::document::PdfDocument,
1381) -> Result<String, Box<dyn std::error::Error>> {
1382    Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1383}
1384
/// Replaces carriage returns and form feeds with `\n`; returns `None` when the result
/// contains nothing but whitespace.
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized: String = text
        .chars()
        .map(|ch| match ch {
            '\r' | '\u{0c}' => '\n',
            other => other,
        })
        .collect();

    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
1389
1390fn merge_pdf_first_page_text(
1391    _extracted_text: &str,
1392    markdown_text: &str,
1393    pipeline_text: &str,
1394) -> String {
1395    let pipeline = pipeline_text.trim();
1396    if pipeline.is_empty() {
1397        return String::new();
1398    }
1399
1400    let prefix = pdf_first_page_heading_prefix(markdown_text);
1401    let Some(prefix) = prefix else {
1402        return pipeline_text.to_string();
1403    };
1404
1405    if pdf_text_contains_heading_prefix(pipeline, &prefix) {
1406        pipeline_text.to_string()
1407    } else {
1408        format!("{prefix}\n\n{pipeline}")
1409    }
1410}
1411
1412fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
1413    normalize_pdf_heading_comparison_text(text)
1414        .contains(&normalize_pdf_heading_comparison_text(prefix))
1415}
1416
/// Produces a comparison key for heading matching: ASCII letters are lowercased and
/// every whitespace run (including leading/trailing) is collapsed to a single space.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.extend(word.chars().map(|ch| ch.to_ascii_lowercase()));
    }

    normalized
}
1423
1424fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1425    let mut lines = Vec::new();
1426
1427    for line in pdf_markdown_heading_lines(markdown_text) {
1428        push_unique_line(&mut lines, line);
1429    }
1430
1431    (!lines.is_empty()).then(|| lines.join("\n"))
1432}
1433
1434fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1435    text.lines()
1436        .map(str::trim)
1437        .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1438        .map(|line| line.trim_matches('#').trim())
1439        .filter(|line| !line.is_empty())
1440        .filter(|line| !looks_like_numbered_section_heading(line))
1441        .take(4)
1442        .map(ToOwned::to_owned)
1443        .collect()
1444}
1445
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
1451
/// Returns `true` when `line` starts with one or more ASCII digits immediately followed
/// by a `.`, e.g. `1. Definitions`, `10. Warranty` or `1.2 Scope`.
///
/// Any number of leading digits is accepted, so section numbers of ten and above are
/// recognised as well as single-digit ones.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());
    let has_leading_digits = after_digits.len() < line.len();

    has_leading_digits && after_digits.starts_with('.')
}
1464
/// Returns `true` when `bytes` begins with `PK` followed by one of the marker pairs
/// `03 04`, `05 06` or `07 08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
1470
/// Extracts runs of printable ASCII from binary data, one run per output line.
///
/// Three passes are made over `bytes`, and only runs of at least four characters are
/// kept:
/// 1. plain runs of bytes in `0x20..=0x7E`;
/// 2. byte pairs starting at offset 0 where the first byte is printable and the second
///    is zero;
/// 3. byte pairs starting at offset 1 where the first byte is zero and the second is
///    printable.
///
/// Output is limited to the input length clamped to 2,000,000..=16,000,000 bytes. The
/// limit is checked only after a run has been flushed, so the final run may carry the
/// output past it.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    /// Output text plus the run currently being accumulated.
    struct Collector {
        output: String,
        pending: Vec<u8>,
        budget: usize,
    }

    impl Collector {
        /// Ends the current run, emitting it on its own line if it is long enough.
        fn flush(&mut self) {
            if self.pending.len() >= MIN_LEN {
                if !self.output.is_empty() {
                    self.output.push('\n');
                }
                self.output
                    .push_str(&String::from_utf8_lossy(&self.pending));
            }
            self.pending.clear();
        }

        fn budget_reached(&self) -> bool {
            self.output.len() >= self.budget
        }
    }

    let is_printable = |byte: u8| (0x20..=0x7E).contains(&byte);

    let mut collector = Collector {
        output: String::new(),
        pending: Vec::new(),
        budget: bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP),
    };

    // Pass 1: single-byte printable runs.
    for &byte in bytes {
        if is_printable(byte) {
            collector.pending.push(byte);
            continue;
        }
        collector.flush();
        if collector.budget_reached() {
            return collector.output;
        }
    }
    collector.flush();
    if collector.budget_reached() {
        return collector.output;
    }

    // Passes 2 and 3: two-byte units, with the zero byte on opposite sides.
    for start in 0..=1usize {
        collector.pending.clear();

        let aligned = bytes.get(start..).unwrap_or(&[]);
        for pair in aligned.chunks_exact(2) {
            let (candidate, padding) = if start == 0 {
                (pair[0], pair[1])
            } else {
                (pair[1], pair[0])
            };

            if is_printable(candidate) && padding == 0 {
                collector.pending.push(candidate);
                continue;
            }
            collector.flush();
            if collector.budget_reached() {
                return collector.output;
            }
        }

        collector.flush();
        if collector.budget_reached() {
            return collector.output;
        }
    }

    collector.output
}
1535
1536#[cfg(test)]
1537mod tests {
1538    use std::path::Path;
1539
1540    use super::{
1541        ExtractedTextKind, classify_file_info, extract_printable_strings,
1542        extract_text_for_detection, normalize_mime_type, normalize_pdf_heading_comparison_text,
1543    };
1544
1545    #[test]
1546    fn test_extract_text_for_detection_skips_jar_archives() {
1547        let path = Path::new(
1548            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
1549        );
1550        let bytes = std::fs::read(path).expect("failed to read jar fixture");
1551
1552        let (text, kind) = extract_text_for_detection(path, &bytes);
1553
1554        assert!(text.is_empty());
1555        assert_eq!(kind, ExtractedTextKind::None);
1556    }
1557
1558    #[test]
1559    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
1560        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
1561        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1562
1563        let (text, kind) = extract_text_for_detection(path, &bytes);
1564
1565        assert_eq!(kind, ExtractedTextKind::Pdf);
1566        assert!(text.contains("Redistribution and use in source and binary forms"));
1567    }
1568
1569    #[test]
1570    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
1571        let path =
1572            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
1573        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1574
1575        let (text, kind) = extract_text_for_detection(path, &bytes);
1576
1577        assert_eq!(kind, ExtractedTextKind::Pdf);
1578        assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
1579        assert!(!text.contains("DISCLAIMER OF WARRANTY"));
1580    }
1581
1582    #[test]
1583    fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
1584        let path =
1585            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
1586        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1587
1588        let (text, kind) = extract_text_for_detection(path, &bytes);
1589
1590        assert_eq!(kind, ExtractedTextKind::Pdf);
1591
1592        let normalized = normalize_pdf_heading_comparison_text(&text);
1593        let heading =
1594            normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
1595        assert_eq!(normalized.matches(&heading).count(), 1);
1596    }
1597
1598    #[test]
1599    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
1600        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
1601        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
1602
1603        let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
1604
1605        assert_eq!(kind, ExtractedTextKind::Pdf);
1606        assert!(text.contains("Redistribution and use in source and binary forms"));
1607    }
1608
1609    #[test]
1610    fn test_extract_text_for_detection_skips_zip_like_archives() {
1611        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";
1612
1613        let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
1614        let (crate_text, crate_kind) =
1615            extract_text_for_detection(Path::new("demo.crate"), zip_bytes);
1616
1617        assert!(whl_text.is_empty());
1618        assert_eq!(whl_kind, ExtractedTextKind::None);
1619        assert!(crate_text.is_empty());
1620        assert_eq!(crate_kind, ExtractedTextKind::None);
1621    }
1622
1623    #[test]
1624    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
1625        let path =
1626            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
1627        let bytes = std::fs::read(path).expect("failed to read lib fixture");
1628
1629        let (text, kind) = extract_text_for_detection(path, &bytes);
1630
1631        assert_ne!(kind, ExtractedTextKind::None);
1632        assert!(text.contains("Copyright nexB and others (c) 2012"));
1633    }
1634
1635    #[test]
1636    fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
1637        let bytes = b"abcd\0".repeat(525_000);
1638
1639        let text = extract_printable_strings(&bytes);
1640
1641        assert!(
1642            text.len() > 2_000_000,
1643            "unexpected truncation at {}",
1644            text.len()
1645        );
1646        assert!(text.ends_with("abcd"));
1647    }
1648
1649    #[test]
1650    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
1651        let path = Path::new(
1652            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
1653        );
1654        let bytes = std::fs::read(path).expect("failed to read svg fixture");
1655
1656        let (text, kind) = extract_text_for_detection(path, &bytes);
1657
1658        assert_eq!(kind, ExtractedTextKind::Decoded);
1659        assert!(text.contains("creativecommons.org/licenses/publicdomain"));
1660    }
1661
1662    #[test]
1663    fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
1664        let path = Path::new(
1665            "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
1666        );
1667        let bytes = std::fs::read(path).expect("failed to read rtf fixture");
1668
1669        let (text, kind) = extract_text_for_detection(path, &bytes);
1670
1671        assert_eq!(kind, ExtractedTextKind::Decoded);
1672        assert!(text.contains("GNU Lesser General Public"));
1673        assert!(text.contains("version"));
1674        assert!(text.contains("2.1 of the License"));
1675    }
1676
1677    #[test]
1678    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
1679        assert_eq!(
1680            normalize_mime_type(
1681                Path::new("main.ts"),
1682                b"export const answer = 42;\n",
1683                Some("TypeScript"),
1684                "video/mp2t",
1685            ),
1686            "text/plain"
1687        );
1688    }
1689
1690    #[test]
1691    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
1692        assert_eq!(
1693            normalize_mime_type(
1694                Path::new("main.js"),
1695                b"console.log('hello');\n",
1696                Some("JavaScript"),
1697                "application/octet-stream",
1698            ),
1699            "text/plain"
1700        );
1701    }
1702
1703    #[test]
1704    fn test_normalize_mime_type_preserves_binary_video_guess() {
1705        assert_eq!(
1706            normalize_mime_type(
1707                Path::new("main.ts"),
1708                &[0, 159, 146, 150, 0, 1, 2, 3],
1709                Some("TypeScript"),
1710                "video/mp2t",
1711            ),
1712            "video/mp2t"
1713        );
1714    }
1715
1716    #[test]
1717    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
1718        assert_eq!(
1719            normalize_mime_type(
1720                Path::new("main.ts"),
1721                &[0, 159, 146, 150],
1722                Some("TypeScript"),
1723                "application/octet-stream",
1724            ),
1725            "application/octet-stream"
1726        );
1727    }
1728
1729    #[test]
1730    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
1731        let classification = classify_file_info(Path::new("test.txt"), b"");
1732
1733        assert_eq!(classification.mime_type, "inode/x-empty");
1734        assert_eq!(classification.file_type, "empty");
1735        assert!(!classification.is_binary);
1736        assert!(classification.is_text);
1737        assert!(!classification.is_source);
1738        assert_eq!(classification.programming_language, None);
1739    }
1740
1741    #[test]
1742    fn test_classify_file_info_keeps_json_out_of_programming_language() {
1743        let classification = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);
1744
1745        assert_eq!(classification.mime_type, "application/json");
1746        assert_eq!(classification.file_type, "JSON text data");
1747        assert!(classification.is_text);
1748        assert!(!classification.is_source);
1749        assert_eq!(classification.programming_language, None);
1750    }
1751
1752    #[test]
1753    fn test_classify_file_info_treats_dockerfile_as_source() {
1754        let classification = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");
1755
1756        assert_eq!(
1757            classification.programming_language.as_deref(),
1758            Some("Dockerfile")
1759        );
1760        assert!(classification.is_source);
1761        assert!(!classification.is_script);
1762        assert_eq!(classification.file_type, "UTF-8 Unicode text");
1763    }
1764
1765    #[test]
1766    fn test_classify_file_info_treats_makefile_as_text_not_source() {
1767        let classification = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");
1768
1769        assert_eq!(classification.programming_language, None);
1770        assert!(classification.is_text);
1771        assert!(!classification.is_source);
1772        assert!(!classification.is_script);
1773        assert_eq!(classification.file_type, "UTF-8 Unicode text");
1774    }
1775
1776    #[test]
1777    fn test_classify_file_info_marks_supported_package_archives() {
1778        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";
1779
1780        let egg = classify_file_info(Path::new("demo.egg"), zip_bytes);
1781        let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_bytes);
1782
1783        assert!(egg.is_archive);
1784        assert_eq!(egg.mime_type, "application/zip");
1785        assert_eq!(egg.file_type, "Zip archive data");
1786        assert!(nupkg.is_archive);
1787        assert_eq!(nupkg.mime_type, "application/zip");
1788        assert_eq!(nupkg.file_type, "Zip archive data");
1789    }
1790
1791    #[test]
1792    fn test_classify_file_info_marks_png_as_binary_media() {
1793        let png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";
1794
1795        let classification = classify_file_info(Path::new("logo.png"), png_bytes);
1796
1797        assert_eq!(classification.mime_type, "image/png");
1798        assert_eq!(classification.file_type, "PNG image data");
1799        assert!(classification.is_binary);
1800        assert!(!classification.is_text);
1801        assert!(classification.is_media);
1802        assert!(!classification.is_archive);
1803        assert!(!classification.is_source);
1804    }
1805
1806    #[test]
1807    fn test_classify_file_info_marks_pdf_as_binary_document() {
1808        let pdf_bytes = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";
1809
1810        let classification = classify_file_info(Path::new("report.pdf"), pdf_bytes);
1811
1812        assert_eq!(classification.mime_type, "application/pdf");
1813        assert_eq!(classification.file_type, "PDF document");
1814        assert!(classification.is_binary);
1815        assert!(!classification.is_text);
1816        assert!(!classification.is_archive);
1817        assert!(!classification.is_media);
1818    }
1819
1820    #[test]
1821    fn test_classify_file_info_marks_binary_blobs_as_binary() {
1822        let classification =
1823            classify_file_info(Path::new("blob.bin"), &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5]);
1824
1825        assert!(classification.is_binary);
1826        assert!(!classification.is_text);
1827        assert!(!classification.is_source);
1828        assert_eq!(classification.programming_language, None);
1829    }
1830
1831    #[test]
1832    fn test_classify_file_info_treats_yaml_as_text_not_source() {
1833        let classification = classify_file_info(Path::new("config.yaml"), b"key: value\n");
1834
1835        assert_eq!(classification.programming_language, None);
1836        assert!(classification.is_text);
1837        assert!(!classification.is_source);
1838        assert_eq!(classification.file_type, "YAML text data");
1839    }
1840
1841    #[test]
1842    fn test_classify_file_info_classifies_common_build_manifests() {
1843        let gradle = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
1844        let flake = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
1845        let gitmodules = classify_file_info(
1846            Path::new(".gitmodules"),
1847            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
1848        );
1849
1850        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
1851        assert!(gradle.is_source);
1852        assert_eq!(gradle.mime_type, "text/plain");
1853
1854        assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
1855        assert!(flake.is_source);
1856        assert_eq!(flake.mime_type, "text/plain");
1857
1858        assert_eq!(gitmodules.programming_language, None);
1859        assert!(gitmodules.is_text);
1860        assert!(!gitmodules.is_source);
1861        assert_eq!(gitmodules.file_type, "Git configuration text");
1862    }
1863
1864    #[test]
1865    fn test_classify_file_info_labels_javascript_shebang_scripts() {
1866        let classification = classify_file_info(
1867            Path::new("bin/run"),
1868            b"#!/usr/bin/env node\nconsole.log('hello');\n",
1869        );
1870
1871        assert_eq!(
1872            classification.programming_language.as_deref(),
1873            Some("JavaScript")
1874        );
1875        assert!(classification.is_script);
1876        assert_eq!(
1877            classification.file_type,
1878            "javascript script, UTF-8 Unicode text executable"
1879        );
1880    }
1881
1882    #[test]
1883    fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
1884        let classification = classify_file_info(
1885            Path::new("script.py"),
1886            b"# coding: latin-1\nprint(\"caf\xe9\")\n",
1887        );
1888
1889        assert_eq!(
1890            classification.programming_language.as_deref(),
1891            Some("Python")
1892        );
1893        assert!(classification.is_script);
1894        assert_eq!(classification.file_type, "python script, text executable");
1895    }
1896
1897    #[test]
1898    fn test_classify_file_info_treats_textual_tga_as_media() {
1899        let classification = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");
1900
1901        assert!(classification.is_media);
1902        assert!(classification.is_text);
1903        assert!(!classification.is_binary);
1904    }
1905
1906    #[test]
1907    fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
1908        let classification =
1909            classify_file_info(Path::new("main.ts"), &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85]);
1910
1911        assert!(classification.is_binary);
1912        assert!(!classification.is_text);
1913        assert!(!classification.is_source);
1914        assert_eq!(classification.programming_language, None);
1915    }
1916
1917    #[test]
1918    fn test_extract_text_for_detection_skips_unsupported_image_formats() {
1919        let gif_bytes = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";
1920
1921        let (text, kind) = extract_text_for_detection(Path::new("tiny.gif"), gif_bytes);
1922
1923        assert!(text.is_empty());
1924        assert_eq!(kind, ExtractedTextKind::None);
1925    }
1926
1927    #[test]
1928    fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
1929        let cases = [
1930            (
1931                Path::new("bin/run"),
1932                b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
1933                Some("JavaScript"),
1934                true,
1935                true,
1936            ),
1937            (
1938                Path::new("Dockerfile"),
1939                b"FROM scratch\n".as_slice(),
1940                Some("Dockerfile"),
1941                true,
1942                false,
1943            ),
1944            (
1945                Path::new("package.json"),
1946                br#"{"name":"demo"}"#.as_slice(),
1947                None,
1948                false,
1949                false,
1950            ),
1951            (
1952                Path::new("config.yaml"),
1953                b"key: value\n".as_slice(),
1954                None,
1955                false,
1956                false,
1957            ),
1958            (
1959                Path::new("Makefile"),
1960                b"all:\n\techo hi\n".as_slice(),
1961                None,
1962                false,
1963                false,
1964            ),
1965        ];
1966
1967        for (path, bytes, expected_language, expected_is_source, expected_is_script) in cases {
1968            let classification = classify_file_info(path, bytes);
1969
1970            assert_eq!(
1971                classification.programming_language.as_deref(),
1972                expected_language,
1973                "unexpected language for {}",
1974                path.display()
1975            );
1976            assert_eq!(
1977                classification.is_source,
1978                expected_is_source,
1979                "unexpected is_source for {}",
1980                path.display()
1981            );
1982            assert_eq!(
1983                classification.is_script,
1984                expected_is_script,
1985                "unexpected is_script for {}",
1986                path.display()
1987            );
1988        }
1989    }
1990}
provenant/utils/file.rs

provenant/utils/
file.rs