Skip to main content

provenant/utils/
file.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use std::borrow::Cow;
5use std::collections::BTreeSet;
6use std::fs;
7use std::io::{BufReader, Cursor, Read};
8use std::panic::{AssertUnwindSafe, catch_unwind};
9use std::path::Path;
10
11use chrono::{TimeZone, Utc};
12use file_format::{FileFormat, Kind as FileFormatKind};
13use flate2::read::ZlibDecoder;
14use glob::Pattern;
15use image::{ImageDecoder, ImageFormat, ImageReader};
16use mime_guess::from_path;
17use quick_xml::events::Event;
18use quick_xml::reader::Reader as XmlReader;
19
20use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
21use crate::utils::font::extract_font_metadata_text;
22use crate::utils::language::detect_language;
23
/// Identifies which extraction strategy produced the text handed to detection.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ExtractedTextKind {
    /// No usable text could be extracted.
    None,
    /// Text obtained by decoding the file content as text (including RTF).
    Decoded,
    /// Text taken from font metadata, possibly followed by printable strings.
    FontMetadata,
    /// Text extracted from a PDF document.
    Pdf,
    /// Printable strings harvested from otherwise binary content.
    BinaryStrings,
    /// Text taken from image metadata.
    ImageMetadata,
    /// Text taken from Windows executable metadata only.
    WindowsExecutableMetadata,
}
34
/// Summary of everything `classify_file_info` determines about a file.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct FileInfoClassification {
    /// Detected MIME type, e.g. `text/plain` or `application/octet-stream`.
    pub mime_type: String,
    /// Human-readable file type description.
    pub file_type: String,
    /// Detected programming language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content is considered binary.
    pub is_binary: bool,
    /// Whether the content is considered text (the negation of `is_binary`).
    pub is_text: bool,
    /// Whether the file is an archive, compressed file or package.
    pub is_archive: bool,
    /// Whether the file is an image, audio or video file.
    pub is_media: bool,
    /// Whether the file is source code.
    pub is_source: bool,
    /// Whether the file is a script.
    pub is_script: bool,
}
47
/// Cap on the number of image metadata values.
// NOTE(review): the consumer is outside this chunk; presumably applied while collecting image metadata.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
/// Cap, in bytes, on image metadata text (32 KiB).
// NOTE(review): the consumer is outside this chunk; confirm where the cap is enforced.
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
/// Content counts as binary when more than `len / this` of its bytes are control characters.
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
/// Size threshold (512 KiB) for the large-opaque-binary handling.
// NOTE(review): the consumer is outside this chunk; confirm exact comparison semantics.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
/// Inputs larger than this (4 MiB) are never validated as JSON.
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;
/// Lower-case extensions of plain-text / data formats.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
/// Lower-case extensions that force a binary classification for non-textual formats.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
/// Lower-case extensions that mark a non-text file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
63
64/// Get the last modified date of a file as a `YYYY-MM-DD` string.
65pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
66    metadata.modified().ok().map(|time: std::time::SystemTime| {
67        let seconds_since_epoch = time
68            .duration_since(std::time::UNIX_EPOCH)
69            .unwrap()
70            .as_secs() as i64;
71
72        Utc.timestamp_opt(seconds_since_epoch, 0)
73            .single()
74            .unwrap_or_else(Utc::now)
75            .format("%Y-%m-%d")
76            .to_string()
77    })
78}
79
80/// Check if a path should be excluded based on a list of glob patterns.
81pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
82    let path_str = path.to_string_lossy();
83    let file_name = path
84        .file_name()
85        .map(|name| name.to_string_lossy())
86        .unwrap_or_default();
87
88    for pattern in exclude_patterns {
89        // Match against full path
90        if pattern.matches(&path_str) {
91            return true;
92        }
93
94        // Match against just the file/directory name
95        if pattern.matches(&file_name) {
96            return true;
97        }
98    }
99
100    false
101}
102
103/// Decode a byte buffer to a String, trying UTF-16 first when the byte shape
104/// strongly suggests it, then UTF-8, then Latin-1.
105///
106/// Latin-1 (ISO-8859-1) maps bytes 0x00-0xFF directly to Unicode U+0000-U+00FF,
107/// so it can decode any byte sequence. This matches Python ScanCode's use of
108/// `UnicodeDammit` which auto-detects encoding with Latin-1 as fallback.
109pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
110    if let Some(decoded) = decode_utf16_text(bytes) {
111        return decoded;
112    }
113
114    match String::from_utf8(bytes.to_vec()) {
115        Ok(s) => s,
116        Err(e) => {
117            let bytes = e.into_bytes();
118            if has_binary_control_chars(&bytes) {
119                return String::new();
120            }
121            bytes.iter().map(|&b| b as char).collect()
122        }
123    }
124}
125
126pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
127    let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
128    (text, kind)
129}
130
131pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
132    let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
133        return Cow::Borrowed(text);
134    };
135    if !matches!(
136        extension.to_ascii_lowercase().as_str(),
137        "md" | "markdown" | "html" | "htm"
138    ) {
139        return Cow::Borrowed(text);
140    }
141
142    let mut hints = Vec::new();
143    if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
144        hints.push("Creative Commons Attribution 4.0 International License".to_string());
145    }
146    if text.contains("Apache License (Version 2.0)") || text.contains("Apache License, Version 2.0")
147    {
148        hints.push(
149            "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
150                .to_string(),
151        );
152    }
153
154    hints.extend(extract_shields_license_badge_hints(text));
155
156    if hints.is_empty() {
157        Cow::Borrowed(text)
158    } else {
159        let mut augmented =
160            String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
161        augmented.push_str(text);
162        augmented.push_str("\n\n");
163        for (index, hint) in hints.into_iter().enumerate() {
164            if index > 0 {
165                augmented.push('\n');
166            }
167            augmented.push_str(&hint);
168        }
169        Cow::Owned(augmented)
170    }
171}
172
173fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
174    let mut hints = Vec::new();
175    let mut rest = text;
176    let needle = "img.shields.io/badge/license-";
177
178    while let Some(index) = rest.find(needle) {
179        let start = index + needle.len();
180        let suffix = &rest[start..];
181        let end = suffix
182            .find([')', ']', '"', '\'', ' ', '\n'])
183            .unwrap_or(suffix.len());
184        let badge = &suffix[..end];
185        let Some(badge) = badge.strip_suffix(".svg") else {
186            rest = &suffix[end..];
187            continue;
188        };
189
190        let mut segments: Vec<_> = badge
191            .split('-')
192            .filter(|segment| !segment.is_empty())
193            .collect();
194        if segments.len() < 2 {
195            rest = &suffix[end..];
196            continue;
197        }
198        segments.pop();
199        let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
200        if !candidate.is_empty() {
201            hints.push(canonical_shields_license_hint(&candidate));
202        }
203
204        rest = &suffix[end..];
205    }
206
207    hints.sort();
208    hints.dedup();
209    hints
210}
211
/// Turn a license name taken from a badge into a detection-friendly phrase.
///
/// The name is trimmed first. `MIT` and the two Apache 2.0 spellings map to
/// fixed phrases; any other name gets ` License` appended.
fn canonical_shields_license_hint(candidate: &str) -> String {
    let name = candidate.trim();
    if name == "MIT" {
        return String::from("The MIT License");
    }
    if matches!(name, "Apache-2.0" | "Apache 2.0") {
        return String::from("Apache License 2.0");
    }
    format!("{name} License")
}
219
220pub(crate) fn extract_text_for_detection_with_diagnostics(
221    path: &Path,
222    bytes: &[u8],
223) -> (String, ExtractedTextKind, Option<String>) {
224    let ext = path
225        .extension()
226        .and_then(|e| e.to_str())
227        .map(|s| s.to_ascii_lowercase());
228    let detected_format = detect_file_format(bytes);
229
230    if looks_like_rtf(bytes, ext.as_deref()) {
231        let text = extract_rtf_text(bytes);
232        return if text.trim().is_empty() {
233            (String::new(), ExtractedTextKind::None, None)
234        } else {
235            (text, ExtractedTextKind::Decoded, None)
236        };
237    }
238
239    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
240        let (text, scan_error) = extract_pdf_text(path, bytes);
241        return if text.is_empty() {
242            (String::new(), ExtractedTextKind::None, scan_error)
243        } else {
244            (text, ExtractedTextKind::Pdf, None)
245        };
246    }
247
248    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
249        let text = extract_image_metadata_text(bytes, format);
250        return if text.is_empty() {
251            if is_supported_image_container(bytes, format) {
252                (String::new(), ExtractedTextKind::None, None)
253            } else {
254                let decoded = decode_bytes_to_string(bytes);
255                if decoded.is_empty() {
256                    (String::new(), ExtractedTextKind::None, None)
257                } else {
258                    (decoded, ExtractedTextKind::Decoded, None)
259                }
260            }
261        } else {
262            (text, ExtractedTextKind::ImageMetadata, None)
263        };
264    }
265
266    if let Some(text) = extract_font_metadata_text(path, bytes) {
267        let strings = extract_printable_strings(bytes);
268        let combined = if strings.is_empty() {
269            text
270        } else {
271            combine_extracted_text_fragments(Some(text), strings)
272        };
273        return (combined, ExtractedTextKind::FontMetadata, None);
274    }
275
276    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
277    let large_opaque_binary = windows_executable_metadata_text.is_none()
278        && is_large_opaque_binary_candidate(bytes, detected_format);
279
280    if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
281        return windows_metadata_or_empty_result(windows_executable_metadata_text);
282    }
283
284    if should_skip_binary_string_extraction(path, bytes, detected_format) {
285        return (String::new(), ExtractedTextKind::None, None);
286    }
287
288    if !large_opaque_binary {
289        let decoded = decode_bytes_to_string(bytes);
290        if !decoded.is_empty() {
291            let combined =
292                combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
293            return (combined, ExtractedTextKind::Decoded, None);
294        }
295    }
296
297    let text = if large_opaque_binary {
298        extract_sampled_printable_strings(bytes)
299    } else {
300        extract_printable_strings(bytes)
301    };
302    if text.is_empty() {
303        windows_metadata_or_empty_result(windows_executable_metadata_text)
304    } else {
305        (
306            combine_extracted_text_fragments(windows_executable_metadata_text, text),
307            ExtractedTextKind::BinaryStrings,
308            None,
309        )
310    }
311}
312
/// Join an optional leading fragment and a trailing fragment with a newline.
///
/// A missing or empty `prefix` yields `suffix` unchanged; an empty `suffix`
/// yields the prefix alone, so no stray newline is ever produced.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(lead) = prefix.filter(|fragment| !fragment.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        lead
    } else {
        format!("{lead}\n{suffix}")
    }
}
320
321fn windows_metadata_or_empty_result(
322    windows_executable_metadata_text: Option<String>,
323) -> (String, ExtractedTextKind, Option<String>) {
324    if let Some(metadata_text) = windows_executable_metadata_text {
325        (
326            metadata_text,
327            ExtractedTextKind::WindowsExecutableMetadata,
328            None,
329        )
330    } else {
331        (String::new(), ExtractedTextKind::None, None)
332    }
333}
334
335pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
336    let detected_format = detect_file_format(bytes);
337    let detected_language = detect_language(path, bytes);
338    let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
339    let is_text = !is_binary;
340    let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
341    let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
342    let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
343    let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
344    let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
345    let programming_language = is_source.then(|| detected_language.clone()).flatten();
346    let file_type = detect_file_type(
347        path,
348        bytes,
349        detected_format,
350        &mime_type,
351        programming_language.as_deref(),
352        is_binary,
353        is_text,
354        is_archive,
355        is_media,
356        is_script,
357    );
358
359    FileInfoClassification {
360        mime_type,
361        file_type,
362        programming_language,
363        is_binary,
364        is_text,
365        is_archive,
366        is_media,
367        is_source,
368        is_script,
369    }
370}
371
372fn detect_file_format(bytes: &[u8]) -> FileFormat {
373    FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
374}
375
/// UTF-8 encoding of two consecutive U+FFFD replacement characters.
// NOTE(review): per the name, presumably what a UTF-16 BOM becomes after a
// lossy UTF-8 conversion — confirm against the producers of such files.
const CORRUPTED_UTF16_BOM_PREFIX: &[u8] = b"\xEF\xBF\xBD\xEF\xBF\xBD";
377
/// Report whether the whole buffer is valid UTF-8 (an empty buffer is).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
381
382fn strip_corrupted_utf16_bom_prefix(bytes: &[u8]) -> &[u8] {
383    bytes
384        .strip_prefix(CORRUPTED_UTF16_BOM_PREFIX)
385        .unwrap_or(bytes)
386}
387
388fn decode_utf16_units(bytes: &[u8], is_le: bool, require_text_shape: bool) -> Option<String> {
389    if bytes.is_empty() || !bytes.len().is_multiple_of(2) {
390        return None;
391    }
392
393    let code_units: Vec<u16> = bytes
394        .chunks_exact(2)
395        .map(|chunk| {
396            if is_le {
397                u16::from_le_bytes([chunk[0], chunk[1]])
398            } else {
399                u16::from_be_bytes([chunk[0], chunk[1]])
400            }
401        })
402        .collect();
403
404    let decoded = std::char::decode_utf16(code_units)
405        .collect::<Result<String, _>>()
406        .ok()?;
407
408    if !require_text_shape {
409        return (!decoded.contains('\0')).then_some(decoded);
410    }
411
412    let visible = decoded
413        .chars()
414        .filter(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'))
415        .count();
416    if visible < 3 || decoded.contains('\0') {
417        return None;
418    }
419
420    let alpha = decoded.chars().filter(|ch| ch.is_alphabetic()).count();
421    let punctuation = decoded
422        .chars()
423        .filter(|ch| {
424            matches!(
425                ch,
426                '{' | '}'
427                    | '['
428                    | ']'
429                    | '<'
430                    | '>'
431                    | '('
432                    | ')'
433                    | ':'
434                    | ';'
435                    | ','
436                    | '"'
437                    | '\''
438                    | '/'
439                    | '='
440                    | '-'
441                    | '_'
442                    | '#'
443                    | '!'
444            )
445        })
446        .count();
447    let whitespace = decoded.chars().filter(|ch| ch.is_whitespace()).count();
448
449    let textish = alpha + punctuation + whitespace;
450    if textish + (visible / 5) < visible || (alpha == 0 && punctuation < 2) {
451        return None;
452    }
453
454    Some(decoded)
455}
456
457fn detect_utf16_endianness(bytes: &[u8]) -> Option<bool> {
458    let stripped = strip_corrupted_utf16_bom_prefix(bytes);
459    if stripped.len() < 4 || !stripped.len().is_multiple_of(2) {
460        return None;
461    }
462
463    let pair_count = stripped.len() / 2;
464    let even_zero = stripped.iter().step_by(2).filter(|&&b| b == 0).count();
465    let odd_zero = stripped
466        .iter()
467        .skip(1)
468        .step_by(2)
469        .filter(|&&b| b == 0)
470        .count();
471
472    let looks_like_be = even_zero * 3 >= pair_count && odd_zero * 6 <= pair_count;
473    let looks_like_le = odd_zero * 3 >= pair_count && even_zero * 6 <= pair_count;
474
475    match (looks_like_le, looks_like_be) {
476        (true, false) => Some(true),
477        (false, true) => Some(false),
478        (true, true) => Some(true),
479        (false, false) => None,
480    }
481}
482
483fn decode_utf16_text(bytes: &[u8]) -> Option<String> {
484    if let Some(decoded) = decode_utf16_bom_text(bytes) {
485        return Some(decoded);
486    }
487
488    let stripped = strip_corrupted_utf16_bom_prefix(bytes);
489    match detect_utf16_endianness(bytes) {
490        Some(true) => decode_utf16_units(stripped, true, true),
491        Some(false) => decode_utf16_units(stripped, false, true),
492        None => None,
493    }
494}
495
496fn decode_utf16_json_text(bytes: &[u8]) -> Option<String> {
497    if bytes.len() >= 2 {
498        let (is_le, body) = match bytes {
499            [0xFF, 0xFE, rest @ ..] => (true, rest),
500            [0xFE, 0xFF, rest @ ..] => (false, rest),
501            _ => {
502                let stripped = strip_corrupted_utf16_bom_prefix(bytes);
503                return match detect_utf16_endianness(bytes) {
504                    Some(true) => decode_utf16_units(stripped, true, false),
505                    Some(false) => decode_utf16_units(stripped, false, false),
506                    None => None,
507                };
508            }
509        };
510
511        if body.is_empty() || !body.len().is_multiple_of(2) {
512            return None;
513        }
514
515        return decode_utf16_units(body, is_le, false);
516    }
517
518    None
519}
520
521fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
522    if bytes.len() < 2 || !bytes.len().is_multiple_of(2) {
523        return None;
524    }
525
526    let (is_le, body) = match bytes {
527        [0xFF, 0xFE, rest @ ..] => (true, rest),
528        [0xFE, 0xFF, rest @ ..] => (false, rest),
529        _ => return None,
530    };
531
532    if body.is_empty() || body.len() % 2 != 0 {
533        return None;
534    }
535
536    decode_utf16_units(body, is_le, true)
537}
538
539fn has_binary_control_chars(bytes: &[u8]) -> bool {
540    let control_count = bytes
541        .iter()
542        .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
543        .count();
544    control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
545}
546
547fn has_decodable_text(bytes: &[u8]) -> bool {
548    bytes.is_empty()
549        || is_utf8_text(bytes)
550        || decode_utf16_text(bytes).is_some()
551        || !has_binary_control_chars(bytes)
552}
553
554fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
555    if bytes.is_empty() || is_utf8_text(bytes) {
556        return true;
557    }
558    if let Some(decoded) = decode_utf16_text(bytes) {
559        return decoded
560            .chars()
561            .any(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'));
562    }
563
564    let printable_count = bytes
565        .iter()
566        .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
567        .count();
568    printable_count * 2 >= bytes.len()
569}
570
/// Report whether a media type denotes textual content: any `text/*` type,
/// JSON or XML, or a structured-syntax type ending in `+json` / `+xml`.
fn is_textual_media_type(media_type: &str) -> bool {
    const TEXTUAL_TYPES: [&str; 3] = ["application/json", "application/xml", "text/xml"];
    const TEXTUAL_SUFFIXES: [&str; 2] = ["+json", "+xml"];

    if media_type.starts_with("text/") || TEXTUAL_TYPES.contains(&media_type) {
        return true;
    }
    TEXTUAL_SUFFIXES
        .iter()
        .any(|suffix| media_type.ends_with(*suffix))
}
580
581fn is_textual_format(detected_format: FileFormat) -> bool {
582    matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
583        || is_textual_media_type(detected_format.media_type())
584}
585
586fn is_known_binary_format(detected_format: FileFormat) -> bool {
587    !matches!(detected_format, FileFormat::ArbitraryBinaryData)
588        && !is_textual_format(detected_format)
589}
590
591pub fn detect_mime_type(
592    path: &Path,
593    bytes: &[u8],
594    detected_format: FileFormat,
595    programming_language: Option<&str>,
596) -> String {
597    if bytes.is_empty() {
598        return "inode/x-empty".to_string();
599    }
600
601    if lower_extension(path).as_deref() == Some("json") {
602        if let Some(is_binary) = json_binary_override(bytes) {
603            if is_binary {
604                return "application/octet-stream".to_string();
605            }
606            if has_valid_json_text(bytes) {
607                return "application/json".to_string();
608            }
609            return "text/plain".to_string();
610        }
611        if has_valid_json_text(bytes) {
612            return "application/json".to_string();
613        }
614        if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) {
615            return "text/plain".to_string();
616        }
617        return "application/octet-stream".to_string();
618    }
619
620    if is_zip_archive(bytes) {
621        return detect_zip_like_mime(path);
622    }
623
624    if looks_like_deb(bytes, path) {
625        return "application/vnd.debian.binary-package".to_string();
626    }
627
628    if looks_like_rpm(bytes, path) {
629        return "application/x-rpm".to_string();
630    }
631
632    let guessed_mime = from_path(path)
633        .first_or_octet_stream()
634        .essence_str()
635        .to_string();
636
637    let mime_type = match detected_format {
638        FileFormat::Empty => "inode/x-empty".to_string(),
639        FileFormat::PlainText => {
640            if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
641                "text/plain".to_string()
642            } else {
643                guessed_mime.clone()
644            }
645        }
646        _ => {
647            let detected_mime = detected_format.media_type();
648            if detected_mime == "application/octet-stream"
649                && guessed_mime != "application/octet-stream"
650            {
651                guessed_mime.clone()
652            } else {
653                detected_mime.to_string()
654            }
655        }
656    };
657
658    normalize_mime_type(path, bytes, programming_language, &mime_type)
659}
660
661fn normalize_mime_type(
662    path: &Path,
663    bytes: &[u8],
664    programming_language: Option<&str>,
665    mime_type: &str,
666) -> String {
667    if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
668        return "text/plain".to_string();
669    }
670
671    mime_type.to_string()
672}
673
674fn should_prefer_text_mime(
675    path: &Path,
676    bytes: &[u8],
677    programming_language: Option<&str>,
678    mime_type: &str,
679) -> bool {
680    has_decodable_text(bytes)
681        && looks_like_textual_bytes(bytes)
682        && is_textual_source_candidate(path, programming_language)
683        && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
684}
685
686fn has_valid_json_text(bytes: &[u8]) -> bool {
687    if bytes.len() > JSON_VALIDATION_MAX_BYTES {
688        return false;
689    }
690
691    serde_json::from_slice::<serde_json::Value>(bytes).is_ok()
692        || decode_utf16_json_text(bytes)
693            .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok())
694            .is_some()
695}
696
/// Report whether the content has the shape `["…"]` (a single string wrapped
/// in an array), is at least 8 bytes long, and contains neither a NUL nor a
/// 0xFF byte.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    let has_wrapper_shape =
        bytes.len() >= 8 && bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]");
    has_wrapper_shape && !bytes.iter().any(|&byte| byte == 0 || byte == 0xFF)
}
704
705fn json_binary_override(bytes: &[u8]) -> Option<bool> {
706    if has_valid_json_text(bytes) {
707        return Some(false);
708    }
709
710    if bytes.contains(&0) {
711        return Some(true);
712    }
713
714    if bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024) {
715        return Some(true);
716    }
717
718    if is_wrapped_invalid_json_string_text(bytes) {
719        return Some(false);
720    }
721
722    None
723}
724
725fn detect_is_binary(
726    path: &Path,
727    bytes: &[u8],
728    detected_format: FileFormat,
729    programming_language: Option<&str>,
730) -> bool {
731    if lower_extension(path).as_deref() == Some("json")
732        && let Some(is_binary) = json_binary_override(bytes)
733    {
734        return is_binary;
735    }
736
737    if is_textual_format(detected_format) {
738        return false;
739    }
740
741    if lower_extension(path)
742        .as_deref()
743        .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
744    {
745        return true;
746    }
747
748    if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
749        return false;
750    }
751
752    has_binary_control_chars(bytes)
753        || is_known_binary_format(detected_format)
754        || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
755            && !looks_like_textual_bytes(bytes))
756}
757
758fn should_treat_binary_bytes_as_text(
759    path: &Path,
760    bytes: &[u8],
761    programming_language: Option<&str>,
762) -> bool {
763    has_decodable_text(bytes)
764        && looks_like_textual_bytes(bytes)
765        && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
766}
767
768fn detect_is_archive(
769    path: &Path,
770    bytes: &[u8],
771    mime_type: &str,
772    is_text: bool,
773    detected_format: FileFormat,
774) -> bool {
775    if is_text {
776        return false;
777    }
778
779    lower_extension(path)
780        .as_deref()
781        .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
782        || matches!(
783            detected_format.kind(),
784            FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
785        )
786        || is_zip_archive(bytes)
787        || looks_like_gzip(bytes)
788        || looks_like_bzip2(bytes)
789        || looks_like_xz(bytes)
790        || looks_like_deb(bytes, path)
791        || looks_like_rpm(bytes, path)
792        || looks_like_squashfs(bytes, path)
793        || mime_type.contains("zip")
794        || mime_type.contains("compressed")
795        || mime_type.contains("tar")
796        || mime_type.contains("x-rpm")
797        || mime_type.contains("debian")
798}
799
800fn detect_is_media(
801    path: &Path,
802    bytes: &[u8],
803    mime_type: &str,
804    detected_format: FileFormat,
805) -> bool {
806    media_mime_from_content(bytes).is_some()
807        || matches!(
808            detected_format.kind(),
809            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
810        )
811        || mime_type.starts_with("image/")
812        || mime_type.starts_with("audio/")
813        || mime_type.starts_with("video/")
814        || (mime_type == "application/octet-stream"
815            && lower_extension(path).as_deref() == Some("tga")
816            && !has_binary_control_chars(bytes))
817}
818
819fn detect_is_script(
820    path: &Path,
821    bytes: &[u8],
822    programming_language: Option<&str>,
823    is_text: bool,
824) -> bool {
825    if !is_text || is_makefile(path) {
826        return false;
827    }
828
829    bytes.starts_with(b"#!")
830        || lower_extension(path).as_deref().is_some_and(|ext| {
831            matches!(
832                ext,
833                "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
834            )
835        })
836        || matches!(
837            programming_language,
838            Some(
839                "Shell"
840                    | "Bash"
841                    | "Zsh"
842                    | "Fish"
843                    | "Ksh"
844                    | "Python"
845                    | "Ruby"
846                    | "Perl"
847                    | "PHP"
848                    | "PowerShell"
849                    | "Awk"
850            )
851        )
852}
853
854fn detect_is_source(
855    path: &Path,
856    programming_language: Option<&str>,
857    is_text: bool,
858    is_script: bool,
859) -> bool {
860    if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
861        return false;
862    }
863
864    if is_c_like_source(path) || is_java_like_source(path) {
865        return true;
866    }
867
868    programming_language.is_some() || is_script
869}
870
871#[allow(clippy::too_many_arguments)]
872fn detect_file_type(
873    path: &Path,
874    bytes: &[u8],
875    detected_format: FileFormat,
876    mime_type: &str,
877    programming_language: Option<&str>,
878    is_binary: bool,
879    is_text: bool,
880    is_archive: bool,
881    is_media: bool,
882    is_script: bool,
883) -> String {
884    if bytes.is_empty() {
885        return "empty".to_string();
886    }
887
888    if looks_like_pdf(bytes) {
889        return "PDF document".to_string();
890    }
891
892    if let Some(file_type) = media_file_type_from_content(bytes) {
893        return file_type.to_string();
894    }
895
896    if is_archive {
897        return archive_file_type(path, bytes, detected_format);
898    }
899
900    if is_script {
901        return script_file_type(programming_language, bytes);
902    }
903
904    if is_text {
905        if lower_extension(path).as_deref() == Some("json") {
906            if has_valid_json_text(bytes) {
907                return "JSON text data".to_string();
908            }
909            return text_file_type(bytes);
910        }
911        if lower_extension(path).as_deref() == Some("xml") {
912            return "XML text data".to_string();
913        }
914        if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
915            return "YAML text data".to_string();
916        }
917        if lower_extension(path).as_deref() == Some("toml") {
918            return "TOML text data".to_string();
919        }
920        if matches!(
921            lower_extension(path).as_deref(),
922            Some("ini" | "cfg" | "conf")
923        ) {
924            return "INI text data".to_string();
925        }
926        if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
927            return "Git configuration text".to_string();
928        }
929        if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
930            return text_file_type(bytes);
931        }
932        if programming_language.is_some() && !is_media {
933            return source_file_type(programming_language, bytes);
934        }
935        return text_file_type(bytes);
936    }
937
938    if let Some(file_type) = format_based_file_type(detected_format) {
939        return file_type;
940    }
941
942    if is_binary && mime_type == "application/octet-stream" {
943        return "data".to_string();
944    }
945
946    mime_type.to_string()
947}
948
949fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
950    if matches!(programming_language, Some(language) if is_source_like_language(language)) {
951        return true;
952    }
953
954    if matches!(
955        lower_file_name(path).as_str(),
956        "dockerfile"
957            | "containerfile"
958            | "containerfile.core"
959            | "apkbuild"
960            | "podfile"
961            | "jamfile"
962            | "jamroot"
963            | "meson.build"
964            | "build"
965            | "workspace"
966            | "buck"
967            | "default.nix"
968            | "flake.nix"
969            | "shell.nix"
970    ) {
971        return true;
972    }
973
974    path.extension()
975        .and_then(|ext| ext.to_str())
976        .is_some_and(|ext| {
977            matches!(
978                ext.to_ascii_lowercase().as_str(),
979                "rs" | "py"
980                    | "js"
981                    | "mjs"
982                    | "cjs"
983                    | "jsx"
984                    | "ts"
985                    | "mts"
986                    | "cts"
987                    | "tsx"
988                    | "c"
989                    | "cpp"
990                    | "cc"
991                    | "cxx"
992                    | "h"
993                    | "hpp"
994                    | "m"
995                    | "mm"
996                    | "s"
997                    | "asm"
998                    | "java"
999                    | "go"
1000                    | "rb"
1001                    | "php"
1002                    | "pl"
1003                    | "swift"
1004                    | "sh"
1005                    | "bash"
1006                    | "zsh"
1007                    | "fish"
1008                    | "ksh"
1009                    | "ps1"
1010                    | "psm1"
1011                    | "psd1"
1012                    | "awk"
1013                    | "kt"
1014                    | "kts"
1015                    | "dart"
1016                    | "scala"
1017                    | "groovy"
1018                    | "gradle"
1019                    | "gvy"
1020                    | "gy"
1021                    | "gsh"
1022                    | "cs"
1023                    | "fs"
1024                    | "fsx"
1025                    | "r"
1026                    | "lua"
1027                    | "jl"
1028                    | "ex"
1029                    | "exs"
1030                    | "clj"
1031                    | "cljs"
1032                    | "cljc"
1033                    | "hs"
1034                    | "erl"
1035                    | "nix"
1036                    | "zig"
1037                    | "bzl"
1038                    | "bazel"
1039                    | "star"
1040                    | "sky"
1041                    | "ml"
1042                    | "mli"
1043                    | "tex"
1044            )
1045        })
1046}
1047
/// Returns `true` when `language` is one of the language names this module
/// treats as source code. The comparison is exact and case-sensitive.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
1094
/// Returns the path's extension as `&str`, or `None` when the path has no
/// extension or the extension is not valid UTF-8. Case is left untouched.
fn extension(path: &Path) -> Option<&str> {
    let raw = path.extension()?;
    raw.to_str()
}
1098
1099fn lower_extension(path: &Path) -> Option<String> {
1100    extension(path).map(|ext| ext.to_ascii_lowercase())
1101}
1102
/// Returns the final path component lowercased (ASCII only).
///
/// Yields an empty string when the path has no file name or the name is not
/// valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
1109
1110fn is_plain_text(path: &Path) -> bool {
1111    lower_extension(path)
1112        .as_deref()
1113        .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
1114}
1115
1116fn is_makefile(path: &Path) -> bool {
1117    matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
1118}
1119
/// Returns `true` when the whole path, compared case-insensitively (ASCII),
/// ends in `.js.map` or `.css.map`.
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
1124
1125fn is_c_like_source(path: &Path) -> bool {
1126    lower_extension(path).as_deref().is_some_and(|ext| {
1127        matches!(
1128            ext,
1129            "c" | "cc"
1130                | "cp"
1131                | "cpp"
1132                | "cxx"
1133                | "c++"
1134                | "h"
1135                | "hh"
1136                | "hpp"
1137                | "hxx"
1138                | "h++"
1139                | "i"
1140                | "ii"
1141                | "m"
1142                | "s"
1143                | "asm"
1144        )
1145    })
1146}
1147
1148fn is_java_like_source(path: &Path) -> bool {
1149    lower_extension(path)
1150        .as_deref()
1151        .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
1152}
1153
1154fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
1155    match detected_format {
1156        FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
1157        format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
1158        format => Some(match format.kind() {
1159            FileFormatKind::Image => short_name_or_name(&format, "image data"),
1160            FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
1161            FileFormatKind::Video => short_name_or_name(&format, "video data"),
1162            _ => format.name().to_string(),
1163        }),
1164    }
1165}
1166
1167fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
1168    format
1169        .short_name()
1170        .map(|short_name| format!("{short_name} {suffix}"))
1171        .unwrap_or_else(|| format!("{} {suffix}", format.name()))
1172}
1173
1174fn detect_zip_like_mime(path: &Path) -> String {
1175    match extension(path).map(|ext| ext.to_ascii_lowercase()) {
1176        Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
1177        Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
1178            "application/java-archive".to_string()
1179        }
1180        _ => "application/zip".to_string(),
1181    }
1182}
1183
/// Sniffs the leading bytes for PNG, JPEG, TIFF or WebP signatures and
/// returns the matching MIME type, or `None` when none of them match.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PNG_MAGIC: &[u8] = b"\x89PNG\r\n\x1a\n";
    const JPEG_MAGIC: &[u8] = &[0xff, 0xd8, 0xff];
    const TIFF_MAGIC_II: &[u8] = b"II\x2a\x00";
    const TIFF_MAGIC_MM: &[u8] = b"MM\x00\x2a";

    if bytes.starts_with(PNG_MAGIC) {
        return Some("image/png");
    }
    if bytes.starts_with(JPEG_MAGIC) {
        return Some("image/jpeg");
    }
    if bytes.starts_with(TIFF_MAGIC_II) || bytes.starts_with(TIFF_MAGIC_MM) {
        return Some("image/tiff");
    }

    // WebP: "RIFF" at offset 0 and "WEBP" at offset 8.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    is_webp.then_some("image/webp")
}
1197
/// Sniffs the leading bytes for PNG, JPEG, TIFF or WebP signatures and
/// returns the matching file-type label, or `None` when none of them match.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PNG_MAGIC: &[u8] = b"\x89PNG\r\n\x1a\n";
    const JPEG_MAGIC: &[u8] = &[0xff, 0xd8, 0xff];
    const TIFF_MAGIC_II: &[u8] = b"II\x2a\x00";
    const TIFF_MAGIC_MM: &[u8] = b"MM\x00\x2a";

    if bytes.starts_with(PNG_MAGIC) {
        return Some("PNG image data");
    }
    if bytes.starts_with(JPEG_MAGIC) {
        return Some("JPEG image data");
    }
    if bytes.starts_with(TIFF_MAGIC_II) || bytes.starts_with(TIFF_MAGIC_MM) {
        return Some("TIFF image data");
    }

    // WebP: "RIFF" at offset 0 and "WEBP" at offset 8.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    is_webp.then_some("WebP image data")
}
1211
/// Returns `true` when the data begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    const PDF_MAGIC: &[u8] = b"%PDF-";
    bytes.get(..PDF_MAGIC.len()) == Some(PDF_MAGIC)
}
1215
/// Returns `true` when `ext` is exactly `"rtf"` or the data begins with `{\rtf`.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if matches!(ext, Some("rtf")) {
        return true;
    }
    bytes.starts_with(b"{\\rtf")
}
1219
/// Extracts an approximate plain-text rendering from RTF markup.
///
/// The input is decoded lossily as UTF-8 and scanned character by character:
/// - group braces `{` / `}` are dropped;
/// - `\\`, `\{`, `\}` emit the escaped character;
/// - `\'hh` emits the byte `hh` as the Unicode scalar with the same value;
/// - `\par` / `\line` emit a newline, `\tab` a tab, and a handful of
///   punctuation control words emit their character;
/// - `\uN` emits the Unicode character `N` (negative values are offset by
///   65536) and then skips its single fallback, which may be either a plain
///   character or a `\'hh` escape;
/// - every other control word or control symbol is discarded.
///
/// Finally `\r` and form feeds become newlines and trailing whitespace is
/// trimmed from each line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    /// Decodes the two hex digits of a `\'hh` escape whose apostrophe sits at
    /// `quote`. Returns `None` when either digit is missing or not hex.
    fn hex_escape(chars: &[char], quote: usize) -> Option<u8> {
        let high = chars.get(quote + 1)?.to_digit(16)?;
        let low = chars.get(quote + 2)?.to_digit(16)?;
        u8::try_from(high * 16 + low).ok()
    }

    /// Returns the index just past the fallback representation that follows a
    /// `\uN` escape starting at `index`.
    fn skip_unicode_fallback(chars: &[char], index: usize) -> usize {
        match chars.get(index).copied() {
            // A `\'hh` escape counts as one fallback character; without this
            // the ANSI fallback would be emitted right after the real character.
            Some('\\')
                if chars.get(index + 1) == Some(&'\'')
                    && hex_escape(chars, index + 1).is_some() =>
            {
                index + 4
            }
            // Structural characters and line breaks are never a fallback.
            Some('\\' | '{' | '}' | '\n' | '\r') | None => index,
            Some(_) => index + 1,
        }
    }

    /// Maps a `\uN` parameter to a character; RTF writes code points above
    /// 32767 as negative 16-bit values.
    fn unicode_escape_char(codepoint: i32) -> Option<char> {
        let normalized = if codepoint < 0 {
            codepoint + 65_536
        } else {
            codepoint
        };
        u32::try_from(normalized).ok().and_then(char::from_u32)
    }

    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;

    while index < chars.len() {
        match chars[index] {
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                let Some(&symbol) = chars.get(index) else {
                    break;
                };

                match symbol {
                    '\\' | '{' | '}' => {
                        output.push(symbol);
                        index += 1;
                    }
                    '\'' => match hex_escape(&chars, index) {
                        Some(byte) => {
                            output.push(char::from(byte));
                            index += 3;
                        }
                        // Malformed escape: drop the apostrophe and keep scanning.
                        None => index += 1,
                    },
                    control if control.is_ascii_alphabetic() => {
                        let word_start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[word_start..index].iter().collect();

                        // Optional signed numeric parameter directly after the word.
                        let number_start = index;
                        if index < chars.len()
                            && (chars[index] == '-' || chars[index].is_ascii_digit())
                        {
                            index += 1;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space terminating a control word is a delimiter, not text.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                if let Some(ch) = parameter
                                    .parse::<i32>()
                                    .ok()
                                    .and_then(unicode_escape_char)
                                {
                                    output.push(ch);
                                }
                                index = skip_unicode_fallback(&chars, index);
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}
1326
/// Returns `true` when the data begins with the bytes `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    const GZIP_MAGIC: [u8; 2] = [0x1f, 0x8b];
    bytes.starts_with(&GZIP_MAGIC)
}
1330
/// Returns `true` when the data begins with `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    const BZIP2_MAGIC: &[u8] = b"BZh";
    bytes.starts_with(BZIP2_MAGIC)
}
1334
/// Returns `true` when the data begins with the six bytes `fd 37 7a 58 5a 00`.
fn looks_like_xz(bytes: &[u8]) -> bool {
    const XZ_MAGIC: &[u8] = b"\xfd7zXZ\x00";
    bytes.starts_with(XZ_MAGIC)
}
1338
1339fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1340    lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1341}
1342
1343fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1344    lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1345}
1346
1347fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1348    lower_extension(path)
1349        .as_deref()
1350        .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1351        && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1352            || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1353}
1354
1355fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1356    if looks_like_deb(bytes, path) {
1357        "debian binary package (format 2.0)".to_string()
1358    } else if looks_like_rpm(bytes, path) {
1359        "RPM package".to_string()
1360    } else if looks_like_squashfs(bytes, path) {
1361        "Squashfs filesystem".to_string()
1362    } else if looks_like_gzip(bytes) {
1363        "gzip compressed data".to_string()
1364    } else if looks_like_bzip2(bytes) {
1365        "bzip2 compressed data".to_string()
1366    } else if looks_like_xz(bytes) {
1367        "XZ compressed data".to_string()
1368    } else if is_zip_archive(bytes) {
1369        "Zip archive data".to_string()
1370    } else if lower_extension(path).as_deref() == Some("gem") {
1371        "POSIX tar archive".to_string()
1372    } else if let Some(file_type) = format_based_file_type(detected_format) {
1373        file_type
1374    } else {
1375        "archive data".to_string()
1376    }
1377}
1378
1379fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1380    let suffix = text_executable_label(bytes);
1381
1382    match programming_language {
1383        Some("Python") => format!("python script, {suffix}"),
1384        Some("Ruby") => format!("ruby script, {suffix}"),
1385        Some("Perl") => format!("perl script, {suffix}"),
1386        Some("PHP") => format!("php script, {suffix}"),
1387        Some("Shell") => format!("shell script, {suffix}"),
1388        Some("Bash") => format!("bash script, {suffix}"),
1389        Some("Zsh") => format!("zsh script, {suffix}"),
1390        Some("Fish") => format!("fish script, {suffix}"),
1391        Some("Ksh") => format!("ksh script, {suffix}"),
1392        Some("JavaScript") => format!("javascript script, {suffix}"),
1393        Some("TypeScript") => format!("typescript script, {suffix}"),
1394        Some("PowerShell") => format!("powershell script, {suffix}"),
1395        Some("Awk") => format!("awk script, {suffix}"),
1396        _ => format!("script, {suffix}"),
1397    }
1398}
1399
1400fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1401    let suffix = text_label(bytes);
1402    match programming_language {
1403        Some("C") => format!("C source, {suffix}"),
1404        Some("C++") => format!("C++ source, {suffix}"),
1405        Some("Java") => format!("Java source, {suffix}"),
1406        Some("C#") => format!("C# source, {suffix}"),
1407        Some("F#") => format!("F# source, {suffix}"),
1408        Some("Go") => format!("Go source, {suffix}"),
1409        Some("Rust") => format!("Rust source, {suffix}"),
1410        Some("Starlark") => format!("Starlark source, {suffix}"),
1411        Some("CMake") => format!("CMake source, {suffix}"),
1412        Some("Meson") => format!("Meson source, {suffix}"),
1413        Some("Nix") => format!("Nix source, {suffix}"),
1414        Some("Groovy") => format!("Groovy source, {suffix}"),
1415        Some("Makefile") => format!("Makefile source, {suffix}"),
1416        Some("Dockerfile") => format!("Dockerfile source, {suffix}"),
1417        Some("Jamfile") => format!("Jamfile source, {suffix}"),
1418        Some("Batchfile") => format!("Batchfile source, {suffix}"),
1419        Some(language) => format!("{language} source, {suffix}"),
1420        None => text_file_type(bytes),
1421    }
1422}
1423
1424fn text_file_type(bytes: &[u8]) -> String {
1425    text_label(bytes).to_string()
1426}
1427
/// Labels text content by two properties: whether it is valid UTF-8 and
/// whether it contains at least one `\n` byte.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1441
/// Labels executable text content by two properties: whether it is valid
/// UTF-8 and whether it contains at least one `\n` byte.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1455
1456fn supported_image_metadata_format(
1457    ext: Option<&str>,
1458    detected_format: FileFormat,
1459) -> Option<ImageFormat> {
1460    match ext {
1461        Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1462        Some("png") => Some(ImageFormat::Png),
1463        Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1464        Some("webp") => Some(ImageFormat::WebP),
1465        _ => match detected_format.media_type() {
1466            "image/jpeg" => Some(ImageFormat::Jpeg),
1467            "image/png" => Some(ImageFormat::Png),
1468            "image/tiff" => Some(ImageFormat::Tiff),
1469            "image/webp" => Some(ImageFormat::WebP),
1470            _ => None,
1471        },
1472    }
1473}
1474
1475fn should_skip_binary_string_extraction(
1476    path: &Path,
1477    bytes: &[u8],
1478    detected_format: FileFormat,
1479) -> bool {
1480    matches!(lower_extension(path).as_deref(), Some("pdf"))
1481        || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1482            .is_some()
1483        || (matches!(
1484            detected_format.kind(),
1485            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1486        ) && !is_textual_format(detected_format))
1487        || media_mime_from_content(bytes).is_some()
1488        || is_zip_archive(bytes)
1489        || looks_like_gzip(bytes)
1490        || looks_like_bzip2(bytes)
1491        || looks_like_xz(bytes)
1492        || looks_like_deb(bytes, path)
1493        || looks_like_rpm(bytes, path)
1494        || looks_like_squashfs(bytes, path)
1495}
1496
1497fn should_skip_large_opaque_binary_text_extraction(
1498    _path: &Path,
1499    bytes: &[u8],
1500    detected_format: FileFormat,
1501) -> bool {
1502    is_large_opaque_binary_candidate(bytes, detected_format)
1503        && !sample_has_promising_printable_strings(bytes)
1504}
1505
1506fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1507    bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1508        && !is_textual_format(detected_format)
1509        && !matches!(
1510            detected_format.kind(),
1511            FileFormatKind::Archive
1512                | FileFormatKind::Compressed
1513                | FileFormatKind::Package
1514                | FileFormatKind::Audio
1515                | FileFormatKind::Image
1516                | FileFormatKind::Video
1517        )
1518}
1519
/// Computes up to three `(start, end)` byte windows (half-open) to sample
/// from a buffer of `len` bytes.
///
/// A head window of at most 64 KiB is always considered; a 64 KiB tail window
/// is added when `len` exceeds one window, and a centred middle window when
/// `len` exceeds two windows. Empty and duplicate windows are dropped. The
/// result is ordered head, middle, tail.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = Some((0, len.min(SAMPLE_WINDOW_BYTES)));
    let middle = (len > SAMPLE_WINDOW_BYTES * 2).then(|| {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        (start, (start + SAMPLE_WINDOW_BYTES).min(len))
    });
    let tail = (len > SAMPLE_WINDOW_BYTES).then(|| (len - SAMPLE_WINDOW_BYTES, len));

    let mut windows: Vec<(usize, usize)> = Vec::new();
    for window in [head, middle, tail].into_iter().flatten() {
        let (start, end) = window;
        if start < end && !windows.contains(&window) {
            windows.push(window);
        }
    }
    windows
}
1542
1543fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1544    let mut structured_signal_seen = false;
1545    let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1546        .into_iter()
1547        .filter(|&(start, end)| {
1548            let window = &bytes[start..end];
1549            if has_strong_structured_text_signal(window) {
1550                structured_signal_seen = true;
1551            }
1552            has_license_or_notice_signal(window)
1553        })
1554        .count();
1555
1556    structured_signal_seen || promising_license_windows >= 2
1557}
1558
1559fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1560    let mut combined_lines = BTreeSet::new();
1561
1562    for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1563        let window_text = extract_printable_strings(&bytes[start..end]);
1564        for line in window_text
1565            .lines()
1566            .map(str::trim)
1567            .filter(|line| !line.is_empty())
1568        {
1569            combined_lines.insert(line.to_string());
1570        }
1571    }
1572
1573    combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1574}
1575
1576fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1577    let strings = extract_printable_strings(bytes);
1578    if strings.is_empty() {
1579        return false;
1580    }
1581
1582    let lower = strings.to_ascii_lowercase();
1583    [
1584        "copyright",
1585        "license",
1586        "licensed under",
1587        "all rights reserved",
1588        "permission is hereby granted",
1589        "redistribution and use",
1590        "spdx-license-identifier",
1591    ]
1592    .iter()
1593    .any(|marker| lower.contains(marker))
1594}
1595
1596fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1597    let strings = extract_printable_strings(bytes);
1598    if strings.is_empty() {
1599        return false;
1600    }
1601
1602    let email_markers = strings.matches('@').count();
1603    let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1604
1605    email_markers + url_markers >= 3
1606}
1607
1608fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1609    match format {
1610        ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1611        ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1612        ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1613        ImageFormat::WebP => {
1614            bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1615        }
1616        _ => false,
1617    }
1618}
1619
1620fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1621    let mut values = Vec::new();
1622    values.extend(extract_exif_metadata_values(bytes));
1623    values.extend(extract_xmp_metadata_values(bytes, format));
1624    values_to_text(values)
1625}
1626
1627fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1628    let mut cursor = BufReader::new(Cursor::new(bytes));
1629    let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1630        Ok(exif) => exif,
1631        Err(_) => return Vec::new(),
1632    };
1633
1634    let mut values = Vec::new();
1635    for field in exif.fields() {
1636        let rendered = match field.tag {
1637            exif::Tag::ImageDescription => Some(format_metadata_field(
1638                "Description",
1639                &field.display_value().with_unit(&exif).to_string(),
1640            )),
1641            exif::Tag::Copyright => Some(format_metadata_field(
1642                "Copyright",
1643                &field.display_value().with_unit(&exif).to_string(),
1644            )),
1645            exif::Tag::UserComment => Some(format_metadata_field(
1646                "Comment",
1647                &field.display_value().with_unit(&exif).to_string(),
1648            )),
1649            exif::Tag::Artist => Some(format_metadata_field(
1650                "Author",
1651                &field.display_value().with_unit(&exif).to_string(),
1652            )),
1653            _ => None,
1654        };
1655
1656        if let Some(rendered) = rendered {
1657            values.push(rendered);
1658        }
1659    }
1660
1661    values
1662}
1663
1664fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1665    let xmp = match extract_raw_xmp_packet(bytes, format) {
1666        Some(xmp) => xmp,
1667        None => return Vec::new(),
1668    };
1669
1670    parse_xmp_values(&xmp)
1671}
1672
1673fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1674    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1675    if let Ok(mut decoder) = reader.into_decoder()
1676        && let Ok(Some(xmp)) = decoder.xmp_metadata()
1677    {
1678        return Some(xmp);
1679    }
1680
1681    match format {
1682        ImageFormat::Png => extract_png_xmp_packet(bytes),
1683        _ => None,
1684    }
1685}
1686
1687fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1688    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1689
1690    if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1691        return None;
1692    }
1693
1694    let mut offset = PNG_SIGNATURE.len();
1695    while offset + 12 <= bytes.len() {
1696        let length = u32::from_be_bytes([
1697            bytes[offset],
1698            bytes[offset + 1],
1699            bytes[offset + 2],
1700            bytes[offset + 3],
1701        ]) as usize;
1702        let chunk_start = offset + 8;
1703        let chunk_end = chunk_start + length;
1704        if chunk_end + 4 > bytes.len() {
1705            return None;
1706        }
1707
1708        let chunk_type = &bytes[offset + 4..offset + 8];
1709        if chunk_type == b"iTXt" {
1710            let data = &bytes[chunk_start..chunk_end];
1711            if let Some(xmp) = parse_png_itxt_xmp(data) {
1712                return Some(xmp);
1713            }
1714        }
1715
1716        offset = chunk_end + 4;
1717    }
1718
1719    None
1720}
1721
1722fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1723    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1724
1725    let keyword_end = data.iter().position(|&b| b == 0)?;
1726    if &data[..keyword_end] != XMP_KEYWORD {
1727        return None;
1728    }
1729
1730    let mut cursor = keyword_end + 1;
1731    let compression_flag = *data.get(cursor)?;
1732    cursor += 1;
1733    let compression_method = *data.get(cursor)?;
1734    cursor += 1;
1735    if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1736        return None;
1737    }
1738
1739    let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1740    cursor = language_end + 1;
1741
1742    let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1743    cursor = translated_end + 1;
1744
1745    let text_bytes = &data[cursor..];
1746    if compression_flag == 1 {
1747        let mut decoder = ZlibDecoder::new(text_bytes);
1748        let mut decoded = Vec::new();
1749        decoder.read_to_end(&mut decoded).ok()?;
1750        Some(decoded)
1751    } else {
1752        Some(text_bytes.to_vec())
1753    }
1754}
1755
1756fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1757    let mut reader = XmlReader::from_reader(xmp);
1758    reader.config_mut().trim_text(true);
1759
1760    let mut buf = Vec::new();
1761    let mut stack: Vec<String> = Vec::new();
1762    let mut values = Vec::new();
1763
1764    loop {
1765        match reader.read_event_into(&mut buf) {
1766            Ok(Event::Start(e)) => {
1767                stack.push(local_xml_name(e.name().as_ref()));
1768            }
1769            Ok(Event::End(_)) => {
1770                stack.pop();
1771            }
1772            Ok(Event::Empty(_)) => {}
1773            Ok(Event::Text(text)) => {
1774                if let Some(field) = stack
1775                    .iter()
1776                    .rev()
1777                    .find_map(|name| allowed_xmp_field(name.as_str()))
1778                    && let Ok(decoded) = text.decode()
1779                {
1780                    let decoded = decoded.into_owned();
1781                    if !decoded.trim().is_empty() {
1782                        values.push(format_xmp_value(field, &decoded));
1783                    }
1784                }
1785            }
1786            Ok(Event::CData(text)) => {
1787                if let Some(field) = stack
1788                    .iter()
1789                    .rev()
1790                    .find_map(|name| allowed_xmp_field(name.as_str()))
1791                    && let Ok(decoded) = text.decode()
1792                {
1793                    let decoded = decoded.into_owned();
1794                    if !decoded.trim().is_empty() {
1795                        values.push(format_xmp_value(field, &decoded));
1796                    }
1797                }
1798            }
1799            Ok(Event::Eof) | Err(_) => break,
1800            _ => {}
1801        }
1802        buf.clear();
1803    }
1804
1805    values
1806}
1807
/// Returns the part of a qualified XML name after the last `:` (the whole
/// name when there is no prefix). Names that are not valid UTF-8 yield an
/// empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    match qualified.rsplit_once(':') {
        Some((_, local)) => local.to_string(),
        None => qualified.to_string(),
    }
}
1812
/// Maps an XMP element's local name to the internal field key, or `None`
/// when the element is not one that is collected. Matching is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    // (element local name, internal field key)
    const FIELD_KEYS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELD_KEYS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, key)| key)
}
1825
1826fn format_xmp_value(field: &str, value: &str) -> String {
1827    match field {
1828        "creator" => format_metadata_field("Author", value),
1829        "rights" => format_metadata_field("Copyright", value),
1830        "description" => format_metadata_field("Description", value),
1831        "title" => format_metadata_field("Title", value),
1832        "subject" => format_metadata_field("Subject", value),
1833        "usage_terms" => format_metadata_field("UsageTerms", value),
1834        "web_statement" => format_metadata_field("WebStatement", value),
1835        _ => value.to_string(),
1836    }
1837}
1838
/// Joins a label and a value as `"label: value"`.
fn format_metadata_field(label: &str, value: &str) -> String {
    [label, value].join(": ")
}
1842
1843fn values_to_text(values: Vec<String>) -> String {
1844    let mut seen = BTreeSet::new();
1845    let mut normalized_lines = Vec::new();
1846
1847    for value in values {
1848        let normalized = normalize_metadata_value(&value);
1849        if normalized.is_empty() || !seen.insert(normalized.clone()) {
1850            continue;
1851        }
1852
1853        normalized_lines.push(normalized);
1854    }
1855
1856    let author_values: BTreeSet<String> = normalized_lines
1857        .iter()
1858        .filter_map(|line| split_metadata_field(line))
1859        .filter(|(label, _)| label.eq_ignore_ascii_case("Author"))
1860        .map(|(_, value)| value.to_string())
1861        .collect();
1862
1863    let mut lines = Vec::new();
1864    let mut total_bytes = 0usize;
1865
1866    for normalized in normalized_lines {
1867        if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1868            break;
1869        }
1870
1871        if should_suppress_bare_copyright_metadata_line(&normalized, &author_values) {
1872            continue;
1873        }
1874
1875        let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1876        if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1877            break;
1878        }
1879
1880        total_bytes += added_bytes;
1881        lines.push(normalized);
1882    }
1883
1884    lines.join("\n")
1885}
1886
/// Splits a `label: value` line at its first colon.
///
/// Both halves are returned with surrounding whitespace trimmed. Returns
/// `None` when the line contains no colon.
fn split_metadata_field(line: &str) -> Option<(&str, &str)> {
    let colon = line.find(':')?;
    let label = &line[..colon];
    let value = &line[colon + 1..];
    Some((label.trim(), value.trim()))
}
1891
1892fn should_suppress_bare_copyright_metadata_line(
1893    line: &str,
1894    author_values: &BTreeSet<String>,
1895) -> bool {
1896    let Some((label, value)) = split_metadata_field(line) else {
1897        return false;
1898    };
1899    if !label.eq_ignore_ascii_case("Copyright")
1900        || value.is_empty()
1901        || !author_values.contains(value)
1902    {
1903        return false;
1904    }
1905
1906    let lower = value.to_ascii_lowercase();
1907    !lower.contains("copyright")
1908        && !lower.contains("(c)")
1909        && !lower.contains('©')
1910        && !lower.contains("all rights")
1911        && !value.chars().any(|ch| ch.is_ascii_digit())
1912}
1913
/// Cleans up a raw metadata value.
///
/// NUL characters are removed first (so text on either side of a NUL is
/// joined), then every run of whitespace is collapsed to a single space and
/// leading/trailing whitespace is dropped.
fn normalize_metadata_value(value: &str) -> String {
    let without_nul = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nul.len());
    for word in without_nul.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
1925
1926fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1927    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1928        return (String::new(), None);
1929    }
1930
1931    let mut failures = Vec::new();
1932    let mut saw_success = false;
1933
1934    let extracted = catch_unwind(AssertUnwindSafe(
1935        || -> Result<String, Box<dyn std::error::Error>> {
1936            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1937            extract_first_pdf_page_text(&mut document)
1938        },
1939    ));
1940    match extracted {
1941        Ok(Ok(text)) => {
1942            saw_success = true;
1943            if let Some(normalized) = normalize_pdf_text(text) {
1944                return (normalized, None);
1945            }
1946        }
1947        Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1948        Err(payload) => failures.push(format!(
1949            "from-bytes first-page panic: {}",
1950            panic_payload_to_string(payload.as_ref())
1951        )),
1952    }
1953
1954    let extracted = catch_unwind(AssertUnwindSafe(
1955        || -> Result<String, Box<dyn std::error::Error>> {
1956            let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1957            extract_pdf_text_from_document(&mut document)
1958        },
1959    ));
1960    match extracted {
1961        Ok(Ok(text)) => {
1962            saw_success = true;
1963            if let Some(normalized) = normalize_pdf_text(text) {
1964                return (normalized, None);
1965            }
1966        }
1967        Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
1968        Err(payload) => failures.push(format!(
1969            "open full-document panic: {}",
1970            panic_payload_to_string(payload.as_ref())
1971        )),
1972    }
1973
1974    let extracted = catch_unwind(AssertUnwindSafe(
1975        || -> Result<String, Box<dyn std::error::Error>> {
1976            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1977            extract_pdf_text_from_document(&mut document)
1978        },
1979    ));
1980    match extracted {
1981        Ok(Ok(text)) => {
1982            saw_success = true;
1983            if let Some(normalized) = normalize_pdf_text(text) {
1984                return (normalized, None);
1985            }
1986        }
1987        Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
1988        Err(payload) => failures.push(format!(
1989            "from-bytes full-document panic: {}",
1990            panic_payload_to_string(payload.as_ref())
1991        )),
1992    }
1993
1994    if saw_success || is_non_actionable_pdf_failure(&failures) {
1995        (String::new(), None)
1996    } else {
1997        (
1998            String::new(),
1999            Some(format!(
2000                "PDF text extraction failed after {} attempts: {}",
2001                failures.len(),
2002                failures.join("; ")
2003            )),
2004        )
2005    }
2006}
2007
/// Reports whether every recorded PDF failure is one of the known kinds that
/// are not surfaced as a scan error.
///
/// Returns `false` for an empty list. Otherwise every message must contain at
/// least one of the known substrings below.
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(*marker))
    })
}
2018
/// Converts a panic payload into a printable message.
///
/// Payloads holding a `&str` or a `String` yield that text; any other payload
/// type yields the fixed placeholder `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
2028
2029fn extract_first_pdf_page_text(
2030    document: &mut pdf_oxide::document::PdfDocument,
2031) -> Result<String, Box<dyn std::error::Error>> {
2032    if document.page_count()? == 0 {
2033        return Ok(String::new());
2034    }
2035
2036    let extracted_text = document.extract_text(0)?;
2037    let markdown_text =
2038        document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
2039    if pdf_markdown_heading_lines(&markdown_text).is_empty() {
2040        return Ok(extracted_text);
2041    }
2042
2043    let pipeline_text =
2044        document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
2045
2046    Ok(merge_pdf_first_page_text(
2047        &extracted_text,
2048        &markdown_text,
2049        &pipeline_text,
2050    ))
2051}
2052
2053fn extract_pdf_text_from_document(
2054    document: &mut pdf_oxide::document::PdfDocument,
2055) -> Result<String, Box<dyn std::error::Error>> {
2056    Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
2057}
2058
/// Normalizes line breaks in extracted PDF text.
///
/// Every carriage return and form feed is replaced by a newline. Returns
/// `None` when the result contains nothing but whitespace.
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized: String = text
        .chars()
        .map(|ch| match ch {
            '\r' | '\u{0c}' => '\n',
            other => other,
        })
        .collect();

    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
2063
2064fn merge_pdf_first_page_text(
2065    _extracted_text: &str,
2066    markdown_text: &str,
2067    pipeline_text: &str,
2068) -> String {
2069    let pipeline = pipeline_text.trim();
2070    if pipeline.is_empty() {
2071        return String::new();
2072    }
2073
2074    let prefix = pdf_first_page_heading_prefix(markdown_text);
2075    let Some(prefix) = prefix else {
2076        return pipeline_text.to_string();
2077    };
2078
2079    if pdf_text_contains_heading_prefix(pipeline, &prefix) {
2080        pipeline_text.to_string()
2081    } else {
2082        format!("{prefix}\n\n{pipeline}")
2083    }
2084}
2085
2086fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
2087    normalize_pdf_heading_comparison_text(text)
2088        .contains(&normalize_pdf_heading_comparison_text(prefix))
2089}
2090
/// Produces a comparison key for heading text: whitespace-separated words,
/// ASCII-lowercased, joined by single spaces.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut key = String::with_capacity(text.len());

    for word in text.split_whitespace() {
        if !key.is_empty() {
            key.push(' ');
        }
        key.extend(word.chars().map(|ch| ch.to_ascii_lowercase()));
    }

    key
}
2097
2098fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
2099    let mut lines = Vec::new();
2100
2101    for line in pdf_markdown_heading_lines(markdown_text) {
2102        push_unique_line(&mut lines, line);
2103    }
2104
2105    (!lines.is_empty()).then(|| lines.join("\n"))
2106}
2107
2108fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
2109    text.lines()
2110        .map(str::trim)
2111        .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
2112        .map(|line| line.trim_matches('#').trim())
2113        .filter(|line| !line.is_empty())
2114        .filter(|line| !looks_like_numbered_section_heading(line))
2115        .take(4)
2116        .map(ToOwned::to_owned)
2117        .collect()
2118}
2119
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
2125
/// Reports whether `line` starts like a numbered section heading.
///
/// A line matches when it begins with one or more ASCII digits immediately
/// followed by a `.`, for example `1. Scope`, `10. Terms` or `1.2 Details`.
/// Accepting a run of digits (not just a single one) keeps sections numbered
/// 10 and above from being mistaken for title headings.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // At least one digit must have been consumed before the dot.
    after_digits.len() < line.len() && after_digits.starts_with('.')
}
2138
/// Reports whether `bytes` begins with one of the three `PK` four-byte
/// signatures listed below.
fn is_zip_archive(bytes: &[u8]) -> bool {
    const ZIP_SIGNATURES: [&[u8]; 3] = [b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"];

    ZIP_SIGNATURES
        .iter()
        .any(|signature| bytes.starts_with(signature))
}
2144
/// Extracts human-readable strings from arbitrary binary data.
///
/// Three scans run over `bytes`; every run of at least four printable ASCII
/// characters (`0x20..=0x7E`) found by a scan is appended to the output on
/// its own line:
///
/// 1. a byte-wise scan for plain ASCII runs;
/// 2. a scan over byte pairs starting at offset 0, where each pair must be a
///    printable byte followed by a zero byte;
/// 3. a scan over byte pairs starting at offset 1, where each pair must be a
///    zero byte followed by a printable byte.
///
/// The output budget is the input length clamped to the range
/// 2,000,000..=16,000,000 bytes. It is checked only after a run has been
/// flushed, so the returned text can exceed it by the final run.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    fn is_printable_ascii(byte: u8) -> bool {
        (0x20..=0x7E).contains(&byte)
    }

    /// Appends the pending run as a new line if it is long enough, then
    /// resets it either way.
    fn emit_pending(output: &mut String, pending: &mut Vec<u8>) {
        if pending.len() >= MIN_LEN {
            if !output.is_empty() {
                output.push('\n');
            }
            // `pending` only ever holds printable ASCII bytes.
            output.extend(pending.iter().map(|&byte| char::from(byte)));
        }
        pending.clear();
    }

    let budget = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);
    let mut output = String::new();
    let mut pending: Vec<u8> = Vec::new();

    // Scan 1: single-byte ASCII runs.
    for &byte in bytes {
        if is_printable_ascii(byte) {
            pending.push(byte);
            continue;
        }
        emit_pending(&mut output, &mut pending);
        if output.len() >= budget {
            return output;
        }
    }
    emit_pending(&mut output, &mut pending);
    if output.len() >= budget {
        return output;
    }

    // Scans 2 and 3: two-byte units whose other byte is zero.
    for offset in 0..2usize {
        pending.clear();
        let window = bytes.get(offset..).unwrap_or_default();

        for pair in window.chunks_exact(2) {
            // At offset 0 the character comes first; at offset 1 the zero does.
            let (candidate, partner) = if offset == 0 {
                (pair[0], pair[1])
            } else {
                (pair[1], pair[0])
            };

            if partner == 0 && is_printable_ascii(candidate) {
                pending.push(candidate);
                continue;
            }
            emit_pending(&mut output, &mut pending);
            if output.len() >= budget {
                return output;
            }
        }

        emit_pending(&mut output, &mut pending);
        if output.len() >= budget {
            return output;
        }
    }

    output
}
2209
2210#[cfg(test)]
2211mod tests {
2212    use std::path::Path;
2213
2214    use crate::copyright::detect_copyrights;
2215
2216    use super::{
2217        ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, classify_file_info,
2218        extract_printable_strings, extract_text_for_detection,
2219        extract_text_for_detection_with_diagnostics, format_metadata_field, format_xmp_value,
2220        is_non_actionable_pdf_failure, normalize_mime_type, normalize_pdf_heading_comparison_text,
2221        values_to_text, windows_metadata_or_empty_result,
2222    };
2223
2224    fn png_chunk(chunk_type: &[u8; 4], data: &[u8]) -> Vec<u8> {
2225        let mut out = Vec::new();
2226        out.extend_from_slice(&(data.len() as u32).to_be_bytes());
2227        out.extend_from_slice(chunk_type);
2228        out.extend_from_slice(data);
2229        out.extend_from_slice(&0u32.to_be_bytes());
2230        out
2231    }
2232
2233    fn build_png_with_xmp(xmp: &str) -> Vec<u8> {
2234        let mut bytes = Vec::new();
2235        bytes.extend_from_slice(b"\x89PNG\r\n\x1a\n");
2236
2237        let ihdr = [
2238            0, 0, 0, 1, // width
2239            0, 0, 0, 1, // height
2240            8, // bit depth
2241            2, // color type
2242            0, // compression
2243            0, // filter
2244            0, // interlace
2245        ];
2246        bytes.extend_from_slice(&png_chunk(b"IHDR", &ihdr));
2247
2248        let mut itxt = Vec::new();
2249        itxt.extend_from_slice(b"XML:com.adobe.xmp");
2250        itxt.push(0); // keyword terminator
2251        itxt.push(0); // compression flag
2252        itxt.push(0); // compression method
2253        itxt.push(0); // language tag terminator
2254        itxt.push(0); // translated keyword terminator
2255        itxt.extend_from_slice(xmp.as_bytes());
2256        bytes.extend_from_slice(&png_chunk(b"iTXt", &itxt));
2257
2258        bytes.extend_from_slice(&png_chunk(b"IEND", &[]));
2259        bytes
2260    }
2261
2262    #[test]
2263    fn test_extract_text_for_detection_skips_jar_archives() {
2264        let path = Path::new(
2265            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
2266        );
2267        let bytes = std::fs::read(path).expect("failed to read jar fixture");
2268
2269        let (text, kind) = extract_text_for_detection(path, &bytes);
2270
2271        assert!(text.is_empty());
2272        assert_eq!(kind, ExtractedTextKind::None);
2273    }
2274
2275    #[test]
2276    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
2277        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2278        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2279
2280        let (text, kind) = extract_text_for_detection(path, &bytes);
2281
2282        assert_eq!(kind, ExtractedTextKind::Pdf);
2283        assert!(text.contains("Redistribution and use in source and binary forms"));
2284    }
2285
2286    #[test]
2287    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
2288        let path =
2289            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2290        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2291
2292        let (text, kind) = extract_text_for_detection(path, &bytes);
2293
2294        assert_eq!(kind, ExtractedTextKind::Pdf);
2295        assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
2296        assert!(!text.contains("DISCLAIMER OF WARRANTY"));
2297    }
2298
2299    #[test]
2300    fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
2301        let path =
2302            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2303        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2304
2305        let (text, kind) = extract_text_for_detection(path, &bytes);
2306
2307        assert_eq!(kind, ExtractedTextKind::Pdf);
2308
2309        let normalized = normalize_pdf_heading_comparison_text(&text);
2310        let heading =
2311            normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
2312        assert_eq!(normalized.matches(&heading).count(), 1);
2313    }
2314
2315    #[test]
2316    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
2317        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2318        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2319
2320        let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
2321
2322        assert_eq!(kind, ExtractedTextKind::Pdf);
2323        assert!(text.contains("Redistribution and use in source and binary forms"));
2324    }
2325
2326    #[test]
2327    fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
2328        let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";
2329
2330        let (text, kind, scan_error) =
2331            extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);
2332
2333        assert!(text.is_empty());
2334        assert_eq!(kind, ExtractedTextKind::None);
2335        let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
2336        assert!(scan_error.contains("PDF text extraction failed after"));
2337    }
2338
2339    #[test]
2340    fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
2341        let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2342
2343        let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
2344
2345        assert!(text.is_empty());
2346        assert_eq!(kind, ExtractedTextKind::None);
2347    }
2348
2349    #[test]
2350    fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
2351        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2352        let text = b"Copyright 2026 Example Project!!!";
2353        bytes[..text.len()].copy_from_slice(text);
2354        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2355        bytes[second_offset..second_offset + text.len()].copy_from_slice(text);
2356
2357        let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);
2358
2359        assert_ne!(kind, ExtractedTextKind::None);
2360        assert!(text.contains("Copyright 2026 Example Project"));
2361    }
2362
2363    #[test]
2364    fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
2365        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2366        let noise = b"(c) $1234567890ABCDEF[]{}--==++";
2367        bytes[..noise.len()].copy_from_slice(noise);
2368        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2369        bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);
2370
2371        let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);
2372
2373        assert!(text.is_empty());
2374        assert_eq!(kind, ExtractedTextKind::None);
2375    }
2376
2377    #[test]
2378    fn test_extract_text_for_detection_uses_windows_executable_metadata() {
2379        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2380        let bytes = std::fs::read(path).expect("read PE fixture");
2381
2382        let (text, kind) = extract_text_for_detection(path, &bytes);
2383
2384        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2385        assert!(text.contains("License: This program is free software"));
2386        assert!(text.contains("LegalCopyright:"));
2387    }
2388
2389    #[test]
2390    fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
2391    {
2392        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2393        let mut bytes = std::fs::read(path).expect("read PE fixture");
2394        bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);
2395
2396        let (text, kind) = extract_text_for_detection(path, &bytes);
2397
2398        assert_ne!(kind, ExtractedTextKind::None);
2399        assert!(!text.trim().is_empty());
2400    }
2401
2402    #[test]
2403    fn test_windows_metadata_or_empty_result_preserves_metadata() {
2404        let (text, kind, scan_error) =
2405            windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));
2406
2407        assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
2408        assert_eq!(text, "LegalCopyright: Example Corp");
2409        assert!(scan_error.is_none());
2410    }
2411
2412    #[test]
2413    fn test_format_xmp_value_labels_creator_and_title_fields() {
2414        assert_eq!(
2415            format_xmp_value("creator", "Chinmay Garde"),
2416            "Author: Chinmay Garde"
2417        );
2418        assert_eq!(
2419            format_xmp_value("title", "Bay Bridge At Night"),
2420            "Title: Bay Bridge At Night"
2421        );
2422        assert_eq!(
2423            format_xmp_value("description", "Embarcadero in the evening on Delta 3200"),
2424            "Description: Embarcadero in the evening on Delta 3200"
2425        );
2426    }
2427
2428    #[test]
2429    fn test_format_metadata_field_prefixes_exif_text() {
2430        assert_eq!(
2431            format_metadata_field("Author", "Chinmay Garde"),
2432            "Author: Chinmay Garde"
2433        );
2434        assert_eq!(
2435            format_metadata_field("Description", "Bay Bridge At Night"),
2436            "Description: Bay Bridge At Night"
2437        );
2438    }
2439
2440    #[test]
2441    fn test_extract_text_for_detection_keeps_image_author_separate_from_title_and_description() {
2442        let xmp = r#"<x:xmpmeta xmlns:x="adobe:ns:meta/"><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:creator>Chinmay Garde</dc:creator><dc:title>Bay Bridge At Night</dc:title><dc:description>Embarcadero in the evening on Delta 3200</dc:description></rdf:Description></rdf:RDF></x:xmpmeta>"#;
2443        let bytes = build_png_with_xmp(xmp);
2444
2445        let (text, kind) = extract_text_for_detection(Path::new("fixture.png"), &bytes);
2446
2447        assert_eq!(kind, ExtractedTextKind::ImageMetadata);
2448        assert!(text.contains("Author: Chinmay Garde"), "text: {text:?}");
2449        assert!(
2450            text.contains("Title: Bay Bridge At Night"),
2451            "text: {text:?}"
2452        );
2453        assert!(
2454            text.contains("Description: Embarcadero in the evening on Delta 3200"),
2455            "text: {text:?}"
2456        );
2457
2458        let (_copyrights, _holders, authors) = detect_copyrights(&text, None);
2459        assert_eq!(
2460            authors
2461                .iter()
2462                .map(|a| a.author.as_str())
2463                .collect::<Vec<_>>(),
2464            vec!["Chinmay Garde"],
2465            "authors: {authors:?}; text: {text:?}"
2466        );
2467    }
2468
2469    #[test]
2470    fn test_values_to_text_suppresses_bare_copyright_duplicate_of_author() {
2471        let text = values_to_text(vec![
2472            "Author: Chinmay Garde".to_string(),
2473            "Copyright: Chinmay Garde".to_string(),
2474            "Title: Bay Bridge At Night".to_string(),
2475        ]);
2476
2477        assert!(text.contains("Author: Chinmay Garde"), "text: {text:?}");
2478        assert!(
2479            text.contains("Title: Bay Bridge At Night"),
2480            "text: {text:?}"
2481        );
2482        assert!(!text.contains("Copyright: Chinmay Garde"), "text: {text:?}");
2483    }
2484
2485    #[test]
2486    fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
2487        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2488        let text = b"Copyright 2026 Example Project!!!";
2489        bytes[..text.len()].copy_from_slice(text);
2490
2491        let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);
2492
2493        assert!(text.is_empty());
2494        assert_eq!(kind, ExtractedTextKind::None);
2495    }
2496
2497    #[test]
2498    fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
2499        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2500        let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
2501        bytes[..text.len()].copy_from_slice(text);
2502
2503        let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);
2504
2505        assert_ne!(kind, ExtractedTextKind::None);
2506        assert!(text.contains("asn@redhat.com"));
2507        assert!(text.contains("https://publicsuffix.org/"));
2508    }
2509
2510    #[test]
2511    fn test_non_actionable_pdf_failures_are_suppressed() {
2512        assert!(is_non_actionable_pdf_failure(&[
2513            "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
2514            "open full-document: PDF is encrypted and requires a password".to_string(),
2515        ]));
2516        assert!(is_non_actionable_pdf_failure(&[
2517            "from-bytes first-page: Invalid cross-reference table".to_string(),
2518            "open full-document: Invalid cross-reference table".to_string(),
2519        ]));
2520        assert!(is_non_actionable_pdf_failure(&[
2521            "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O".to_string(),
2522            "open full-document: Invalid PDF: security handler cannot be found".to_string(),
2523        ]));
2524        assert!(!is_non_actionable_pdf_failure(&[
2525            "from-bytes first-page: some other parser failure".to_string(),
2526        ]));
2527    }
2528
2529    #[test]
2530    fn test_extract_text_for_detection_skips_zip_like_archives() {
2531        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";
2532
2533        let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
2534        let (crate_text, crate_kind) =
2535            extract_text_for_detection(Path::new("demo.crate"), zip_bytes);
2536
2537        assert!(whl_text.is_empty());
2538        assert_eq!(whl_kind, ExtractedTextKind::None);
2539        assert!(crate_text.is_empty());
2540        assert_eq!(crate_kind, ExtractedTextKind::None);
2541    }
2542
2543    #[test]
2544    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
2545        let path =
2546            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
2547        let bytes = std::fs::read(path).expect("failed to read lib fixture");
2548
2549        let (text, kind) = extract_text_for_detection(path, &bytes);
2550
2551        assert_ne!(kind, ExtractedTextKind::None);
2552        assert!(text.contains("Copyright nexB and others (c) 2012"));
2553    }
2554
2555    #[test]
2556    fn test_extract_text_for_detection_reads_font_metadata() {
2557        let path = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
2558        let bytes = std::fs::read(path).expect("failed to read font fixture");
2559
2560        let (text, kind) = extract_text_for_detection(path, &bytes);
2561
2562        assert_eq!(kind, ExtractedTextKind::FontMetadata);
2563        assert!(text.contains("License Description:"), "{text}");
2564        assert!(
2565            text.contains("Open Font License") || text.contains("OFL"),
2566            "{text}"
2567        );
2568        assert!(text.contains("Lato"), "{text}");
2569    }
2570
2571    #[test]
2572    fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
2573        let bytes = b"abcd\0".repeat(525_000);
2574
2575        let text = extract_printable_strings(&bytes);
2576
2577        assert!(
2578            text.len() > 2_000_000,
2579            "unexpected truncation at {}",
2580            text.len()
2581        );
2582        assert!(text.ends_with("abcd"));
2583    }
2584
2585    #[test]
2586    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
2587        let path = Path::new(
2588            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
2589        );
2590        let bytes = std::fs::read(path).expect("failed to read svg fixture");
2591
2592        let (text, kind) = extract_text_for_detection(path, &bytes);
2593
2594        assert_eq!(kind, ExtractedTextKind::Decoded);
2595        assert!(text.contains("creativecommons.org/licenses/publicdomain"));
2596    }
2597
2598    #[test]
2599    fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
2600        let path = Path::new(
2601            "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
2602        );
2603        let bytes = std::fs::read(path).expect("failed to read rtf fixture");
2604
2605        let (text, kind) = extract_text_for_detection(path, &bytes);
2606
2607        assert_eq!(kind, ExtractedTextKind::Decoded);
2608        assert!(text.contains("GNU Lesser General Public"));
2609        assert!(text.contains("version"));
2610        assert!(text.contains("2.1 of the License"));
2611    }
2612
2613    #[test]
2614    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
2615        assert_eq!(
2616            normalize_mime_type(
2617                Path::new("main.ts"),
2618                b"export const answer = 42;\n",
2619                Some("TypeScript"),
2620                "video/mp2t",
2621            ),
2622            "text/plain"
2623        );
2624    }
2625
2626    #[test]
2627    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
2628        assert_eq!(
2629            normalize_mime_type(
2630                Path::new("main.js"),
2631                b"console.log('hello');\n",
2632                Some("JavaScript"),
2633                "application/octet-stream",
2634            ),
2635            "text/plain"
2636        );
2637    }
2638
2639    #[test]
2640    fn test_normalize_mime_type_preserves_binary_video_guess() {
2641        assert_eq!(
2642            normalize_mime_type(
2643                Path::new("main.ts"),
2644                &[0, 159, 146, 150, 0, 1, 2, 3],
2645                Some("TypeScript"),
2646                "video/mp2t",
2647            ),
2648            "video/mp2t"
2649        );
2650    }
2651
2652    #[test]
2653    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
2654        assert_eq!(
2655            normalize_mime_type(
2656                Path::new("main.ts"),
2657                &[0, 159, 146, 150],
2658                Some("TypeScript"),
2659                "application/octet-stream",
2660            ),
2661            "application/octet-stream"
2662        );
2663    }
2664
2665    #[test]
2666    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
2667        let classification = classify_file_info(Path::new("test.txt"), b"");
2668
2669        assert_eq!(classification.mime_type, "inode/x-empty");
2670        assert_eq!(classification.file_type, "empty");
2671        assert!(!classification.is_binary);
2672        assert!(classification.is_text);
2673        assert!(!classification.is_source);
2674        assert_eq!(classification.programming_language, None);
2675    }
2676
/// Valid JSON in a `.json` file is text data, but it must not be given a
/// programming language or be flagged as source.
#[test]
fn test_classify_file_info_keeps_json_out_of_programming_language() {
    let manifest: &[u8] = br#"{"name":"demo"}"#;
    let info = classify_file_info(Path::new("package.json"), manifest);

    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2687
/// Text that does not parse as JSON must fall back to the plain-text labels
/// even though the file name ends in `.json`.
#[test]
fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
    let malformed: &[u8] = b"{ definitely not json\n";
    let info = classify_file_info(Path::new("broken.json"), malformed);

    assert_eq!(info.mime_type, "text/plain");
    assert_eq!(info.file_type, "UTF-8 Unicode text");
    assert!(info.is_text);
    assert!(!info.is_binary);
}
2698
/// Non-text bytes behind a `.json` name are expected to be classified as
/// opaque binary data rather than JSON.
#[test]
fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
    let garbage: &[u8] = &[0xff, 0x00, 0x01, 0x02];
    let info = classify_file_info(Path::new("broken.json"), garbage);

    assert_eq!(info.mime_type, "application/octet-stream");
    assert_eq!(info.file_type, "data");
    assert!(info.is_binary);
    assert!(!info.is_text);
}
2709
/// UTF-16 JSON that starts with the `FF FE` byte-order mark is expected to
/// be recognised as JSON text rather than binary.
#[test]
fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
    // BOM followed by the little-endian code units 0x5B 0x22 0xE9 0x22 0x5D.
    let encoded: &[u8] = &[
        0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
    ];

    let info = classify_file_info(Path::new("utf16.json"), encoded);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2724
/// Big-endian UTF-16 JSON without any byte-order mark is still expected to
/// be recognised as JSON text.
#[test]
fn test_classify_file_info_treats_valid_utf16be_json_without_bom_as_text() {
    // Big-endian code units 0x5B 0x22 0xE9 0x22 0x5D, no BOM in front.
    let encoded: &[u8] = &[0x00, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D];

    let info = classify_file_info(Path::new("utf16be.json"), encoded);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2737
/// Even a four-byte big-endian UTF-16 payload (code units 0x5B, 0x5D) is
/// expected to be treated as JSON text.
#[test]
fn test_classify_file_info_treats_small_valid_utf16be_json_literal_as_text() {
    let encoded: &[u8] = &[0x00, 0x5B, 0x00, 0x5D];

    let info = classify_file_info(Path::new("utf16be-literal.json"), encoded);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2748
/// Big-endian UTF-16 text placed after `CORRUPTED_UTF16_BOM_PREFIX` is
/// expected to come back as decoded text containing the licence phrases.
#[test]
fn test_extract_text_for_detection_decodes_utf16be_text_with_corrupted_bom_prefix() {
    let notice = "Licensed to the Apache Software Foundation\nApache License, Version 2.0";

    // Prefix first, then every UTF-16 code unit serialised big-endian.
    let mut payload = super::CORRUPTED_UTF16_BOM_PREFIX.to_vec();
    payload.extend(notice.encode_utf16().flat_map(|unit| unit.to_be_bytes()));

    let (text, kind) = extract_text_for_detection(Path::new("notice.ftl"), &payload);

    assert_eq!(kind, ExtractedTextKind::Decoded);
    assert!(text.contains("Apache Software Foundation"), "{text}");
    assert!(text.contains("Apache License, Version 2.0"), "{text}");
}
2764
/// A bare `true` literal in a `.json` file is expected to be classified as
/// JSON text.
#[test]
fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
    let literal: &[u8] = b"true";
    let info = classify_file_info(Path::new("true.json"), literal);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "application/json");
    assert_eq!(info.file_type, "JSON text data");
}
2774
/// A JSON-looking array string whose payload contains a stray `0xFA` byte
/// is expected to be reported as plain text, not JSON and not binary.
#[test]
fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
    // `["` + multi-byte sequences + 0xFA + `"]`
    let wrapped: &[u8] = &[0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D];

    let info = classify_file_info(Path::new("wrapped.json"), wrapped);

    assert!(!info.is_binary);
    assert!(info.is_text);
    assert_eq!(info.mime_type, "text/plain");
    assert_eq!(info.file_type, "text, with no line terminators");
}
2787
/// A JSON-looking array string whose only payload byte is `0xFF` is
/// expected to stay classified as binary data.
#[test]
fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
    let wrapped: &[u8] = &[0x5B, 0x22, 0xFF, 0x22, 0x5D];

    let info = classify_file_info(Path::new("lone-ff.json"), wrapped);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert_eq!(info.mime_type, "application/octet-stream");
    assert_eq!(info.file_type, "data");
}
2798
/// A `.json` input that opens with non-text bytes and a run of NULs is
/// expected to stay classified as binary.
#[test]
fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
    let crash_input: &[u8] = &[
        0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
    ];

    let info = classify_file_info(Path::new("crash.json"), crash_input);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert_eq!(info.mime_type, "application/octet-stream");
}
2812
/// A file named `Dockerfile` is expected to be labelled with the
/// `Dockerfile` language and flagged as source, but not as a script.
#[test]
fn test_classify_file_info_treats_dockerfile_as_source() {
    let recipe: &[u8] = b"FROM scratch\n";
    let info = classify_file_info(Path::new("Dockerfile"), recipe);

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("Dockerfile"));
    assert!(info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "Dockerfile source, UTF-8 Unicode text");
}
2828
/// A `Makefile` is expected to be plain text with no programming language,
/// and flagged neither as source nor as a script.
#[test]
fn test_classify_file_info_treats_makefile_as_text_not_source() {
    let rules: &[u8] = b"all:\n\techo hi\n";
    let info = classify_file_info(Path::new("Makefile"), rules);

    assert_eq!(info.programming_language, None);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "UTF-8 Unicode text");
}
2839
/// `.egg` and `.nupkg` files that begin with the `PK\x03\x04` bytes are
/// expected to be reported as zip archives.
#[test]
fn test_classify_file_info_marks_supported_package_archives() {
    let zip_header: &[u8] = b"PK\x03\x04\x14\x00\x00\x00";
    let classify = |file_name: &str| classify_file_info(Path::new(file_name), zip_header);

    let egg_info = classify("demo.egg");
    let nupkg_info = classify("demo.nupkg");

    // Python egg.
    assert!(egg_info.is_archive);
    assert_eq!(egg_info.mime_type, "application/zip");
    assert_eq!(egg_info.file_type, "Zip archive data");

    // NuGet package.
    assert!(nupkg_info.is_archive);
    assert_eq!(nupkg_info.mime_type, "application/zip");
    assert_eq!(nupkg_info.file_type, "Zip archive data");
}
2854
/// Bytes starting with the PNG signature are expected to be classified as
/// binary media, and as neither text, archive, nor source.
#[test]
fn test_classify_file_info_marks_png_as_binary_media() {
    let signature: &[u8] = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";

    let info = classify_file_info(Path::new("logo.png"), signature);

    assert_eq!(info.mime_type, "image/png");
    assert_eq!(info.file_type, "PNG image data");
    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(info.is_media);
    assert!(!info.is_archive);
    assert!(!info.is_source);
}
2869
/// Content starting with a `%PDF-` header is expected to be classified as
/// a binary PDF document, and as neither text, archive, nor media.
#[test]
fn test_classify_file_info_marks_pdf_as_binary_document() {
    let document: &[u8] = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";

    let info = classify_file_info(Path::new("report.pdf"), document);

    assert_eq!(info.mime_type, "application/pdf");
    assert_eq!(info.file_type, "PDF document");
    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_archive);
    assert!(!info.is_media);
}
2883
/// An arbitrary non-text payload in a `.bin` file is expected to be binary,
/// with no language and no source flag.
#[test]
fn test_classify_file_info_marks_binary_blobs_as_binary() {
    let blob: &[u8] = &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5];

    let info = classify_file_info(Path::new("blob.bin"), blob);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
2894
/// YAML content is expected to be labelled as YAML text data, with no
/// programming language and no source flag.
#[test]
fn test_classify_file_info_treats_yaml_as_text_not_source() {
    let document: &[u8] = b"key: value\n";
    let info = classify_file_info(Path::new("config.yaml"), document);

    assert_eq!(info.programming_language, None);
    assert!(info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.file_type, "YAML text data");
}
2904
/// Covers four build-related files: Gradle, Nix and CMake inputs are
/// expected to be source in their respective languages, while `.gitmodules`
/// is expected to be Git configuration text with no language.
#[test]
fn test_classify_file_info_classifies_common_build_manifests() {
    let gradle_src: &[u8] = b"plugins { id 'java' }\n";
    let nix_src: &[u8] = b"{ inputs, ... }: {}\n";
    let cmake_src: &[u8] = b"set(CMAKE_CXX_STANDARD 20)\n";
    let gitmodules_src: &[u8] = b"[submodule \"demo\"]\n\tpath = vendor/demo\n";

    let gradle_info = classify_file_info(Path::new("build.gradle"), gradle_src);
    let flake_info = classify_file_info(Path::new("flake.nix"), nix_src);
    let cmake_info = classify_file_info(Path::new("toolchain.cmake"), cmake_src);
    let gitmodules_info = classify_file_info(Path::new(".gitmodules"), gitmodules_src);

    // build.gradle
    assert_eq!(gradle_info.programming_language.as_deref(), Some("Groovy"));
    assert!(gradle_info.is_source);
    assert_eq!(gradle_info.mime_type, "text/plain");
    assert_eq!(gradle_info.file_type, "Groovy source, UTF-8 Unicode text");

    // flake.nix
    assert_eq!(flake_info.programming_language.as_deref(), Some("Nix"));
    assert!(flake_info.is_source);
    assert_eq!(flake_info.mime_type, "text/plain");
    assert_eq!(flake_info.file_type, "Nix source, UTF-8 Unicode text");

    // toolchain.cmake
    assert_eq!(cmake_info.programming_language.as_deref(), Some("CMake"));
    assert!(cmake_info.is_source);
    assert_eq!(cmake_info.file_type, "CMake source, UTF-8 Unicode text");

    // .gitmodules
    assert_eq!(gitmodules_info.programming_language, None);
    assert!(gitmodules_info.is_text);
    assert!(!gitmodules_info.is_source);
    assert_eq!(gitmodules_info.file_type, "Git configuration text");
}
2937
/// A `.hpp` header is expected to be C++ source, whereas a `.ipp` file is
/// expected to stay unlabelled plain text.
#[test]
fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
    let hpp_src: &[u8] = b"#pragma once\nclass Demo {};\n";
    let ipp_src: &[u8] = b"template <class T> void parse() {}\n";

    let hpp_info = classify_file_info(Path::new("include/demo.hpp"), hpp_src);
    let ipp_info = classify_file_info(Path::new("include/detail/demo.ipp"), ipp_src);

    // Header: recognised as C++.
    assert_eq!(hpp_info.programming_language.as_deref(), Some("C++"));
    assert!(hpp_info.is_source);
    assert!(!hpp_info.is_script);
    assert_eq!(hpp_info.file_type, "C++ source, UTF-8 Unicode text");

    // Inline implementation file: no language attached.
    assert_eq!(ipp_info.programming_language, None);
    assert!(!ipp_info.is_source);
    assert!(!ipp_info.is_script);
    assert_eq!(ipp_info.file_type, "UTF-8 Unicode text");
}
2959
/// An extensionless file with a `bash` shebang is expected to be labelled
/// specifically as Bash and flagged as a script.
#[test]
fn test_classify_file_info_preserves_specific_shell_family_labels() {
    let script: &[u8] = b"#!/usr/bin/env bash\necho hi\n";
    let info = classify_file_info(Path::new("bin/run"), script);

    assert_eq!(info.programming_language.as_deref(), Some("Bash"));
    assert!(info.is_script);
    assert_eq!(info.file_type, "bash script, UTF-8 Unicode text executable");
}
2968
/// A file named `Jamfile` is expected to be labelled with the `Jamfile`
/// language and flagged as source, but not as a script.
#[test]
fn test_classify_file_info_marks_jamfile_as_source() {
    let rules: &[u8] = b"lib boost_json ;\n";
    let info = classify_file_info(Path::new("Jamfile"), rules);

    assert_eq!(info.programming_language.as_deref(), Some("Jamfile"));
    assert!(info.is_source);
    assert!(!info.is_script);
    assert_eq!(info.file_type, "Jamfile source, UTF-8 Unicode text");
}
2978
/// An extensionless file with a `node` shebang is expected to be labelled
/// as a JavaScript script.
#[test]
fn test_classify_file_info_labels_javascript_shebang_scripts() {
    let script: &[u8] = b"#!/usr/bin/env node\nconsole.log('hello');\n";
    let info = classify_file_info(Path::new("bin/run"), script);

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("JavaScript"));
    assert!(info.is_script);
    assert_eq!(
        info.file_type,
        "javascript script, UTF-8 Unicode text executable"
    );
}
2996
/// A Python script containing a raw `0xE9` byte is expected to keep its
/// Python and script labels, with a file type that omits the UTF-8 wording.
#[test]
fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
    let script: &[u8] = b"# coding: latin-1\nprint(\"caf\xe9\")\n";
    let info = classify_file_info(Path::new("script.py"), script);

    let language = info.programming_language.as_deref();
    assert_eq!(language, Some("Python"));
    assert!(info.is_script);
    assert_eq!(info.file_type, "python script, text executable");
}
3011
/// A `.tga` path with plain-text content is expected to be flagged as
/// media while still being classified as text rather than binary.
#[test]
fn test_classify_file_info_treats_textual_tga_as_media() {
    let contents: &[u8] = b"not really a tga\n";
    let info = classify_file_info(Path::new("texture.tga"), contents);

    assert!(info.is_media);
    assert!(info.is_text);
    assert!(!info.is_binary);
}
3020
/// High-bit bytes behind a `.ts` extension are expected to be classified
/// as binary, with no language and no source flag.
#[test]
fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
    let payload: &[u8] = &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85];

    let info = classify_file_info(Path::new("main.ts"), payload);

    assert!(info.is_binary);
    assert!(!info.is_text);
    assert!(!info.is_source);
    assert_eq!(info.programming_language, None);
}
3031
/// For a tiny GIF payload, extraction is expected to yield no text and the
/// `ExtractedTextKind::None` kind.
#[test]
fn test_extract_text_for_detection_skips_unsupported_image_formats() {
    let gif: &[u8] = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";

    let (extracted, kind) = extract_text_for_detection(Path::new("tiny.gif"), gif);

    assert!(extracted.is_empty());
    assert_eq!(kind, ExtractedTextKind::None);
}
3041
/// Table-driven check of the language, `is_source` and `is_script` results
/// for five representative inputs (shebang script, Dockerfile, JSON, YAML,
/// Makefile).
#[test]
fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
    /// One row of the matrix: an input file plus the expected classification.
    struct Row {
        path: &'static Path,
        contents: &'static [u8],
        language: Option<&'static str>,
        is_source: bool,
        is_script: bool,
    }

    let matrix = [
        Row {
            path: Path::new("bin/run"),
            contents: b"#!/usr/bin/env node\nconsole.log('hello');\n",
            language: Some("JavaScript"),
            is_source: true,
            is_script: true,
        },
        Row {
            path: Path::new("Dockerfile"),
            contents: b"FROM scratch\n",
            language: Some("Dockerfile"),
            is_source: true,
            is_script: false,
        },
        Row {
            path: Path::new("package.json"),
            contents: br#"{"name":"demo"}"#,
            language: None,
            is_source: false,
            is_script: false,
        },
        Row {
            path: Path::new("config.yaml"),
            contents: b"key: value\n",
            language: None,
            is_source: false,
            is_script: false,
        },
        Row {
            path: Path::new("Makefile"),
            contents: b"all:\n\techo hi\n",
            language: None,
            is_source: false,
            is_script: false,
        },
    ];

    for row in &matrix {
        let info = classify_file_info(row.path, row.contents);

        assert_eq!(
            info.programming_language.as_deref(),
            row.language,
            "unexpected language for {}",
            row.path.display()
        );
        assert_eq!(
            info.is_source,
            row.is_source,
            "unexpected is_source for {}",
            row.path.display()
        );
        assert_eq!(
            info.is_script,
            row.is_script,
            "unexpected is_script for {}",
            row.path.display()
        );
    }
}
3105}