Skip to main content

provenant/utils/
file.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use std::borrow::Cow;
5use std::collections::BTreeSet;
6use std::fs;
7use std::io::{BufReader, Cursor, Read};
8use std::panic::{AssertUnwindSafe, catch_unwind};
9use std::path::Path;
10
11use chrono::{TimeZone, Utc};
12use file_format::{FileFormat, Kind as FileFormatKind};
13use flate2::read::ZlibDecoder;
14use glob::Pattern;
15use image::{ImageDecoder, ImageFormat, ImageReader};
16use mime_guess::from_path;
17use object::FileKind;
18use quick_xml::events::Event;
19use quick_xml::reader::Reader as XmlReader;
20
21use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
22use crate::utils::font::extract_font_metadata_text;
23use crate::utils::language::detect_language;
24
/// Records which extraction strategy produced the text returned by
/// `extract_text_for_detection` / `extract_text_for_detection_with_diagnostics`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No usable text was produced; the accompanying string is empty.
    None,
    /// Text obtained by decoding the file content as text (also used for
    /// text recovered from RTF input).
    Decoded,
    /// Text built from font metadata, optionally followed by printable strings.
    FontMetadata,
    /// Text extracted from a PDF document.
    Pdf,
    /// Printable strings harvested from binary content.
    BinaryStrings,
    /// Text extracted from image metadata.
    ImageMetadata,
    /// Only Windows executable metadata text was available.
    WindowsExecutableMetadata,
}
35
/// Result of `classify_file_info`: the detected MIME/file type plus a set of
/// boolean classification flags for a single file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type as resolved by `detect_mime_type`.
    pub mime_type: String,
    /// Human-readable file type description as resolved by `detect_file_type`.
    pub file_type: String,
    /// Detected language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// True when the content is considered binary.
    pub is_binary: bool,
    /// Always the negation of `is_binary`.
    pub is_text: bool,
    /// True for archive/compressed/package content (never set for text files).
    pub is_archive: bool,
    /// True for image, audio or video content.
    pub is_media: bool,
    /// True when the file is considered source code.
    pub is_source: bool,
    /// True when the file is considered a script (never set for binary files).
    pub is_script: bool,
}
48
// Limits for image metadata extraction.
// NOTE(review): presumably a cap on collected values and on total text size;
// the consumers are outside this part of the file - confirm.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
// `has_binary_control_chars` reports binary content when the number of control
// bytes exceeds `len / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR`.
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
// 512 KiB. NOTE(review): presumably the size from which opaque binaries are
// treated as "large" - confirm against the helpers that read it.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
// Bounds for the Mach-O legal-text scan.
// NOTE(review): window size, window counts and total extraction budget are
// inferred from the names only - confirm against
// `extract_bounded_macho_legal_strings`.
const LARGE_MACHO_LEGAL_WINDOW_BYTES: usize = 64 * 1024;
const LARGE_MACHO_LEGAL_MAX_WINDOWS: usize = 24;
const LARGE_MACHO_LEGAL_MAX_WINDOWS_PER_MARKER: usize = 4;
const LARGE_MACHO_LEGAL_MAX_EXTRACT_BYTES: usize = 2 * 1024 * 1024;
// `has_valid_json_text` refuses to parse inputs larger than this (4 MiB) and
// reports them as not valid JSON.
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;
// 256 KiB. NOTE(review): presumably the maximum XMP packet size considered.
const MAX_XMP_PACKET_BYTES: usize = 256 * 1024;
// 32 MiB. NOTE(review): presumably the maximum PDF size for text extraction.
const MAX_PDF_TEXT_EXTRACTION_BYTES: usize = 32 * 1024 * 1024;
// Lowercase extensions (without the dot) whose presence makes
// `extract_text_for_detection_with_diagnostics` accept decoded content as text.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
// Lowercase extensions that `detect_is_binary` treats as binary unless the
// sniffed format is already textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
// Lowercase extensions that `detect_is_archive` treats as archives for
// non-text files.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
// Byte-string markers that typically introduce license or copyright notices.
// Several phrases appear in two capitalizations, so lookups are evidently
// case-sensitive.
// NOTE(review): presumably used to locate legal-text windows inside large
// Mach-O binaries - confirm against `extract_bounded_macho_legal_strings`.
const LARGE_MACHO_LEGAL_MARKERS: &[&[u8]] = &[
    b"Unicode, Inc.",
    b"http://www.unicode.org/copyright.html",
    b"https://www.unicode.org/copyright.html",
    b"SPDX-License-Identifier:",
    b"Licensed under",
    b"licensed under",
    b"Apache License",
    b"http://www.apache.org/licenses/",
    b"https://www.apache.org/licenses/",
    b"Permission is hereby granted",
    b"permission is hereby granted",
    b"Redistribution and use in source and binary forms",
    b"redistribution and use in source and binary forms",
    b"Permission to use, copy, modify, and/or distribute this software",
    b"The MIT License",
    b"GNU GENERAL PUBLIC LICENSE",
    b"GNU LESSER GENERAL PUBLIC LICENSE",
    b"Mozilla Public License",
];
90
91/// Get the last modified date of a file as a `YYYY-MM-DD` string.
92pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
93    metadata.modified().ok().map(|time: std::time::SystemTime| {
94        let seconds_since_epoch = time
95            .duration_since(std::time::UNIX_EPOCH)
96            .unwrap()
97            .as_secs() as i64;
98
99        Utc.timestamp_opt(seconds_since_epoch, 0)
100            .single()
101            .unwrap_or_else(Utc::now)
102            .format("%Y-%m-%d")
103            .to_string()
104    })
105}
106
107/// Check if a path should be excluded based on a list of glob patterns.
108pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
109    let path_str = path.to_string_lossy();
110    let file_name = path
111        .file_name()
112        .map(|name| name.to_string_lossy())
113        .unwrap_or_default();
114
115    for pattern in exclude_patterns {
116        // Match against full path
117        if pattern.matches(&path_str) {
118            return true;
119        }
120
121        // Match against just the file/directory name
122        if pattern.matches(&file_name) {
123            return true;
124        }
125    }
126
127    false
128}
129
130/// Decode a byte buffer to a String, trying UTF-16 first when the byte shape
131/// strongly suggests it, then UTF-8, then Latin-1.
132///
133/// Latin-1 (ISO-8859-1) maps bytes 0x00-0xFF directly to Unicode U+0000-U+00FF,
134/// so it can decode any byte sequence. This matches Python ScanCode's use of
135/// `UnicodeDammit` which auto-detects encoding with Latin-1 as fallback.
136pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
137    if let Some(decoded) = decode_utf16_text(bytes) {
138        return decoded;
139    }
140
141    match String::from_utf8(bytes.to_vec()) {
142        Ok(s) => s,
143        Err(e) => {
144            let bytes = e.into_bytes();
145            if has_binary_control_chars(&bytes) {
146                return String::new();
147            }
148            bytes.iter().map(|&b| b as char).collect()
149        }
150    }
151}
152
153pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
154    let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
155    (text, kind)
156}
157
158pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
159    let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
160        return Cow::Borrowed(text);
161    };
162    if !matches!(
163        extension.to_ascii_lowercase().as_str(),
164        "md" | "markdown" | "html" | "htm"
165    ) {
166        return Cow::Borrowed(text);
167    }
168
169    let mut hints = Vec::new();
170    let has_dual_license_notice = has_dual_license_notice_text(text);
171    if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
172        hints.push("Creative Commons Attribution 4.0 International License".to_string());
173    }
174    if !has_dual_license_notice
175        && (text.contains("Apache License (Version 2.0)")
176            || text.contains("Apache License, Version 2.0"))
177    {
178        hints.push(
179            "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
180                .to_string(),
181        );
182    }
183
184    if !has_dual_license_notice {
185        hints.extend(extract_shields_license_badge_hints(text));
186    }
187
188    if hints.is_empty() {
189        Cow::Borrowed(text)
190    } else {
191        let mut augmented =
192            String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
193        augmented.push_str(text);
194        augmented.push_str("\n\n");
195        for (index, hint) in hints.into_iter().enumerate() {
196            if index > 0 {
197                augmented.push('\n');
198            }
199            augmented.push_str(&hint);
200        }
201        Cow::Owned(augmented)
202    }
203}
204
205fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
206    let mut hints = Vec::new();
207    let mut rest = text;
208    let needle = "img.shields.io/badge/license-";
209
210    while let Some(index) = rest.find(needle) {
211        let start = index + needle.len();
212        let suffix = &rest[start..];
213        let end = suffix
214            .find([')', ']', '"', '\'', ' ', '\n'])
215            .unwrap_or(suffix.len());
216        let badge = &suffix[..end];
217        let Some(badge) = badge.strip_suffix(".svg") else {
218            rest = &suffix[end..];
219            continue;
220        };
221
222        let mut segments: Vec<_> = badge
223            .split('-')
224            .filter(|segment| !segment.is_empty())
225            .collect();
226        if segments.len() < 2 {
227            rest = &suffix[end..];
228            continue;
229        }
230        segments.pop();
231        let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
232        if !candidate.is_empty() {
233            hints.push(canonical_shields_license_hint(&candidate));
234        }
235
236        rest = &suffix[end..];
237    }
238
239    hints.sort();
240    hints.dedup();
241    hints
242}
243
/// Detect common dual-licensing phrasing, ignoring ASCII case.
///
/// Matches "dual-licensed under", "dual licensed under", or the combination
/// of "licensed under either of" with "at your option".
fn has_dual_license_notice_text(text: &str) -> bool {
    let normalized = text.to_ascii_lowercase();

    if normalized.contains("licensed under either of") && normalized.contains("at your option") {
        return true;
    }

    ["dual-licensed under", "dual licensed under"]
        .iter()
        .any(|phrase| normalized.contains(*phrase))
}
250
/// Map a badge license label to a phrase for license detection.
///
/// Surrounding whitespace is ignored. `MIT` and the two Apache 2.0 spellings
/// get fixed phrases; any other label is suffixed with " License".
fn canonical_shields_license_hint(candidate: &str) -> String {
    let label = candidate.trim();

    if label == "MIT" {
        return String::from("The MIT License");
    }
    if matches!(label, "Apache-2.0" | "Apache 2.0") {
        return String::from("Apache License 2.0");
    }

    format!("{label} License")
}
258
/// Extract the text that detection should scan for a file, trying a series of
/// format-specific strategies in a fixed order.
///
/// Returns `(text, kind, diagnostic)`: the extracted text, the strategy that
/// produced it, and an optional diagnostic message. In this function the
/// diagnostic is only ever populated by the PDF branch, and only when no PDF
/// text was obtained.
///
/// Strategy order: RTF, PDF, image metadata, font metadata, then generic
/// binary/text handling (Windows executable metadata, large-binary sampling,
/// direct decoding, printable strings).
pub(crate) fn extract_text_for_detection_with_diagnostics(
    path: &Path,
    bytes: &[u8],
) -> (String, ExtractedTextKind, Option<String>) {
    // Lowercased extension (if it is valid UTF-8) and the content-sniffed format.
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(|s| s.to_ascii_lowercase());
    let detected_format = detect_file_format(bytes);

    // RTF: extracted text is reported as `Decoded`; whitespace-only output
    // counts as nothing.
    if looks_like_rtf(bytes, ext.as_deref()) {
        let text = extract_rtf_text(bytes);
        return if text.trim().is_empty() {
            (String::new(), ExtractedTextKind::None, None)
        } else {
            (text, ExtractedTextKind::Decoded, None)
        };
    }

    // PDF: the extractor's diagnostic is forwarded only when no text came
    // back; with text present it is dropped.
    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
        let (text, scan_error) = extract_pdf_text(path, bytes);
        return if text.is_empty() {
            (String::new(), ExtractedTextKind::None, scan_error)
        } else {
            (text, ExtractedTextKind::Pdf, None)
        };
    }

    // Images: prefer metadata text. Without any, a file that really is the
    // expected image container yields nothing, while anything else falls back
    // to plain decoding of its bytes.
    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
        let text = extract_image_metadata_text(bytes, format);
        return if text.is_empty() {
            if is_supported_image_container(bytes, format) {
                (String::new(), ExtractedTextKind::None, None)
            } else {
                let decoded = decode_bytes_to_string(bytes);
                if decoded.is_empty() {
                    (String::new(), ExtractedTextKind::None, None)
                } else {
                    (decoded, ExtractedTextKind::Decoded, None)
                }
            }
        } else {
            (text, ExtractedTextKind::ImageMetadata, None)
        };
    }

    // Fonts: metadata text first, followed by any printable strings.
    if let Some(text) = extract_font_metadata_text(path, bytes) {
        let strings = extract_printable_strings(bytes);
        let combined = if strings.is_empty() {
            text
        } else {
            combine_extracted_text_fragments(Some(text), strings)
        };
        return (combined, ExtractedTextKind::FontMetadata, None);
    }

    // Generic handling. A file is only treated as a large opaque binary when
    // no Windows executable metadata could be extracted from it.
    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
    let large_opaque_binary = windows_executable_metadata_text.is_none()
        && is_large_opaque_binary_candidate(bytes, detected_format);
    let bounded_macho_legal_text = if large_opaque_binary {
        extract_bounded_macho_legal_strings(bytes)
    } else {
        String::new()
    };
    let skip_large_opaque_binary_text =
        should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format);

    // When full extraction is skipped, still surface bounded legal strings
    // (reported as `BinaryStrings`) or, failing that, Windows metadata alone.
    if skip_large_opaque_binary_text {
        if !bounded_macho_legal_text.is_empty() {
            return (
                combine_extracted_text_fragments(
                    windows_executable_metadata_text,
                    bounded_macho_legal_text,
                ),
                ExtractedTextKind::BinaryStrings,
                None,
            );
        }
        return windows_metadata_or_empty_result(windows_executable_metadata_text);
    }

    // NOTE: this early exit returns no text at all, including any Windows
    // metadata gathered above.
    if should_skip_binary_string_extraction(path, bytes, detected_format) {
        return (String::new(), ExtractedTextKind::None, None);
    }

    // Decide whether the bytes deserve a direct text decode. SVG is always
    // attempted; other content must look textual at the byte level.
    let is_svg_text = lower_extension(path).as_deref() == Some("svg")
        || detected_format.media_type() == "image/svg+xml";
    let should_try_decoded_text = looks_like_textual_bytes(bytes) || is_svg_text;
    let decoded_is_utf8 = std::str::from_utf8(bytes).is_ok();
    let path_suggests_text = ext.as_deref().is_some_and(|extension| {
        PLAIN_TEXT_EXTENSIONS.contains(&extension) || detect_language(path, bytes).is_some()
    });

    // Accept the decode only with corroborating evidence: SVG, valid UTF-8, a
    // text-like path, or output that itself looks like text.
    if !large_opaque_binary && should_try_decoded_text {
        let decoded = decode_bytes_to_string(bytes);
        if !decoded.is_empty()
            && (is_svg_text
                || decoded_is_utf8
                || path_suggests_text
                || looks_like_decoded_text(&decoded))
        {
            let combined =
                combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
            return (combined, ExtractedTextKind::Decoded, None);
        }
    }

    // Last resort: printable strings. Large opaque binaries are sampled and
    // have the bounded legal strings appended after the sample.
    let text = if large_opaque_binary {
        let sampled_text = extract_sampled_printable_strings(bytes);
        if bounded_macho_legal_text.is_empty() {
            sampled_text
        } else {
            combine_extracted_text_fragments(Some(sampled_text), bounded_macho_legal_text)
        }
    } else {
        extract_printable_strings(bytes)
    };
    if text.is_empty() {
        windows_metadata_or_empty_result(windows_executable_metadata_text)
    } else {
        (
            combine_extracted_text_fragments(windows_executable_metadata_text, text),
            ExtractedTextKind::BinaryStrings,
            None,
        )
    }
}
386
/// Join an optional leading fragment and a trailing fragment with a newline.
///
/// A missing or empty `prefix` yields `suffix` unchanged; an empty `suffix`
/// yields the prefix alone, so no stray newline is ever produced.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(head) = prefix.filter(|fragment| !fragment.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        head
    } else {
        format!("{head}\n{suffix}")
    }
}
394
395fn windows_metadata_or_empty_result(
396    windows_executable_metadata_text: Option<String>,
397) -> (String, ExtractedTextKind, Option<String>) {
398    if let Some(metadata_text) = windows_executable_metadata_text {
399        (
400            metadata_text,
401            ExtractedTextKind::WindowsExecutableMetadata,
402            None,
403        )
404    } else {
405        (String::new(), ExtractedTextKind::None, None)
406    }
407}
408
409pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
410    let detected_format = detect_file_format(bytes);
411    let detected_language = detect_language(path, bytes);
412    let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
413    let is_text = !is_binary;
414    let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
415    let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
416    let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
417    let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
418    let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
419    let programming_language = is_source.then(|| detected_language.clone()).flatten();
420    let file_type = detect_file_type(
421        path,
422        bytes,
423        detected_format,
424        &mime_type,
425        programming_language.as_deref(),
426        is_binary,
427        is_text,
428        is_archive,
429        is_media,
430        is_script,
431    );
432
433    FileInfoClassification {
434        mime_type,
435        file_type,
436        programming_language,
437        is_binary,
438        is_text,
439        is_archive,
440        is_media,
441        is_source,
442        is_script,
443    }
444}
445
446fn detect_file_format(bytes: &[u8]) -> FileFormat {
447    FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
448}
449
// Two consecutive U+FFFD replacement characters encoded as UTF-8.
// NOTE(review): presumably what a two-byte UTF-16 BOM turns into after a lossy
// UTF-8 conversion upstream - confirm with the producers of such input.
const CORRUPTED_UTF16_BOM_PREFIX: &[u8] = &[0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD];
451
/// Report whether `bytes` form valid UTF-8 (the empty slice qualifies).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
455
456fn strip_corrupted_utf16_bom_prefix(bytes: &[u8]) -> &[u8] {
457    bytes
458        .strip_prefix(CORRUPTED_UTF16_BOM_PREFIX)
459        .unwrap_or(bytes)
460}
461
462fn decode_utf16_units(bytes: &[u8], is_le: bool, require_text_shape: bool) -> Option<String> {
463    if bytes.is_empty() || !bytes.len().is_multiple_of(2) {
464        return None;
465    }
466
467    let code_units: Vec<u16> = bytes
468        .chunks_exact(2)
469        .map(|chunk| {
470            if is_le {
471                u16::from_le_bytes([chunk[0], chunk[1]])
472            } else {
473                u16::from_be_bytes([chunk[0], chunk[1]])
474            }
475        })
476        .collect();
477
478    let decoded = std::char::decode_utf16(code_units)
479        .collect::<Result<String, _>>()
480        .ok()?;
481
482    if !require_text_shape {
483        return (!decoded.contains('\0')).then_some(decoded);
484    }
485
486    if !looks_like_decoded_text(&decoded) {
487        return None;
488    }
489
490    Some(decoded)
491}
492
493fn looks_like_decoded_text(decoded: &str) -> bool {
494    if decoded
495        .chars()
496        .any(|ch| ch.is_control() && !matches!(ch, '\n' | '\r' | '\t'))
497    {
498        return false;
499    }
500
501    let visible = decoded
502        .chars()
503        .filter(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'))
504        .count();
505    if visible < 3 || decoded.contains('\0') {
506        return false;
507    }
508
509    let alpha = decoded.chars().filter(|ch| ch.is_alphabetic()).count();
510    let punctuation = decoded
511        .chars()
512        .filter(|ch| {
513            matches!(
514                ch,
515                '{' | '}'
516                    | '['
517                    | ']'
518                    | '<'
519                    | '>'
520                    | '('
521                    | ')'
522                    | ':'
523                    | ';'
524                    | ','
525                    | '"'
526                    | '\''
527                    | '/'
528                    | '='
529                    | '-'
530                    | '_'
531                    | '#'
532                    | '!'
533            )
534        })
535        .count();
536    let whitespace = decoded.chars().filter(|ch| ch.is_whitespace()).count();
537
538    let textish = alpha + punctuation + whitespace;
539    textish + (visible / 5) >= visible && (alpha > 0 || punctuation >= 2)
540}
541
542fn detect_utf16_endianness(bytes: &[u8]) -> Option<bool> {
543    let stripped = strip_corrupted_utf16_bom_prefix(bytes);
544    if stripped.len() < 4 || !stripped.len().is_multiple_of(2) {
545        return None;
546    }
547
548    let pair_count = stripped.len() / 2;
549    let even_zero = stripped.iter().step_by(2).filter(|&&b| b == 0).count();
550    let odd_zero = stripped
551        .iter()
552        .skip(1)
553        .step_by(2)
554        .filter(|&&b| b == 0)
555        .count();
556
557    let looks_like_be = even_zero * 3 >= pair_count && odd_zero * 6 <= pair_count;
558    let looks_like_le = odd_zero * 3 >= pair_count && even_zero * 6 <= pair_count;
559
560    match (looks_like_le, looks_like_be) {
561        (true, false) => Some(true),
562        (false, true) => Some(false),
563        (true, true) => Some(true),
564        (false, false) => None,
565    }
566}
567
568fn decode_utf16_text(bytes: &[u8]) -> Option<String> {
569    if let Some(decoded) = decode_utf16_bom_text(bytes) {
570        return Some(decoded);
571    }
572
573    let stripped = strip_corrupted_utf16_bom_prefix(bytes);
574    match detect_utf16_endianness(bytes) {
575        Some(true) => decode_utf16_units(stripped, true, true),
576        Some(false) => decode_utf16_units(stripped, false, true),
577        None => None,
578    }
579}
580
581fn decode_utf16_json_text(bytes: &[u8]) -> Option<String> {
582    if bytes.len() >= 2 {
583        let (is_le, body) = match bytes {
584            [0xFF, 0xFE, rest @ ..] => (true, rest),
585            [0xFE, 0xFF, rest @ ..] => (false, rest),
586            _ => {
587                let stripped = strip_corrupted_utf16_bom_prefix(bytes);
588                return match detect_utf16_endianness(bytes) {
589                    Some(true) => decode_utf16_units(stripped, true, false),
590                    Some(false) => decode_utf16_units(stripped, false, false),
591                    None => None,
592                };
593            }
594        };
595
596        if body.is_empty() || !body.len().is_multiple_of(2) {
597            return None;
598        }
599
600        return decode_utf16_units(body, is_le, false);
601    }
602
603    None
604}
605
606fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
607    if bytes.len() < 2 || !bytes.len().is_multiple_of(2) {
608        return None;
609    }
610
611    let (is_le, body) = match bytes {
612        [0xFF, 0xFE, rest @ ..] => (true, rest),
613        [0xFE, 0xFF, rest @ ..] => (false, rest),
614        _ => return None,
615    };
616
617    if body.is_empty() || body.len() % 2 != 0 {
618        return None;
619    }
620
621    decode_utf16_units(body, is_le, true)
622}
623
624fn has_binary_control_chars(bytes: &[u8]) -> bool {
625    let control_count = bytes
626        .iter()
627        .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
628        .count();
629    control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
630}
631
632fn has_decodable_text(bytes: &[u8]) -> bool {
633    bytes.is_empty()
634        || is_utf8_text(bytes)
635        || decode_utf16_text(bytes).is_some()
636        || !has_binary_control_chars(bytes)
637}
638
639fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
640    if bytes.is_empty() || is_utf8_text(bytes) {
641        return true;
642    }
643    if let Some(decoded) = decode_utf16_text(bytes) {
644        return decoded
645            .chars()
646            .any(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'));
647    }
648
649    let printable_count = bytes
650        .iter()
651        .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
652        .count();
653    printable_count * 2 >= bytes.len()
654}
655
/// Report whether a media type denotes textual content: any `text/*` type,
/// `application/json`, `application/xml`, or a structured-syntax type ending
/// in `+json` or `+xml`.
fn is_textual_media_type(media_type: &str) -> bool {
    if media_type.starts_with("text/") {
        return true;
    }
    if media_type == "application/json" || media_type == "application/xml" {
        return true;
    }
    ["+json", "+xml"]
        .iter()
        .any(|suffix| media_type.ends_with(*suffix))
}
665
666fn is_textual_format(detected_format: FileFormat) -> bool {
667    matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
668        || is_textual_media_type(detected_format.media_type())
669}
670
671fn is_known_binary_format(detected_format: FileFormat) -> bool {
672    !matches!(detected_format, FileFormat::ArbitraryBinaryData)
673        && !is_textual_format(detected_format)
674}
675
676pub fn detect_mime_type(
677    path: &Path,
678    bytes: &[u8],
679    detected_format: FileFormat,
680    programming_language: Option<&str>,
681) -> String {
682    if bytes.is_empty() {
683        return "inode/x-empty".to_string();
684    }
685
686    if lower_extension(path).as_deref() == Some("json") {
687        if let Some(is_binary) = json_binary_override(bytes) {
688            if is_binary {
689                return "application/octet-stream".to_string();
690            }
691            if has_valid_json_text(bytes) {
692                return "application/json".to_string();
693            }
694            return "text/plain".to_string();
695        }
696        if has_valid_json_text(bytes) {
697            return "application/json".to_string();
698        }
699        if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) {
700            return "text/plain".to_string();
701        }
702        return "application/octet-stream".to_string();
703    }
704
705    if is_zip_archive(bytes) {
706        return detect_zip_like_mime(path);
707    }
708
709    if looks_like_deb(bytes, path) {
710        return "application/vnd.debian.binary-package".to_string();
711    }
712
713    if looks_like_rpm(bytes, path) {
714        return "application/x-rpm".to_string();
715    }
716
717    let guessed_mime = from_path(path)
718        .first_or_octet_stream()
719        .essence_str()
720        .to_string();
721
722    let mime_type = match detected_format {
723        FileFormat::Empty => "inode/x-empty".to_string(),
724        FileFormat::PlainText => {
725            if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
726                "text/plain".to_string()
727            } else {
728                guessed_mime.clone()
729            }
730        }
731        _ => {
732            let detected_mime = detected_format.media_type();
733            if detected_mime == "application/octet-stream"
734                && guessed_mime != "application/octet-stream"
735            {
736                guessed_mime.clone()
737            } else {
738                detected_mime.to_string()
739            }
740        }
741    };
742
743    normalize_mime_type(path, bytes, programming_language, &mime_type)
744}
745
746fn normalize_mime_type(
747    path: &Path,
748    bytes: &[u8],
749    programming_language: Option<&str>,
750    mime_type: &str,
751) -> String {
752    if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
753        return "text/plain".to_string();
754    }
755
756    mime_type.to_string()
757}
758
759fn should_prefer_text_mime(
760    path: &Path,
761    bytes: &[u8],
762    programming_language: Option<&str>,
763    mime_type: &str,
764) -> bool {
765    has_decodable_text(bytes)
766        && looks_like_textual_bytes(bytes)
767        && is_textual_source_candidate(path, programming_language)
768        && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
769}
770
771fn has_valid_json_text(bytes: &[u8]) -> bool {
772    if bytes.len() > JSON_VALIDATION_MAX_BYTES {
773        return false;
774    }
775
776    serde_json::from_slice::<serde_json::Value>(bytes).is_ok()
777        || decode_utf16_json_text(bytes)
778            .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok())
779            .is_some()
780}
781
/// Recognise content shaped like a single-string JSON array (`["` ... `"]`)
/// of at least eight bytes that contains neither NUL nor 0xFF bytes.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    if bytes.len() < 8 {
        return false;
    }

    let wrapped = bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]");
    wrapped && !bytes.iter().any(|&byte| byte == 0 || byte == 0xFF)
}
789
790fn json_binary_override(bytes: &[u8]) -> Option<bool> {
791    if has_valid_json_text(bytes) {
792        return Some(false);
793    }
794
795    if bytes.contains(&0) {
796        return Some(true);
797    }
798
799    if bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024) {
800        return Some(true);
801    }
802
803    if is_wrapped_invalid_json_string_text(bytes) {
804        return Some(false);
805    }
806
807    None
808}
809
810fn detect_is_binary(
811    path: &Path,
812    bytes: &[u8],
813    detected_format: FileFormat,
814    programming_language: Option<&str>,
815) -> bool {
816    if lower_extension(path).as_deref() == Some("json")
817        && let Some(is_binary) = json_binary_override(bytes)
818    {
819        return is_binary;
820    }
821
822    if is_textual_format(detected_format) {
823        return false;
824    }
825
826    if lower_extension(path)
827        .as_deref()
828        .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
829    {
830        return true;
831    }
832
833    if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
834        return false;
835    }
836
837    has_binary_control_chars(bytes)
838        || is_known_binary_format(detected_format)
839        || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
840            && !looks_like_textual_bytes(bytes))
841}
842
843fn should_treat_binary_bytes_as_text(
844    path: &Path,
845    bytes: &[u8],
846    programming_language: Option<&str>,
847) -> bool {
848    has_decodable_text(bytes)
849        && looks_like_textual_bytes(bytes)
850        && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
851}
852
853fn detect_is_archive(
854    path: &Path,
855    bytes: &[u8],
856    mime_type: &str,
857    is_text: bool,
858    detected_format: FileFormat,
859) -> bool {
860    if is_text {
861        return false;
862    }
863
864    lower_extension(path)
865        .as_deref()
866        .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
867        || matches!(
868            detected_format.kind(),
869            FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
870        )
871        || is_zip_archive(bytes)
872        || looks_like_gzip(bytes)
873        || looks_like_bzip2(bytes)
874        || looks_like_xz(bytes)
875        || looks_like_deb(bytes, path)
876        || looks_like_rpm(bytes, path)
877        || looks_like_squashfs(bytes, path)
878        || mime_type.contains("zip")
879        || mime_type.contains("compressed")
880        || mime_type.contains("tar")
881        || mime_type.contains("x-rpm")
882        || mime_type.contains("debian")
883}
884
885fn detect_is_media(
886    path: &Path,
887    bytes: &[u8],
888    mime_type: &str,
889    detected_format: FileFormat,
890) -> bool {
891    media_mime_from_content(bytes).is_some()
892        || matches!(
893            detected_format.kind(),
894            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
895        )
896        || mime_type.starts_with("image/")
897        || mime_type.starts_with("audio/")
898        || mime_type.starts_with("video/")
899        || (mime_type == "application/octet-stream"
900            && lower_extension(path).as_deref() == Some("tga")
901            && !has_binary_control_chars(bytes))
902}
903
904fn detect_is_script(
905    path: &Path,
906    bytes: &[u8],
907    programming_language: Option<&str>,
908    is_text: bool,
909) -> bool {
910    if !is_text || is_makefile(path) {
911        return false;
912    }
913
914    bytes.starts_with(b"#!")
915        || lower_extension(path).as_deref().is_some_and(|ext| {
916            matches!(
917                ext,
918                "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
919            )
920        })
921        || matches!(
922            programming_language,
923            Some(
924                "Shell"
925                    | "Bash"
926                    | "Zsh"
927                    | "Fish"
928                    | "Ksh"
929                    | "Python"
930                    | "Ruby"
931                    | "Perl"
932                    | "PHP"
933                    | "PowerShell"
934                    | "Awk"
935            )
936        )
937}
938
939fn detect_is_source(
940    path: &Path,
941    programming_language: Option<&str>,
942    is_text: bool,
943    is_script: bool,
944) -> bool {
945    if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
946        return false;
947    }
948
949    if is_c_like_source(path) || is_java_like_source(path) {
950        return true;
951    }
952
953    programming_language.is_some() || is_script
954}
955
956#[allow(clippy::too_many_arguments)]
957fn detect_file_type(
958    path: &Path,
959    bytes: &[u8],
960    detected_format: FileFormat,
961    mime_type: &str,
962    programming_language: Option<&str>,
963    is_binary: bool,
964    is_text: bool,
965    is_archive: bool,
966    is_media: bool,
967    is_script: bool,
968) -> String {
969    if bytes.is_empty() {
970        return "empty".to_string();
971    }
972
973    if looks_like_pdf(bytes) {
974        return "PDF document".to_string();
975    }
976
977    if let Some(file_type) = media_file_type_from_content(bytes) {
978        return file_type.to_string();
979    }
980
981    if is_archive {
982        return archive_file_type(path, bytes, detected_format);
983    }
984
985    if is_script {
986        return script_file_type(programming_language, bytes);
987    }
988
989    if is_text {
990        if lower_extension(path).as_deref() == Some("json") {
991            if has_valid_json_text(bytes) {
992                return "JSON text data".to_string();
993            }
994            return text_file_type(bytes);
995        }
996        if lower_extension(path).as_deref() == Some("xml") {
997            return "XML text data".to_string();
998        }
999        if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
1000            return "YAML text data".to_string();
1001        }
1002        if lower_extension(path).as_deref() == Some("toml") {
1003            return "TOML text data".to_string();
1004        }
1005        if matches!(
1006            lower_extension(path).as_deref(),
1007            Some("ini" | "cfg" | "conf")
1008        ) {
1009            return "INI text data".to_string();
1010        }
1011        if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
1012            return "Git configuration text".to_string();
1013        }
1014        if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
1015            return text_file_type(bytes);
1016        }
1017        if programming_language.is_some() && !is_media {
1018            return source_file_type(programming_language, bytes);
1019        }
1020        return text_file_type(bytes);
1021    }
1022
1023    if let Some(file_type) = format_based_file_type(detected_format) {
1024        return file_type;
1025    }
1026
1027    if is_binary && mime_type == "application/octet-stream" {
1028        return "data".to_string();
1029    }
1030
1031    mime_type.to_string()
1032}
1033
1034fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
1035    if matches!(programming_language, Some(language) if is_source_like_language(language)) {
1036        return true;
1037    }
1038
1039    if matches!(
1040        lower_file_name(path).as_str(),
1041        "dockerfile"
1042            | "containerfile"
1043            | "containerfile.core"
1044            | "apkbuild"
1045            | "podfile"
1046            | "jamfile"
1047            | "jamroot"
1048            | "meson.build"
1049            | "build"
1050            | "workspace"
1051            | "buck"
1052            | "default.nix"
1053            | "flake.nix"
1054            | "shell.nix"
1055    ) {
1056        return true;
1057    }
1058
1059    path.extension()
1060        .and_then(|ext| ext.to_str())
1061        .is_some_and(|ext| {
1062            matches!(
1063                ext.to_ascii_lowercase().as_str(),
1064                "rs" | "py"
1065                    | "js"
1066                    | "mjs"
1067                    | "cjs"
1068                    | "jsx"
1069                    | "ts"
1070                    | "mts"
1071                    | "cts"
1072                    | "tsx"
1073                    | "c"
1074                    | "cpp"
1075                    | "cc"
1076                    | "cxx"
1077                    | "h"
1078                    | "hpp"
1079                    | "m"
1080                    | "mm"
1081                    | "s"
1082                    | "asm"
1083                    | "java"
1084                    | "go"
1085                    | "rb"
1086                    | "php"
1087                    | "pl"
1088                    | "swift"
1089                    | "sh"
1090                    | "bash"
1091                    | "zsh"
1092                    | "fish"
1093                    | "ksh"
1094                    | "ps1"
1095                    | "psm1"
1096                    | "psd1"
1097                    | "awk"
1098                    | "kt"
1099                    | "kts"
1100                    | "dart"
1101                    | "scala"
1102                    | "groovy"
1103                    | "gradle"
1104                    | "gvy"
1105                    | "gy"
1106                    | "gsh"
1107                    | "cs"
1108                    | "fs"
1109                    | "fsx"
1110                    | "r"
1111                    | "lua"
1112                    | "jl"
1113                    | "ex"
1114                    | "exs"
1115                    | "clj"
1116                    | "cljs"
1117                    | "cljc"
1118                    | "hs"
1119                    | "erl"
1120                    | "nix"
1121                    | "zig"
1122                    | "bzl"
1123                    | "bazel"
1124                    | "star"
1125                    | "sky"
1126                    | "ml"
1127                    | "mli"
1128                    | "tex"
1129            )
1130        })
1131}
1132
/// Reports whether `language` is one of the language names treated as source.
///
/// The comparison is exact and case-sensitive.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
1179
/// Returns the path's extension as `&str`, or `None` when the path has no
/// extension or the extension is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    path.extension()?.to_str()
}
1183
1184fn lower_extension(path: &Path) -> Option<String> {
1185    extension(path).map(|ext| ext.to_ascii_lowercase())
1186}
1187
/// Returns the final path component converted to ASCII lowercase.
///
/// Yields an empty string when the path has no file name or the name is not
/// valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
1194
1195fn is_plain_text(path: &Path) -> bool {
1196    lower_extension(path)
1197        .as_deref()
1198        .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
1199}
1200
1201fn is_makefile(path: &Path) -> bool {
1202    matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
1203}
1204
/// Reports whether the path ends in `.js.map` or `.css.map`
/// (compared case-insensitively for ASCII, against the whole path string).
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
1209
1210fn is_c_like_source(path: &Path) -> bool {
1211    lower_extension(path).as_deref().is_some_and(|ext| {
1212        matches!(
1213            ext,
1214            "c" | "cc"
1215                | "cp"
1216                | "cpp"
1217                | "cxx"
1218                | "c++"
1219                | "h"
1220                | "hh"
1221                | "hpp"
1222                | "hxx"
1223                | "h++"
1224                | "i"
1225                | "ii"
1226                | "m"
1227                | "s"
1228                | "asm"
1229        )
1230    })
1231}
1232
1233fn is_java_like_source(path: &Path) -> bool {
1234    lower_extension(path)
1235        .as_deref()
1236        .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
1237}
1238
1239fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
1240    match detected_format {
1241        FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
1242        format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
1243        format => Some(match format.kind() {
1244            FileFormatKind::Image => short_name_or_name(&format, "image data"),
1245            FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
1246            FileFormatKind::Video => short_name_or_name(&format, "video data"),
1247            _ => format.name().to_string(),
1248        }),
1249    }
1250}
1251
1252fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
1253    format
1254        .short_name()
1255        .map(|short_name| format!("{short_name} {suffix}"))
1256        .unwrap_or_else(|| format!("{} {suffix}", format.name()))
1257}
1258
1259fn detect_zip_like_mime(path: &Path) -> String {
1260    match extension(path).map(|ext| ext.to_ascii_lowercase()) {
1261        Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
1262        Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
1263            "application/java-archive".to_string()
1264        }
1265        _ => "application/zip".to_string(),
1266    }
1267}
1268
/// Returns the MIME type for PNG, JPEG, TIFF or WebP content, judged solely
/// from the leading signature bytes, or `None` for anything else.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("image/png");
    }
    if matches!(bytes, [0xff, 0xd8, 0xff, ..]) {
        return Some("image/jpeg");
    }
    // TIFF comes in little-endian ("II") and big-endian ("MM") flavours.
    if matches!(
        bytes,
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..]
    ) {
        return Some("image/tiff");
    }
    // WebP is a RIFF container with the "WEBP" tag at offset 8.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    if is_webp {
        return Some("image/webp");
    }
    None
}
1282
/// Returns the file type label for PNG, JPEG, TIFF or WebP content, judged
/// solely from the leading signature bytes, or `None` for anything else.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    let is_png = bytes.starts_with(b"\x89PNG\r\n\x1a\n");
    let is_jpeg = matches!(bytes, [0xff, 0xd8, 0xff, ..]);
    let is_tiff = matches!(
        bytes,
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..]
    );
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);

    // The signatures are mutually exclusive, so at most one flag is set.
    match (is_png, is_jpeg, is_tiff, is_webp) {
        (true, ..) => Some("PNG image data"),
        (_, true, ..) => Some("JPEG image data"),
        (_, _, true, _) => Some("TIFF image data"),
        (_, _, _, true) => Some("WebP image data"),
        _ => None,
    }
}
1296
/// Reports whether the content begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
1300
/// Reports whether a file looks like RTF: either `ext` is exactly `rtf`, or
/// the content begins with `{\rtf`.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if let Some("rtf") = ext {
        return true;
    }
    bytes.starts_with(b"{\\rtf")
}
1304
/// Converts RTF markup in `bytes` to plain text on a best-effort basis.
///
/// The input is decoded as lossy UTF-8 and scanned character by character:
/// - group braces are dropped;
/// - `\\`, `\{` and `\}` emit the escaped character;
/// - `\'hh` emits the character whose code point equals the hex byte `hh`;
/// - the control words `par`, `line`, `tab`, `emdash`, `endash`, `bullet`,
///   `lquote`, `rquote`, `ldblquote` and `rdblquote` emit a text equivalent;
/// - `\uN` emits the Unicode scalar `N` (negative values are offset by 65536)
///   and skips the single fallback that follows it, whether that fallback is
///   a plain character or a `\'hh` escape;
/// - every other control word or control symbol is discarded.
///
/// Carriage returns and form feeds in the result become newlines, and
/// trailing whitespace is trimmed from every line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    /// Returns the byte encoded by the two hex digits that follow the
    /// apostrophe at `quote_index`, if both exist and parse as hexadecimal.
    fn hex_escape_value(chars: &[char], quote_index: usize) -> Option<u8> {
        let digits: String = chars
            .get(quote_index + 1..quote_index + 3)?
            .iter()
            .collect();
        u8::from_str_radix(&digits, 16).ok()
    }

    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;

    while index < chars.len() {
        match chars[index] {
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                if index >= chars.len() {
                    break;
                }

                match chars[index] {
                    '\\' | '{' | '}' => {
                        output.push(chars[index]);
                        index += 1;
                    }
                    '\'' => {
                        if let Some(value) = hex_escape_value(&chars, index) {
                            output.push(value as char);
                            // Skip the apostrophe and both hex digits.
                            index += 3;
                        } else {
                            index += 1;
                        }
                    }
                    control if control.is_ascii_alphabetic() => {
                        let start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[start..index].iter().collect();

                        // Optional numeric parameter: an optional sign followed by digits.
                        let number_start = index;
                        if index < chars.len()
                            && (chars[index] == '-' || chars[index].is_ascii_digit())
                        {
                            index += 1;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space after a control word is a delimiter, not text.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                if let Ok(codepoint) = parameter.parse::<i32>() {
                                    // RTF stores code points above 32767 as negative 16-bit values.
                                    let normalized = if codepoint < 0 {
                                        codepoint + 65_536
                                    } else {
                                        codepoint
                                    };
                                    if let Some(ch) =
                                        u32::try_from(normalized).ok().and_then(char::from_u32)
                                    {
                                        output.push(ch);
                                    }
                                }

                                // `\uN` is followed by an ANSI fallback for readers without
                                // Unicode support. It is frequently written as a `\'hh`
                                // escape; skip that as one unit so the character is not
                                // emitted a second time.
                                let has_hex_fallback = chars.get(index) == Some(&'\\')
                                    && chars.get(index + 1) == Some(&'\'')
                                    && hex_escape_value(&chars, index + 1).is_some();
                                if has_hex_fallback {
                                    index += 4;
                                } else if index < chars.len()
                                    && !matches!(chars[index], '\\' | '{' | '}' | '\n' | '\r')
                                {
                                    index += 1;
                                }
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}
1411
/// Reports whether the content begins with the gzip magic bytes `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1415
/// Reports whether the content begins with the bzip2 signature `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1419
/// Reports whether the content begins with the six-byte XZ stream signature.
fn looks_like_xz(bytes: &[u8]) -> bool {
    const XZ_MAGIC: [u8; 6] = [0xfd, b'7', b'z', b'X', b'Z', 0x00];
    bytes.get(..XZ_MAGIC.len()) == Some(&XZ_MAGIC[..])
}
1423
1424fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1425    lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1426}
1427
1428fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1429    lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1430}
1431
1432fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1433    lower_extension(path)
1434        .as_deref()
1435        .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1436        && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1437            || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1438}
1439
1440fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1441    if looks_like_deb(bytes, path) {
1442        "debian binary package (format 2.0)".to_string()
1443    } else if looks_like_rpm(bytes, path) {
1444        "RPM package".to_string()
1445    } else if looks_like_squashfs(bytes, path) {
1446        "Squashfs filesystem".to_string()
1447    } else if looks_like_gzip(bytes) {
1448        "gzip compressed data".to_string()
1449    } else if looks_like_bzip2(bytes) {
1450        "bzip2 compressed data".to_string()
1451    } else if looks_like_xz(bytes) {
1452        "XZ compressed data".to_string()
1453    } else if is_zip_archive(bytes) {
1454        "Zip archive data".to_string()
1455    } else if lower_extension(path).as_deref() == Some("gem") {
1456        "POSIX tar archive".to_string()
1457    } else if let Some(file_type) = format_based_file_type(detected_format) {
1458        file_type
1459    } else {
1460        "archive data".to_string()
1461    }
1462}
1463
1464fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1465    let suffix = text_executable_label(bytes);
1466
1467    match programming_language {
1468        Some("Python") => format!("python script, {suffix}"),
1469        Some("Ruby") => format!("ruby script, {suffix}"),
1470        Some("Perl") => format!("perl script, {suffix}"),
1471        Some("PHP") => format!("php script, {suffix}"),
1472        Some("Shell") => format!("shell script, {suffix}"),
1473        Some("Bash") => format!("bash script, {suffix}"),
1474        Some("Zsh") => format!("zsh script, {suffix}"),
1475        Some("Fish") => format!("fish script, {suffix}"),
1476        Some("Ksh") => format!("ksh script, {suffix}"),
1477        Some("JavaScript") => format!("javascript script, {suffix}"),
1478        Some("TypeScript") => format!("typescript script, {suffix}"),
1479        Some("PowerShell") => format!("powershell script, {suffix}"),
1480        Some("Awk") => format!("awk script, {suffix}"),
1481        _ => format!("script, {suffix}"),
1482    }
1483}
1484
1485fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1486    let suffix = text_label(bytes);
1487    match programming_language {
1488        Some("C") => format!("C source, {suffix}"),
1489        Some("C++") => format!("C++ source, {suffix}"),
1490        Some("Java") => format!("Java source, {suffix}"),
1491        Some("C#") => format!("C# source, {suffix}"),
1492        Some("F#") => format!("F# source, {suffix}"),
1493        Some("Go") => format!("Go source, {suffix}"),
1494        Some("Rust") => format!("Rust source, {suffix}"),
1495        Some("Starlark") => format!("Starlark source, {suffix}"),
1496        Some("CMake") => format!("CMake source, {suffix}"),
1497        Some("Meson") => format!("Meson source, {suffix}"),
1498        Some("Nix") => format!("Nix source, {suffix}"),
1499        Some("Groovy") => format!("Groovy source, {suffix}"),
1500        Some("Makefile") => format!("Makefile source, {suffix}"),
1501        Some("Dockerfile") => format!("Dockerfile source, {suffix}"),
1502        Some("Jamfile") => format!("Jamfile source, {suffix}"),
1503        Some("Batchfile") => format!("Batchfile source, {suffix}"),
1504        Some(language) => format!("{language} source, {suffix}"),
1505        None => text_file_type(bytes),
1506    }
1507}
1508
1509fn text_file_type(bytes: &[u8]) -> String {
1510    text_label(bytes).to_string()
1511}
1512
/// Describes text content by encoding and line structure.
///
/// The label starts with `UTF-8 Unicode text` when `bytes` is valid UTF-8 and
/// `text` otherwise; `, with no line terminators` is appended when the content
/// contains no `\n` byte.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1526
/// Describes executable text content by encoding and line structure.
///
/// The label starts with `UTF-8 Unicode text executable` when `bytes` is valid
/// UTF-8 and `text executable` otherwise; `, with no line terminators` is
/// appended when the content contains no `\n` byte.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1540
1541fn supported_image_metadata_format(
1542    ext: Option<&str>,
1543    detected_format: FileFormat,
1544) -> Option<ImageFormat> {
1545    match ext {
1546        Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1547        Some("png") => Some(ImageFormat::Png),
1548        Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1549        Some("webp") => Some(ImageFormat::WebP),
1550        _ => match detected_format.media_type() {
1551            "image/jpeg" => Some(ImageFormat::Jpeg),
1552            "image/png" => Some(ImageFormat::Png),
1553            "image/tiff" => Some(ImageFormat::Tiff),
1554            "image/webp" => Some(ImageFormat::WebP),
1555            _ => None,
1556        },
1557    }
1558}
1559
1560fn should_skip_binary_string_extraction(
1561    path: &Path,
1562    bytes: &[u8],
1563    detected_format: FileFormat,
1564) -> bool {
1565    matches!(lower_extension(path).as_deref(), Some("pdf"))
1566        || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1567            .is_some()
1568        || (matches!(
1569            detected_format.kind(),
1570            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1571        ) && !is_textual_format(detected_format))
1572        || media_mime_from_content(bytes).is_some()
1573        || is_zip_archive(bytes)
1574        || looks_like_gzip(bytes)
1575        || looks_like_bzip2(bytes)
1576        || looks_like_xz(bytes)
1577        || looks_like_deb(bytes, path)
1578        || looks_like_rpm(bytes, path)
1579        || looks_like_squashfs(bytes, path)
1580}
1581
1582fn should_skip_large_opaque_binary_text_extraction(
1583    _path: &Path,
1584    bytes: &[u8],
1585    detected_format: FileFormat,
1586) -> bool {
1587    is_large_opaque_binary_candidate(bytes, detected_format)
1588        && !sample_has_promising_printable_strings(bytes)
1589}
1590
1591fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1592    bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1593        && !is_textual_format(detected_format)
1594        && !matches!(
1595            detected_format.kind(),
1596            FileFormatKind::Archive
1597                | FileFormatKind::Compressed
1598                | FileFormatKind::Package
1599                | FileFormatKind::Audio
1600                | FileFormatKind::Image
1601                | FileFormatKind::Video
1602        )
1603}
1604
/// Picks up to three `(start, end)` byte ranges to sample from a buffer of
/// `len` bytes: the head, the middle and the tail.
///
/// Each window is at most 64 KiB. The middle window is only used when `len`
/// exceeds two windows, the tail window only when `len` exceeds one. Empty
/// and duplicate ranges are left out; order is head, middle, tail.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = (0, len.min(SAMPLE_WINDOW_BYTES));
    let middle = (len > SAMPLE_WINDOW_BYTES * 2).then(|| {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        (start, (start + SAMPLE_WINDOW_BYTES).min(len))
    });
    let tail = (len > SAMPLE_WINDOW_BYTES).then(|| (len - SAMPLE_WINDOW_BYTES, len));

    let mut windows: Vec<(usize, usize)> = Vec::new();
    for window in std::iter::once(head).chain(middle).chain(tail) {
        let (start, end) = window;
        if start < end && !windows.contains(&window) {
            windows.push(window);
        }
    }
    windows
}
1627
1628fn extract_bounded_macho_legal_strings(bytes: &[u8]) -> String {
1629    if !matches!(
1630        FileKind::parse(bytes),
1631        Ok(FileKind::MachO32 | FileKind::MachO64 | FileKind::MachOFat32 | FileKind::MachOFat64)
1632    ) {
1633        return String::new();
1634    }
1635
1636    let mut ranges = Vec::new();
1637    for marker in LARGE_MACHO_LEGAL_MARKERS {
1638        collect_marker_window_ranges(bytes, marker, &mut ranges);
1639        if ranges.len() >= LARGE_MACHO_LEGAL_MAX_WINDOWS {
1640            break;
1641        }
1642    }
1643
1644    if ranges.is_empty() {
1645        return String::new();
1646    }
1647
1648    let mut merged_ranges = merge_overlapping_ranges(ranges);
1649    let mut combined_lines = BTreeSet::new();
1650    let mut extracted_bytes = 0usize;
1651
1652    for (start, end) in merged_ranges.drain(..) {
1653        if extracted_bytes >= LARGE_MACHO_LEGAL_MAX_EXTRACT_BYTES {
1654            break;
1655        }
1656        let remaining = LARGE_MACHO_LEGAL_MAX_EXTRACT_BYTES - extracted_bytes;
1657        let end = start.saturating_add((end - start).min(remaining));
1658        let window_text = extract_printable_strings(&bytes[start..end]);
1659        for line in window_text
1660            .lines()
1661            .map(str::trim)
1662            .filter(|line| !line.is_empty())
1663        {
1664            combined_lines.insert(line.to_string());
1665        }
1666        extracted_bytes += end - start;
1667    }
1668
1669    combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1670}
1671
1672fn collect_marker_window_ranges(bytes: &[u8], marker: &[u8], ranges: &mut Vec<(usize, usize)>) {
1673    if marker.is_empty() || ranges.len() >= LARGE_MACHO_LEGAL_MAX_WINDOWS {
1674        return;
1675    }
1676
1677    let mut search_start = 0usize;
1678    let mut hits_for_marker = 0usize;
1679
1680    while search_start + marker.len() <= bytes.len()
1681        && ranges.len() < LARGE_MACHO_LEGAL_MAX_WINDOWS
1682        && hits_for_marker < LARGE_MACHO_LEGAL_MAX_WINDOWS_PER_MARKER
1683    {
1684        let Some(relative_match) = bytes[search_start..].iter().position(|&b| b == marker[0])
1685        else {
1686            break;
1687        };
1688        let match_start = search_start + relative_match;
1689        let match_end = match_start + marker.len();
1690        if match_end <= bytes.len() && &bytes[match_start..match_end] == marker {
1691            let half_window = LARGE_MACHO_LEGAL_WINDOW_BYTES / 2;
1692            let window_start = match_start.saturating_sub(half_window);
1693            let window_end = (match_end + half_window).min(bytes.len());
1694            ranges.push((window_start, window_end));
1695            hits_for_marker += 1;
1696            search_start = match_end;
1697        } else {
1698            search_start = match_start + 1;
1699        }
1700    }
1701}
1702
/// Sorts `(start, end)` ranges and merges every pair that overlaps or
/// touches, returning the merged ranges in ascending order.
fn merge_overlapping_ranges(mut ranges: Vec<(usize, usize)>) -> Vec<(usize, usize)> {
    ranges.sort_unstable();

    let mut merged: Vec<(usize, usize)> = Vec::with_capacity(ranges.len());
    for (start, end) in ranges {
        match merged.last_mut() {
            // The range begins inside (or right at the end of) the previous one.
            Some(previous) if start <= previous.1 => previous.1 = previous.1.max(end),
            _ => merged.push((start, end)),
        }
    }
    merged
}
1724
1725fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1726    let mut structured_signal_seen = false;
1727    let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1728        .into_iter()
1729        .filter(|&(start, end)| {
1730            let window = &bytes[start..end];
1731            if has_strong_structured_text_signal(window) {
1732                structured_signal_seen = true;
1733            }
1734            has_license_or_notice_signal(window)
1735        })
1736        .count();
1737
1738    structured_signal_seen || promising_license_windows >= 2
1739}
1740
1741fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1742    let mut combined_lines = BTreeSet::new();
1743
1744    for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1745        let window_text = extract_printable_strings(&bytes[start..end]);
1746        for line in window_text
1747            .lines()
1748            .map(str::trim)
1749            .filter(|line| !line.is_empty())
1750        {
1751            combined_lines.insert(line.to_string());
1752        }
1753    }
1754
1755    combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1756}
1757
1758fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1759    let strings = extract_printable_strings(bytes);
1760    if strings.is_empty() {
1761        return false;
1762    }
1763
1764    let lower = strings.to_ascii_lowercase();
1765    [
1766        "copyright",
1767        "license",
1768        "licensed under",
1769        "all rights reserved",
1770        "permission is hereby granted",
1771        "redistribution and use",
1772        "spdx-license-identifier",
1773    ]
1774    .iter()
1775    .any(|marker| lower.contains(marker))
1776}
1777
1778fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1779    let strings = extract_printable_strings(bytes);
1780    if strings.is_empty() {
1781        return false;
1782    }
1783
1784    let email_markers = strings.matches('@').count();
1785    let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1786
1787    email_markers + url_markers >= 3
1788}
1789
1790fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1791    match format {
1792        ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1793        ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1794        ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1795        ImageFormat::WebP => {
1796            bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1797        }
1798        _ => false,
1799    }
1800}
1801
1802fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1803    let mut values = Vec::new();
1804    values.extend(extract_exif_metadata_values(bytes));
1805    values.extend(extract_xmp_metadata_values(bytes, format));
1806    values_to_text(values)
1807}
1808
1809fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1810    let mut cursor = BufReader::new(Cursor::new(bytes));
1811    let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1812        Ok(exif) => exif,
1813        Err(_) => return Vec::new(),
1814    };
1815
1816    let mut values = Vec::new();
1817    for field in exif.fields() {
1818        let rendered = match field.tag {
1819            exif::Tag::ImageDescription => Some(format_metadata_field(
1820                "Description",
1821                &field.display_value().with_unit(&exif).to_string(),
1822            )),
1823            exif::Tag::Copyright => Some(format_metadata_field(
1824                "Copyright",
1825                &field.display_value().with_unit(&exif).to_string(),
1826            )),
1827            exif::Tag::UserComment => Some(format_metadata_field(
1828                "Comment",
1829                &field.display_value().with_unit(&exif).to_string(),
1830            )),
1831            exif::Tag::Artist => Some(format_metadata_field(
1832                "Author",
1833                &field.display_value().with_unit(&exif).to_string(),
1834            )),
1835            _ => None,
1836        };
1837
1838        if let Some(rendered) = rendered {
1839            values.push(rendered);
1840        }
1841    }
1842
1843    values
1844}
1845
1846fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1847    let xmp = match extract_raw_xmp_packet(bytes, format) {
1848        Some(xmp) => xmp,
1849        None => return Vec::new(),
1850    };
1851
1852    parse_xmp_values(&xmp)
1853}
1854
1855fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1856    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1857    if let Ok(mut decoder) = reader.into_decoder()
1858        && let Ok(Some(xmp)) = decoder.xmp_metadata()
1859    {
1860        return (xmp.len() <= MAX_XMP_PACKET_BYTES).then_some(xmp);
1861    }
1862
1863    match format {
1864        ImageFormat::Png => extract_png_xmp_packet(bytes),
1865        _ => None,
1866    }
1867}
1868
1869fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1870    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1871
1872    if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1873        return None;
1874    }
1875
1876    let mut offset = PNG_SIGNATURE.len();
1877    while offset + 12 <= bytes.len() {
1878        let length = u32::from_be_bytes([
1879            bytes[offset],
1880            bytes[offset + 1],
1881            bytes[offset + 2],
1882            bytes[offset + 3],
1883        ]) as usize;
1884        let chunk_start = offset + 8;
1885        let chunk_end = chunk_start + length;
1886        if chunk_end + 4 > bytes.len() {
1887            return None;
1888        }
1889
1890        let chunk_type = &bytes[offset + 4..offset + 8];
1891        if chunk_type == b"iTXt" {
1892            let data = &bytes[chunk_start..chunk_end];
1893            if let Some(xmp) = parse_png_itxt_xmp(data) {
1894                return Some(xmp);
1895            }
1896        }
1897
1898        offset = chunk_end + 4;
1899    }
1900
1901    None
1902}
1903
1904fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1905    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1906
1907    let keyword_end = data.iter().position(|&b| b == 0)?;
1908    if &data[..keyword_end] != XMP_KEYWORD {
1909        return None;
1910    }
1911
1912    let mut cursor = keyword_end + 1;
1913    let compression_flag = *data.get(cursor)?;
1914    cursor += 1;
1915    let compression_method = *data.get(cursor)?;
1916    cursor += 1;
1917    if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1918        return None;
1919    }
1920
1921    let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1922    cursor = language_end + 1;
1923
1924    let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1925    cursor = translated_end + 1;
1926
1927    let text_bytes = &data[cursor..];
1928    if compression_flag == 1 {
1929        let decoder = ZlibDecoder::new(text_bytes);
1930        let mut decoded = Vec::new();
1931        decoder
1932            .take((MAX_XMP_PACKET_BYTES + 1) as u64)
1933            .read_to_end(&mut decoded)
1934            .ok()?;
1935        (decoded.len() <= MAX_XMP_PACKET_BYTES).then_some(decoded)
1936    } else {
1937        (text_bytes.len() <= MAX_XMP_PACKET_BYTES).then(|| text_bytes.to_vec())
1938    }
1939}
1940
1941fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1942    let mut reader = XmlReader::from_reader(xmp);
1943    reader.config_mut().trim_text(true);
1944
1945    let mut buf = Vec::new();
1946    let mut stack: Vec<String> = Vec::new();
1947    let mut values = Vec::new();
1948
1949    loop {
1950        match reader.read_event_into(&mut buf) {
1951            Ok(Event::Start(e)) => {
1952                stack.push(local_xml_name(e.name().as_ref()));
1953            }
1954            Ok(Event::End(_)) => {
1955                stack.pop();
1956            }
1957            Ok(Event::Empty(_)) => {}
1958            Ok(Event::Text(text)) => {
1959                if let Some(field) = stack
1960                    .iter()
1961                    .rev()
1962                    .find_map(|name| allowed_xmp_field(name.as_str()))
1963                    && let Ok(decoded) = text.decode()
1964                {
1965                    let decoded = decoded.into_owned();
1966                    if !decoded.trim().is_empty() {
1967                        values.push(format_xmp_value(field, &decoded));
1968                    }
1969                }
1970            }
1971            Ok(Event::CData(text)) => {
1972                if let Some(field) = stack
1973                    .iter()
1974                    .rev()
1975                    .find_map(|name| allowed_xmp_field(name.as_str()))
1976                    && let Ok(decoded) = text.decode()
1977                {
1978                    let decoded = decoded.into_owned();
1979                    if !decoded.trim().is_empty() {
1980                        values.push(format_xmp_value(field, &decoded));
1981                    }
1982                }
1983            }
1984            Ok(Event::Eof) | Err(_) => break,
1985            _ => {}
1986        }
1987        buf.clear();
1988    }
1989
1990    values
1991}
1992
/// Returns the part of an XML name after its last `:`, or the whole name when it
/// has no prefix. Names that are not valid UTF-8 yield an empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    match qualified.rsplit_once(':') {
        Some((_prefix, local)) => local.to_owned(),
        None => qualified.to_owned(),
    }
}
1997
/// Maps the local name of an XMP element to the internal field key used by
/// `format_xmp_value`, or `None` when the element is not collected.
///
/// Matching is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    // (element local name, internal field key)
    const ALLOWED_FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    ALLOWED_FIELDS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, field)| field)
}
2010
2011fn format_xmp_value(field: &str, value: &str) -> String {
2012    match field {
2013        "creator" => format_metadata_field("Author", value),
2014        "rights" => format_metadata_field("Copyright", value),
2015        "description" => format_metadata_field("Description", value),
2016        "title" => format_metadata_field("Title", value),
2017        "subject" => format_metadata_field("Subject", value),
2018        "usage_terms" => format_metadata_field("UsageTerms", value),
2019        "web_statement" => format_metadata_field("WebStatement", value),
2020        _ => value.to_string(),
2021    }
2022}
2023
/// Joins a label and a value as `label: value`.
fn format_metadata_field(label: &str, value: &str) -> String {
    const SEPARATOR: &str = ": ";

    let mut rendered = String::with_capacity(label.len() + SEPARATOR.len() + value.len());
    rendered.push_str(label);
    rendered.push_str(SEPARATOR);
    rendered.push_str(value);
    rendered
}
2027
2028fn values_to_text(values: Vec<String>) -> String {
2029    let mut seen = BTreeSet::new();
2030    let mut normalized_lines = Vec::new();
2031
2032    for value in values {
2033        let normalized = normalize_metadata_value(&value);
2034        if normalized.is_empty() || !seen.insert(normalized.clone()) {
2035            continue;
2036        }
2037
2038        normalized_lines.push(normalized);
2039    }
2040
2041    let author_values: BTreeSet<String> = normalized_lines
2042        .iter()
2043        .filter_map(|line| split_metadata_field(line))
2044        .filter(|(label, _)| label.eq_ignore_ascii_case("Author"))
2045        .map(|(_, value)| value.to_string())
2046        .collect();
2047
2048    let mut lines = Vec::new();
2049    let mut total_bytes = 0usize;
2050
2051    for normalized in normalized_lines {
2052        if lines.len() >= MAX_IMAGE_METADATA_VALUES {
2053            break;
2054        }
2055
2056        if should_suppress_bare_copyright_metadata_line(&normalized, &author_values) {
2057            continue;
2058        }
2059
2060        let added_bytes = normalized.len() + usize::from(!lines.is_empty());
2061        if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
2062            break;
2063        }
2064
2065        total_bytes += added_bytes;
2066        lines.push(normalized);
2067    }
2068
2069    lines.join("\n")
2070}
2071
/// Splits a `label: value` line at its first `:` and trims both halves.
///
/// Returns `None` when the line contains no `:`.
fn split_metadata_field(line: &str) -> Option<(&str, &str)> {
    line.split_once(':')
        .map(|(label, value)| (label.trim(), value.trim()))
}
2076
2077fn should_suppress_bare_copyright_metadata_line(
2078    line: &str,
2079    author_values: &BTreeSet<String>,
2080) -> bool {
2081    let Some((label, value)) = split_metadata_field(line) else {
2082        return false;
2083    };
2084    if !label.eq_ignore_ascii_case("Copyright")
2085        || value.is_empty()
2086        || !author_values.contains(value)
2087    {
2088        return false;
2089    }
2090
2091    let lower = value.to_ascii_lowercase();
2092    !lower.contains("copyright")
2093        && !lower.contains("(c)")
2094        && !lower.contains('©')
2095        && !lower.contains("all rights")
2096        && !value.chars().any(|ch| ch.is_ascii_digit())
2097}
2098
/// Removes NUL characters from `value` and collapses every run of whitespace
/// into a single space, with no leading or trailing whitespace.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
2110
2111fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
2112    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
2113        return (String::new(), None);
2114    }
2115
2116    if bytes.len() > MAX_PDF_TEXT_EXTRACTION_BYTES {
2117        return (
2118            String::new(),
2119            Some(format!(
2120                "PDF text extraction skipped because file exceeds {} bytes",
2121                MAX_PDF_TEXT_EXTRACTION_BYTES
2122            )),
2123        );
2124    }
2125
2126    let mut failures = Vec::new();
2127    let mut saw_success = false;
2128
2129    let extracted = catch_unwind(AssertUnwindSafe(
2130        || -> Result<String, Box<dyn std::error::Error>> {
2131            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
2132            extract_first_pdf_page_text(&mut document)
2133        },
2134    ));
2135    match extracted {
2136        Ok(Ok(text)) => {
2137            saw_success = true;
2138            if let Some(normalized) = normalize_pdf_text(text) {
2139                return (normalized, None);
2140            }
2141        }
2142        Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
2143        Err(payload) => failures.push(format!(
2144            "from-bytes first-page panic: {}",
2145            panic_payload_to_string(payload.as_ref())
2146        )),
2147    }
2148
2149    let extracted = catch_unwind(AssertUnwindSafe(
2150        || -> Result<String, Box<dyn std::error::Error>> {
2151            let mut document = pdf_oxide::document::PdfDocument::open(path)?;
2152            extract_pdf_text_from_document(&mut document)
2153        },
2154    ));
2155    match extracted {
2156        Ok(Ok(text)) => {
2157            saw_success = true;
2158            if let Some(normalized) = normalize_pdf_text(text) {
2159                return (normalized, None);
2160            }
2161        }
2162        Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
2163        Err(payload) => failures.push(format!(
2164            "open full-document panic: {}",
2165            panic_payload_to_string(payload.as_ref())
2166        )),
2167    }
2168
2169    let extracted = catch_unwind(AssertUnwindSafe(
2170        || -> Result<String, Box<dyn std::error::Error>> {
2171            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
2172            extract_pdf_text_from_document(&mut document)
2173        },
2174    ));
2175    match extracted {
2176        Ok(Ok(text)) => {
2177            saw_success = true;
2178            if let Some(normalized) = normalize_pdf_text(text) {
2179                return (normalized, None);
2180            }
2181        }
2182        Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
2183        Err(payload) => failures.push(format!(
2184            "from-bytes full-document panic: {}",
2185            panic_payload_to_string(payload.as_ref())
2186        )),
2187    }
2188
2189    if saw_success || is_non_actionable_pdf_failure(&failures) {
2190        (String::new(), None)
2191    } else {
2192        (
2193            String::new(),
2194            Some(format!(
2195                "PDF text extraction failed after {} attempts: {}",
2196                failures.len(),
2197                failures.join("; ")
2198            )),
2199        )
2200    }
2201}
2202
/// Returns `true` when there is at least one failure and every failure message
/// contains one of the known non-actionable markers.
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(*marker))
    })
}
2213
/// Renders a panic payload as text.
///
/// Payloads of type `&str` or `String` are returned as-is; any other payload
/// type becomes `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
2223
2224fn extract_first_pdf_page_text(
2225    document: &mut pdf_oxide::document::PdfDocument,
2226) -> Result<String, Box<dyn std::error::Error>> {
2227    if document.page_count()? == 0 {
2228        return Ok(String::new());
2229    }
2230
2231    let extracted_text = document.extract_text(0)?;
2232    let markdown_text =
2233        document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
2234    if pdf_markdown_heading_lines(&markdown_text).is_empty() {
2235        return Ok(extracted_text);
2236    }
2237
2238    let pipeline_text =
2239        document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
2240
2241    Ok(merge_pdf_first_page_text(
2242        &extracted_text,
2243        &markdown_text,
2244        &pipeline_text,
2245    ))
2246}
2247
2248fn extract_pdf_text_from_document(
2249    document: &mut pdf_oxide::document::PdfDocument,
2250) -> Result<String, Box<dyn std::error::Error>> {
2251    Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
2252}
2253
/// Replaces every carriage return and form feed with a newline.
///
/// Returns `None` when the result contains nothing but whitespace.
fn normalize_pdf_text(text: String) -> Option<String> {
    let unified: String = text
        .chars()
        .map(|ch| match ch {
            '\r' | '\u{0c}' => '\n',
            other => other,
        })
        .collect();

    if unified.trim().is_empty() {
        None
    } else {
        Some(unified)
    }
}
2258
2259fn merge_pdf_first_page_text(
2260    _extracted_text: &str,
2261    markdown_text: &str,
2262    pipeline_text: &str,
2263) -> String {
2264    let pipeline = pipeline_text.trim();
2265    if pipeline.is_empty() {
2266        return String::new();
2267    }
2268
2269    let prefix = pdf_first_page_heading_prefix(markdown_text);
2270    let Some(prefix) = prefix else {
2271        return pipeline_text.to_string();
2272    };
2273
2274    if pdf_text_contains_heading_prefix(pipeline, &prefix) {
2275        pipeline_text.to_string()
2276    } else {
2277        format!("{prefix}\n\n{pipeline}")
2278    }
2279}
2280
2281fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
2282    normalize_pdf_heading_comparison_text(text)
2283        .contains(&normalize_pdf_heading_comparison_text(prefix))
2284}
2285
/// Lower-cases ASCII letters and collapses every run of whitespace into a single
/// space, so headings can be compared regardless of layout.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(&word.to_ascii_lowercase());
    }
    normalized
}
2292
2293fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
2294    let mut lines = Vec::new();
2295
2296    for line in pdf_markdown_heading_lines(markdown_text) {
2297        push_unique_line(&mut lines, line);
2298    }
2299
2300    (!lines.is_empty()).then(|| lines.join("\n"))
2301}
2302
2303fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
2304    text.lines()
2305        .map(str::trim)
2306        .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
2307        .map(|line| line.trim_matches('#').trim())
2308        .filter(|line| !line.is_empty())
2309        .filter(|line| !looks_like_numbered_section_heading(line))
2310        .take(4)
2311        .map(ToOwned::to_owned)
2312        .collect()
2313}
2314
/// Appends `line` to `lines` unless an identical entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
2320
/// Returns `true` when `line` starts with a single ASCII digit immediately
/// followed by `.`, as in `1. Scope`.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    // Both characters are ASCII, so inspecting the first two bytes is equivalent
    // to inspecting the first two characters.
    matches!(line.as_bytes(), [first, b'.', ..] if first.is_ascii_digit())
}
2333
/// Returns `true` when `bytes` starts with `PK` followed by one of the byte
/// pairs `03 04`, `05 06` or `07 08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
2339
/// Extracts runs of printable ASCII text from arbitrary bytes, one run per line.
///
/// Three passes are made over `bytes`, and their results are appended in order:
/// 1. single-byte runs of printable ASCII (`0x20..=0x7E`);
/// 2. two-byte units at even offsets whose first byte is printable ASCII and
///    whose second byte is zero (UTF-16LE-style text);
/// 3. the same two-byte scan shifted to odd offsets.
///
/// Runs shorter than four characters are discarded. Extraction returns early
/// once the output reaches a cap equal to the input length clamped between
/// 2,000,000 and 16,000,000 bytes. The cap is checked after a run is appended,
/// so the output may exceed it by the length of that last run.
///
/// Both two-byte passes read (character, zero) pairs and differ only in
/// alignment. This finds strings that start at an odd offset and avoids emitting
/// an even-aligned string a second time without its first character.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    /// Appends `run` to `out` as its own line when it is long enough, then resets it.
    fn flush_run(out: &mut String, run: &mut String) {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            out.push_str(run);
        }
        run.clear();
    }

    let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);

    let mut out = String::new();
    // Only printable ASCII is ever pushed, so the run is always valid UTF-8.
    let mut run = String::new();

    // Pass 1: single-byte text.
    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(char::from(b));
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= max_output_bytes {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= max_output_bytes {
        return out;
    }

    // Passes 2 and 3: (character, zero) pairs at even, then odd, alignment.
    for start in 0..=1 {
        run.clear();
        let aligned = bytes.get(start..).unwrap_or_default();
        for unit in aligned.chunks_exact(2) {
            if is_printable_ascii(unit[0]) && unit[1] == 0 {
                run.push(char::from(unit[0]));
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= max_output_bytes {
                    return out;
                }
            }
        }
        flush_run(&mut out, &mut run);
        if out.len() >= max_output_bytes {
            return out;
        }
    }

    out
}
2404
2405#[cfg(test)]
2406mod tests {
2407    use image::ImageFormat;
2408    use std::path::Path;
2409
2410    use crate::copyright::detect_copyrights;
2411
2412    use super::{
2413        ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, MAX_PDF_TEXT_EXTRACTION_BYTES,
2414        MAX_XMP_PACKET_BYTES, classify_file_info, extract_printable_strings,
2415        extract_raw_xmp_packet, extract_text_for_detection,
2416        extract_text_for_detection_with_diagnostics, format_metadata_field, format_xmp_value,
2417        is_non_actionable_pdf_failure, normalize_mime_type, normalize_pdf_heading_comparison_text,
2418        values_to_text, windows_metadata_or_empty_result,
2419    };
2420
2421    fn png_chunk(chunk_type: &[u8; 4], data: &[u8]) -> Vec<u8> {
2422        let mut out = Vec::new();
2423        out.extend_from_slice(&(data.len() as u32).to_be_bytes());
2424        out.extend_from_slice(chunk_type);
2425        out.extend_from_slice(data);
2426        out.extend_from_slice(&0u32.to_be_bytes());
2427        out
2428    }
2429
2430    fn build_png_with_xmp(xmp: &str) -> Vec<u8> {
2431        let mut bytes = Vec::new();
2432        bytes.extend_from_slice(b"\x89PNG\r\n\x1a\n");
2433
2434        let ihdr = [
2435            0, 0, 0, 1, // width
2436            0, 0, 0, 1, // height
2437            8, // bit depth
2438            2, // color type
2439            0, // compression
2440            0, // filter
2441            0, // interlace
2442        ];
2443        bytes.extend_from_slice(&png_chunk(b"IHDR", &ihdr));
2444
2445        let mut itxt = Vec::new();
2446        itxt.extend_from_slice(b"XML:com.adobe.xmp");
2447        itxt.push(0); // keyword terminator
2448        itxt.push(0); // compression flag
2449        itxt.push(0); // compression method
2450        itxt.push(0); // language tag terminator
2451        itxt.push(0); // translated keyword terminator
2452        itxt.extend_from_slice(xmp.as_bytes());
2453        bytes.extend_from_slice(&png_chunk(b"iTXt", &itxt));
2454
2455        bytes.extend_from_slice(&png_chunk(b"IEND", &[]));
2456        bytes
2457    }
2458
2459    #[test]
2460    fn test_extract_text_for_detection_skips_jar_archives() {
2461        let path = Path::new(
2462            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
2463        );
2464        let bytes = std::fs::read(path).expect("failed to read jar fixture");
2465
2466        let (text, kind) = extract_text_for_detection(path, &bytes);
2467
2468        assert!(text.is_empty());
2469        assert_eq!(kind, ExtractedTextKind::None);
2470    }
2471
2472    #[test]
2473    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
2474        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2475        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2476
2477        let (text, kind) = extract_text_for_detection(path, &bytes);
2478
2479        assert_eq!(kind, ExtractedTextKind::Pdf);
2480        assert!(text.contains("Redistribution and use in source and binary forms"));
2481    }
2482
2483    #[test]
2484    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
2485        let path =
2486            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2487        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2488
2489        let (text, kind) = extract_text_for_detection(path, &bytes);
2490
2491        assert_eq!(kind, ExtractedTextKind::Pdf);
2492        assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
2493        assert!(!text.contains("DISCLAIMER OF WARRANTY"));
2494    }
2495
2496    #[test]
2497    fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
2498        let path =
2499            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2500        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2501
2502        let (text, kind) = extract_text_for_detection(path, &bytes);
2503
2504        assert_eq!(kind, ExtractedTextKind::Pdf);
2505
2506        let normalized = normalize_pdf_heading_comparison_text(&text);
2507        let heading =
2508            normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
2509        assert_eq!(normalized.matches(&heading).count(), 1);
2510    }
2511
2512    #[test]
2513    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
2514        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2515        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2516
2517        let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
2518
2519        assert_eq!(kind, ExtractedTextKind::Pdf);
2520        assert!(text.contains("Redistribution and use in source and binary forms"));
2521    }
2522
2523    #[test]
2524    fn test_extract_text_for_detection_skips_oversized_pdf_payload() {
2525        let mut bytes = b"%PDF-1.7\n".to_vec();
2526        bytes.resize(MAX_PDF_TEXT_EXTRACTION_BYTES + 1, b'0');
2527
2528        let (text, kind, scan_error) =
2529            extract_text_for_detection_with_diagnostics(Path::new("oversized.pdf"), &bytes);
2530
2531        assert!(text.is_empty());
2532        assert_eq!(kind, ExtractedTextKind::None);
2533        assert!(
2534            scan_error
2535                .as_deref()
2536                .is_some_and(|message| message.contains("PDF text extraction skipped"))
2537        );
2538    }
2539
2540    #[test]
2541    fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
2542        let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";
2543
2544        let (text, kind, scan_error) =
2545            extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);
2546
2547        assert!(text.is_empty());
2548        assert_eq!(kind, ExtractedTextKind::None);
2549        let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
2550        assert!(scan_error.contains("PDF text extraction failed after"));
2551    }
2552
2553    #[test]
2554    fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
2555        let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2556
2557        let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
2558
2559        assert!(text.is_empty());
2560        assert_eq!(kind, ExtractedTextKind::None);
2561    }
2562
2563    #[test]
2564    fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
2565        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2566        let text = b"Copyright 2026 Example Project!!!";
2567        bytes[..text.len()].copy_from_slice(text);
2568        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2569        bytes[second_offset..second_offset + text.len()].copy_from_slice(text);
2570
2571        let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);
2572
2573        assert_ne!(kind, ExtractedTextKind::None);
2574        assert!(text.contains("Copyright 2026 Example Project"));
2575    }
2576
2577    #[test]
2578    fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
2579        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2580        let noise = b"(c) $1234567890ABCDEF[]{}--==++";
2581        bytes[..noise.len()].copy_from_slice(noise);
2582        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2583        bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);
2584
2585        let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);
2586
2587        assert!(text.is_empty());
2588        assert_eq!(kind, ExtractedTextKind::None);
2589    }
2590
2591    #[test]
2592    fn test_extract_text_for_detection_uses_windows_executable_metadata() {
2593        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2594        let bytes = std::fs::read(path).expect("read PE fixture");
2595
2596        let (text, kind) = extract_text_for_detection(path, &bytes);
2597
2598        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2599        assert!(text.contains("License: This program is free software"));
2600        assert!(text.contains("LegalCopyright:"));
2601    }
2602
2603    #[test]
2604    fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
2605    {
2606        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2607        let mut bytes = std::fs::read(path).expect("read PE fixture");
2608        bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);
2609
2610        let (text, kind) = extract_text_for_detection(path, &bytes);
2611
2612        assert_ne!(kind, ExtractedTextKind::None);
2613        assert!(!text.trim().is_empty());
2614    }
2615
2616    #[test]
2617    fn test_windows_metadata_or_empty_result_preserves_metadata() {
2618        let (text, kind, scan_error) =
2619            windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));
2620
2621        assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
2622        assert_eq!(text, "LegalCopyright: Example Corp");
2623        assert!(scan_error.is_none());
2624    }
2625
2626    #[test]
2627    fn test_format_xmp_value_labels_creator_and_title_fields() {
2628        assert_eq!(
2629            format_xmp_value("creator", "Chinmay Garde"),
2630            "Author: Chinmay Garde"
2631        );
2632        assert_eq!(
2633            format_xmp_value("title", "Bay Bridge At Night"),
2634            "Title: Bay Bridge At Night"
2635        );
2636        assert_eq!(
2637            format_xmp_value("description", "Embarcadero in the evening on Delta 3200"),
2638            "Description: Embarcadero in the evening on Delta 3200"
2639        );
2640    }
2641
2642    #[test]
2643    fn test_format_metadata_field_prefixes_exif_text() {
2644        assert_eq!(
2645            format_metadata_field("Author", "Chinmay Garde"),
2646            "Author: Chinmay Garde"
2647        );
2648        assert_eq!(
2649            format_metadata_field("Description", "Bay Bridge At Night"),
2650            "Description: Bay Bridge At Night"
2651        );
2652    }
2653
2654    #[test]
2655    fn test_extract_text_for_detection_keeps_image_author_separate_from_title_and_description() {
2656        let xmp = r#"<x:xmpmeta xmlns:x="adobe:ns:meta/"><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:creator>Chinmay Garde</dc:creator><dc:title>Bay Bridge At Night</dc:title><dc:description>Embarcadero in the evening on Delta 3200</dc:description></rdf:Description></rdf:RDF></x:xmpmeta>"#;
2657        let bytes = build_png_with_xmp(xmp);
2658
2659        let (text, kind) = extract_text_for_detection(Path::new("fixture.png"), &bytes);
2660
2661        assert_eq!(kind, ExtractedTextKind::ImageMetadata);
2662        assert!(text.contains("Author: Chinmay Garde"), "text: {text:?}");
2663        assert!(
2664            text.contains("Title: Bay Bridge At Night"),
2665            "text: {text:?}"
2666        );
2667        assert!(
2668            text.contains("Description: Embarcadero in the evening on Delta 3200"),
2669            "text: {text:?}"
2670        );
2671
2672        let (_copyrights, _holders, authors) = detect_copyrights(&text, None);
2673        assert_eq!(
2674            authors
2675                .iter()
2676                .map(|a| a.author.as_str())
2677                .collect::<Vec<_>>(),
2678            vec!["Chinmay Garde"],
2679            "authors: {authors:?}; text: {text:?}"
2680        );
2681    }
2682
2683    #[test]
2684    fn test_values_to_text_suppresses_bare_copyright_duplicate_of_author() {
2685        let text = values_to_text(vec![
2686            "Author: Chinmay Garde".to_string(),
2687            "Copyright: Chinmay Garde".to_string(),
2688            "Title: Bay Bridge At Night".to_string(),
2689        ]);
2690
2691        assert!(text.contains("Author: Chinmay Garde"), "text: {text:?}");
2692        assert!(
2693            text.contains("Title: Bay Bridge At Night"),
2694            "text: {text:?}"
2695        );
2696        assert!(!text.contains("Copyright: Chinmay Garde"), "text: {text:?}");
2697    }
2698
2699    #[test]
2700    fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
2701        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2702        let text = b"Copyright 2026 Example Project!!!";
2703        bytes[..text.len()].copy_from_slice(text);
2704
2705        let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);
2706
2707        assert!(text.is_empty());
2708        assert_eq!(kind, ExtractedTextKind::None);
2709    }
2710
2711    #[test]
2712    fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
2713        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2714        let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
2715        bytes[..text.len()].copy_from_slice(text);
2716
2717        let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);
2718
2719        assert_ne!(kind, ExtractedTextKind::None);
2720        assert!(text.contains("asn@redhat.com"));
2721        assert!(text.contains("https://publicsuffix.org/"));
2722    }
2723
2724    #[test]
2725    fn test_extract_text_for_detection_keeps_large_macho_with_off_window_legal_markers() {
2726        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES * 2];
2727        bytes[..4].copy_from_slice(&[0xCF, 0xFA, 0xED, 0xFE]);
2728        let apache_notice = b"// Licensed under the Apache License, Version 2.0 (the \"License\");\n// http://www.apache.org/licenses/LICENSE-2.0\n// SPDX-License-Identifier: Apache-2.0\n";
2729        let insert_offset = 200 * 1024;
2730        bytes[insert_offset..insert_offset + apache_notice.len()].copy_from_slice(apache_notice);
2731
2732        let (text, kind) = extract_text_for_detection(Path::new("node"), &bytes);
2733
2734        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2735        assert!(text.contains("Apache License, Version 2.0"), "{text}");
2736        assert!(
2737            text.contains("SPDX-License-Identifier: Apache-2.0"),
2738            "{text}"
2739        );
2740    }
2741
2742    #[test]
2743    fn test_extract_text_for_detection_keeps_large_macho_with_unicode_notice_markers() {
2744        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES * 2];
2745        bytes[..4].copy_from_slice(&[0xCF, 0xFA, 0xED, 0xFE]);
2746        let unicode_notice = b"Copyright (c) 1991-2024 Unicode, Inc.\nFor terms of use, see http://www.unicode.org/copyright.html\n";
2747        let insert_offset = 700 * 1024;
2748        bytes[insert_offset..insert_offset + unicode_notice.len()].copy_from_slice(unicode_notice);
2749
2750        let (text, kind) = extract_text_for_detection(Path::new("node"), &bytes);
2751
2752        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2753        assert!(text.contains("Unicode, Inc."), "{text}");
2754        assert!(text.contains("unicode.org/copyright.html"), "{text}");
2755    }
2756
2757    #[test]
2758    fn test_extract_text_for_detection_does_not_reopen_single_window_legal_noise_for_non_macho() {
2759        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES * 2];
2760        let apache_notice = b"// Licensed under the Apache License, Version 2.0 (the \"License\");\n// http://www.apache.org/licenses/LICENSE-2.0\n// SPDX-License-Identifier: Apache-2.0\n";
2761        let insert_offset = 200 * 1024;
2762        bytes[insert_offset..insert_offset + apache_notice.len()].copy_from_slice(apache_notice);
2763
2764        let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
2765
2766        assert!(text.is_empty());
2767        assert_eq!(kind, ExtractedTextKind::None);
2768    }
2769
2770    #[test]
2771    fn test_extract_text_for_detection_avoids_latin1_decode_for_binary_blob_noise() {
2772        let bytes = vec![
2773            0x28, 0x63, 0x29, 0x20, 0x4b, 0x30, 0x0e, 0x71, 0x86, 0x20, 0x62, 0x24, 0x4c,
2774        ];
2775
2776        let (text, kind) = extract_text_for_detection(Path::new("fixture.blb"), &bytes);
2777
2778        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2779        assert_eq!(text, "(c) K0\n b$L");
2780    }
2781
2782    #[test]
2783    fn test_extract_raw_xmp_packet_rejects_oversized_png_itxt_payload() {
2784        let xmp = "A".repeat(MAX_XMP_PACKET_BYTES + 1);
2785        let bytes = build_png_with_xmp(&xmp);
2786
2787        assert!(extract_raw_xmp_packet(&bytes, ImageFormat::Png).is_none());
2788    }
2789
2790    #[test]
2791    fn test_non_actionable_pdf_failures_are_suppressed() {
2792        assert!(is_non_actionable_pdf_failure(&[
2793            "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
2794            "open full-document: PDF is encrypted and requires a password".to_string(),
2795        ]));
2796        assert!(is_non_actionable_pdf_failure(&[
2797            "from-bytes first-page: Invalid cross-reference table".to_string(),
2798            "open full-document: Invalid cross-reference table".to_string(),
2799        ]));
2800        assert!(is_non_actionable_pdf_failure(&[
2801            "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O".to_string(),
2802            "open full-document: Invalid PDF: security handler cannot be found".to_string(),
2803        ]));
2804        assert!(!is_non_actionable_pdf_failure(&[
2805            "from-bytes first-page: some other parser failure".to_string(),
2806        ]));
2807    }
2808
2809    #[test]
2810    fn test_extract_text_for_detection_skips_zip_like_archives() {
2811        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";
2812
2813        let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
2814        let (crate_text, crate_kind) =
2815            extract_text_for_detection(Path::new("demo.crate"), zip_bytes);
2816
2817        assert!(whl_text.is_empty());
2818        assert_eq!(whl_kind, ExtractedTextKind::None);
2819        assert!(crate_text.is_empty());
2820        assert_eq!(crate_kind, ExtractedTextKind::None);
2821    }
2822
2823    #[test]
2824    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
2825        let path =
2826            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
2827        let bytes = std::fs::read(path).expect("failed to read lib fixture");
2828
2829        let (text, kind) = extract_text_for_detection(path, &bytes);
2830
2831        assert_ne!(kind, ExtractedTextKind::None);
2832        assert!(text.contains("Copyright nexB and others (c) 2012"));
2833    }
2834
2835    #[test]
2836    fn test_extract_text_for_detection_reads_font_metadata() {
2837        let path = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
2838        let bytes = std::fs::read(path).expect("failed to read font fixture");
2839
2840        let (text, kind) = extract_text_for_detection(path, &bytes);
2841
2842        assert_eq!(kind, ExtractedTextKind::FontMetadata);
2843        assert!(text.contains("License Description:"), "{text}");
2844        assert!(
2845            text.contains("Open Font License") || text.contains("OFL"),
2846            "{text}"
2847        );
2848        assert!(text.contains("Lato"), "{text}");
2849    }
2850
2851    #[test]
2852    fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
2853        let bytes = b"abcd\0".repeat(525_000);
2854
2855        let text = extract_printable_strings(&bytes);
2856
2857        assert!(
2858            text.len() > 2_000_000,
2859            "unexpected truncation at {}",
2860            text.len()
2861        );
2862        assert!(text.ends_with("abcd"));
2863    }
2864
2865    #[test]
2866    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
2867        let path = Path::new(
2868            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
2869        );
2870        let bytes = std::fs::read(path).expect("failed to read svg fixture");
2871
2872        let (text, kind) = extract_text_for_detection(path, &bytes);
2873
2874        assert_eq!(kind, ExtractedTextKind::Decoded);
2875        assert!(text.contains("creativecommons.org/licenses/publicdomain"));
2876    }
2877
2878    #[test]
2879    fn test_extract_text_for_detection_preserves_blank_comment_lines_in_utf8_source() {
2880        let path = Path::new("testdata/plugin_email_url/files/IMarkerActionFilter.java");
2881        let bytes = std::fs::read(path).expect("failed to read java fixture");
2882
2883        let (text, kind) = extract_text_for_detection(path, &bytes);
2884
2885        assert_eq!(kind, ExtractedTextKind::Decoded);
2886        let lines: Vec<_> = text.lines().collect();
2887        assert_eq!(lines.get(2).copied(), Some(" *"));
2888        assert_eq!(
2889            lines.get(3).copied(),
2890            Some(" *https://github.com/rpm-software-management")
2891        );
2892        assert_eq!(lines.get(5).copied(), Some("https://gitlab.com/Conan_Kudo"));
2893    }
2894
2895    #[test]
2896    fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
2897        let path = Path::new(
2898            "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
2899        );
2900        let bytes = std::fs::read(path).expect("failed to read rtf fixture");
2901
2902        let (text, kind) = extract_text_for_detection(path, &bytes);
2903
2904        assert_eq!(kind, ExtractedTextKind::Decoded);
2905        assert!(text.contains("GNU Lesser General Public"));
2906        assert!(text.contains("version"));
2907        assert!(text.contains("2.1 of the License"));
2908    }
2909
2910    #[test]
2911    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
2912        assert_eq!(
2913            normalize_mime_type(
2914                Path::new("main.ts"),
2915                b"export const answer = 42;\n",
2916                Some("TypeScript"),
2917                "video/mp2t",
2918            ),
2919            "text/plain"
2920        );
2921    }
2922
2923    #[test]
2924    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
2925        assert_eq!(
2926            normalize_mime_type(
2927                Path::new("main.js"),
2928                b"console.log('hello');\n",
2929                Some("JavaScript"),
2930                "application/octet-stream",
2931            ),
2932            "text/plain"
2933        );
2934    }
2935
2936    #[test]
2937    fn test_normalize_mime_type_preserves_binary_video_guess() {
2938        assert_eq!(
2939            normalize_mime_type(
2940                Path::new("main.ts"),
2941                &[0, 159, 146, 150, 0, 1, 2, 3],
2942                Some("TypeScript"),
2943                "video/mp2t",
2944            ),
2945            "video/mp2t"
2946        );
2947    }
2948
2949    #[test]
2950    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
2951        assert_eq!(
2952            normalize_mime_type(
2953                Path::new("main.ts"),
2954                &[0, 159, 146, 150],
2955                Some("TypeScript"),
2956                "application/octet-stream",
2957            ),
2958            "application/octet-stream"
2959        );
2960    }
2961
2962    #[test]
2963    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
2964        let classification = classify_file_info(Path::new("test.txt"), b"");
2965
2966        assert_eq!(classification.mime_type, "inode/x-empty");
2967        assert_eq!(classification.file_type, "empty");
2968        assert!(!classification.is_binary);
2969        assert!(classification.is_text);
2970        assert!(!classification.is_source);
2971        assert_eq!(classification.programming_language, None);
2972    }
2973
2974    #[test]
2975    fn test_classify_file_info_keeps_json_out_of_programming_language() {
2976        let classification = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);
2977
2978        assert_eq!(classification.mime_type, "application/json");
2979        assert_eq!(classification.file_type, "JSON text data");
2980        assert!(classification.is_text);
2981        assert!(!classification.is_source);
2982        assert_eq!(classification.programming_language, None);
2983    }
2984
2985    #[test]
2986    fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
2987        let classification =
2988            classify_file_info(Path::new("broken.json"), b"{ definitely not json\n");
2989
2990        assert_eq!(classification.mime_type, "text/plain");
2991        assert_eq!(classification.file_type, "UTF-8 Unicode text");
2992        assert!(classification.is_text);
2993        assert!(!classification.is_binary);
2994    }
2995
2996    #[test]
2997    fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
2998        let classification =
2999            classify_file_info(Path::new("broken.json"), &[0xff, 0x00, 0x01, 0x02]);
3000
3001        assert_eq!(classification.mime_type, "application/octet-stream");
3002        assert_eq!(classification.file_type, "data");
3003        assert!(classification.is_binary);
3004        assert!(!classification.is_text);
3005    }
3006
3007    #[test]
3008    fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
3009        let classification = classify_file_info(
3010            Path::new("utf16.json"),
3011            &[
3012                0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
3013            ],
3014        );
3015
3016        assert!(!classification.is_binary);
3017        assert!(classification.is_text);
3018        assert_eq!(classification.mime_type, "application/json");
3019        assert_eq!(classification.file_type, "JSON text data");
3020    }
3021
3022    #[test]
3023    fn test_classify_file_info_treats_valid_utf16be_json_without_bom_as_text() {
3024        let classification = classify_file_info(
3025            Path::new("utf16be.json"),
3026            &[0x00, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D],
3027        );
3028
3029        assert!(!classification.is_binary);
3030        assert!(classification.is_text);
3031        assert_eq!(classification.mime_type, "application/json");
3032        assert_eq!(classification.file_type, "JSON text data");
3033    }
3034
3035    #[test]
3036    fn test_classify_file_info_treats_small_valid_utf16be_json_literal_as_text() {
3037        let classification =
3038            classify_file_info(Path::new("utf16be-literal.json"), &[0x00, 0x5B, 0x00, 0x5D]);
3039
3040        assert!(!classification.is_binary);
3041        assert!(classification.is_text);
3042        assert_eq!(classification.mime_type, "application/json");
3043        assert_eq!(classification.file_type, "JSON text data");
3044    }
3045
3046    #[test]
3047    fn test_extract_text_for_detection_decodes_utf16be_text_with_corrupted_bom_prefix() {
3048        let mut bytes = super::CORRUPTED_UTF16_BOM_PREFIX.to_vec();
3049        for code_unit in
3050            "Licensed to the Apache Software Foundation\nApache License, Version 2.0".encode_utf16()
3051        {
3052            bytes.extend_from_slice(&code_unit.to_be_bytes());
3053        }
3054
3055        let (text, kind) = extract_text_for_detection(Path::new("notice.ftl"), &bytes);
3056
3057        assert_eq!(kind, ExtractedTextKind::Decoded);
3058        assert!(text.contains("Apache Software Foundation"), "{text}");
3059        assert!(text.contains("Apache License, Version 2.0"), "{text}");
3060    }
3061
3062    #[test]
3063    fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
3064        let classification = classify_file_info(Path::new("true.json"), b"true");
3065
3066        assert!(!classification.is_binary);
3067        assert!(classification.is_text);
3068        assert_eq!(classification.mime_type, "application/json");
3069        assert_eq!(classification.file_type, "JSON text data");
3070    }
3071
3072    #[test]
3073    fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
3074        let classification = classify_file_info(
3075            Path::new("wrapped.json"),
3076            &[0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D],
3077        );
3078
3079        assert!(!classification.is_binary);
3080        assert!(classification.is_text);
3081        assert_eq!(classification.mime_type, "text/plain");
3082        assert_eq!(classification.file_type, "text, with no line terminators");
3083    }
3084
3085    #[test]
3086    fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
3087        let classification =
3088            classify_file_info(Path::new("lone-ff.json"), &[0x5B, 0x22, 0xFF, 0x22, 0x5D]);
3089
3090        assert!(classification.is_binary);
3091        assert!(!classification.is_text);
3092        assert_eq!(classification.mime_type, "application/octet-stream");
3093        assert_eq!(classification.file_type, "data");
3094    }
3095
3096    #[test]
3097    fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
3098        let classification = classify_file_info(
3099            Path::new("crash.json"),
3100            &[
3101                0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
3102            ],
3103        );
3104
3105        assert!(classification.is_binary);
3106        assert!(!classification.is_text);
3107        assert_eq!(classification.mime_type, "application/octet-stream");
3108    }
3109
3110    #[test]
3111    fn test_classify_file_info_treats_dockerfile_as_source() {
3112        let classification = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");
3113
3114        assert_eq!(
3115            classification.programming_language.as_deref(),
3116            Some("Dockerfile")
3117        );
3118        assert!(classification.is_source);
3119        assert!(!classification.is_script);
3120        assert_eq!(
3121            classification.file_type,
3122            "Dockerfile source, UTF-8 Unicode text"
3123        );
3124    }
3125
3126    #[test]
3127    fn test_classify_file_info_treats_makefile_as_text_not_source() {
3128        let classification = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");
3129
3130        assert_eq!(classification.programming_language, None);
3131        assert!(classification.is_text);
3132        assert!(!classification.is_source);
3133        assert!(!classification.is_script);
3134        assert_eq!(classification.file_type, "UTF-8 Unicode text");
3135    }
3136
3137    #[test]
3138    fn test_classify_file_info_marks_supported_package_archives() {
3139        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";
3140
3141        let egg = classify_file_info(Path::new("demo.egg"), zip_bytes);
3142        let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_bytes);
3143
3144        assert!(egg.is_archive);
3145        assert_eq!(egg.mime_type, "application/zip");
3146        assert_eq!(egg.file_type, "Zip archive data");
3147        assert!(nupkg.is_archive);
3148        assert_eq!(nupkg.mime_type, "application/zip");
3149        assert_eq!(nupkg.file_type, "Zip archive data");
3150    }
3151
3152    #[test]
3153    fn test_classify_file_info_marks_png_as_binary_media() {
3154        let png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";
3155
3156        let classification = classify_file_info(Path::new("logo.png"), png_bytes);
3157
3158        assert_eq!(classification.mime_type, "image/png");
3159        assert_eq!(classification.file_type, "PNG image data");
3160        assert!(classification.is_binary);
3161        assert!(!classification.is_text);
3162        assert!(classification.is_media);
3163        assert!(!classification.is_archive);
3164        assert!(!classification.is_source);
3165    }
3166
3167    #[test]
3168    fn test_classify_file_info_marks_pdf_as_binary_document() {
3169        let pdf_bytes = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";
3170
3171        let classification = classify_file_info(Path::new("report.pdf"), pdf_bytes);
3172
3173        assert_eq!(classification.mime_type, "application/pdf");
3174        assert_eq!(classification.file_type, "PDF document");
3175        assert!(classification.is_binary);
3176        assert!(!classification.is_text);
3177        assert!(!classification.is_archive);
3178        assert!(!classification.is_media);
3179    }
3180
3181    #[test]
3182    fn test_classify_file_info_marks_binary_blobs_as_binary() {
3183        let classification =
3184            classify_file_info(Path::new("blob.bin"), &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5]);
3185
3186        assert!(classification.is_binary);
3187        assert!(!classification.is_text);
3188        assert!(!classification.is_source);
3189        assert_eq!(classification.programming_language, None);
3190    }
3191
3192    #[test]
3193    fn test_classify_file_info_treats_yaml_as_text_not_source() {
3194        let classification = classify_file_info(Path::new("config.yaml"), b"key: value\n");
3195
3196        assert_eq!(classification.programming_language, None);
3197        assert!(classification.is_text);
3198        assert!(!classification.is_source);
3199        assert_eq!(classification.file_type, "YAML text data");
3200    }
3201
3202    #[test]
3203    fn test_classify_file_info_classifies_common_build_manifests() {
3204        let gradle = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
3205        let flake = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
3206        let cmake = classify_file_info(
3207            Path::new("toolchain.cmake"),
3208            b"set(CMAKE_CXX_STANDARD 20)\n",
3209        );
3210        let gitmodules = classify_file_info(
3211            Path::new(".gitmodules"),
3212            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
3213        );
3214
3215        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
3216        assert!(gradle.is_source);
3217        assert_eq!(gradle.mime_type, "text/plain");
3218        assert_eq!(gradle.file_type, "Groovy source, UTF-8 Unicode text");
3219
3220        assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
3221        assert!(flake.is_source);
3222        assert_eq!(flake.mime_type, "text/plain");
3223        assert_eq!(flake.file_type, "Nix source, UTF-8 Unicode text");
3224
3225        assert_eq!(cmake.programming_language.as_deref(), Some("CMake"));
3226        assert!(cmake.is_source);
3227        assert_eq!(cmake.file_type, "CMake source, UTF-8 Unicode text");
3228
3229        assert_eq!(gitmodules.programming_language, None);
3230        assert!(gitmodules.is_text);
3231        assert!(!gitmodules.is_source);
3232        assert_eq!(gitmodules.file_type, "Git configuration text");
3233    }
3234
3235    #[test]
3236    fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
3237        let header = classify_file_info(
3238            Path::new("include/demo.hpp"),
3239            b"#pragma once\nclass Demo {};\n",
3240        );
3241        let ipp = classify_file_info(
3242            Path::new("include/detail/demo.ipp"),
3243            b"template <class T> void parse() {}\n",
3244        );
3245
3246        assert_eq!(header.programming_language.as_deref(), Some("C++"));
3247        assert!(header.is_source);
3248        assert!(!header.is_script);
3249        assert_eq!(header.file_type, "C++ source, UTF-8 Unicode text");
3250
3251        assert_eq!(ipp.programming_language, None);
3252        assert!(!ipp.is_source);
3253        assert!(!ipp.is_script);
3254        assert_eq!(ipp.file_type, "UTF-8 Unicode text");
3255    }
3256
3257    #[test]
3258    fn test_classify_file_info_preserves_specific_shell_family_labels() {
3259        let bash = classify_file_info(Path::new("bin/run"), b"#!/usr/bin/env bash\necho hi\n");
3260
3261        assert_eq!(bash.programming_language.as_deref(), Some("Bash"));
3262        assert!(bash.is_script);
3263        assert_eq!(bash.file_type, "bash script, UTF-8 Unicode text executable");
3264    }
3265
3266    #[test]
3267    fn test_classify_file_info_marks_jamfile_as_source() {
3268        let jamfile = classify_file_info(Path::new("Jamfile"), b"lib boost_json ;\n");
3269
3270        assert_eq!(jamfile.programming_language.as_deref(), Some("Jamfile"));
3271        assert!(jamfile.is_source);
3272        assert!(!jamfile.is_script);
3273        assert_eq!(jamfile.file_type, "Jamfile source, UTF-8 Unicode text");
3274    }
3275
3276    #[test]
3277    fn test_classify_file_info_labels_javascript_shebang_scripts() {
3278        let classification = classify_file_info(
3279            Path::new("bin/run"),
3280            b"#!/usr/bin/env node\nconsole.log('hello');\n",
3281        );
3282
3283        assert_eq!(
3284            classification.programming_language.as_deref(),
3285            Some("JavaScript")
3286        );
3287        assert!(classification.is_script);
3288        assert_eq!(
3289            classification.file_type,
3290            "javascript script, UTF-8 Unicode text executable"
3291        );
3292    }
3293
3294    #[test]
3295    fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
3296        let classification = classify_file_info(
3297            Path::new("script.py"),
3298            b"# coding: latin-1\nprint(\"caf\xe9\")\n",
3299        );
3300
3301        assert_eq!(
3302            classification.programming_language.as_deref(),
3303            Some("Python")
3304        );
3305        assert!(classification.is_script);
3306        assert_eq!(classification.file_type, "python script, text executable");
3307    }
3308
3309    #[test]
3310    fn test_classify_file_info_treats_textual_tga_as_media() {
3311        let classification = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");
3312
3313        assert!(classification.is_media);
3314        assert!(classification.is_text);
3315        assert!(!classification.is_binary);
3316    }
3317
3318    #[test]
3319    fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
3320        let classification =
3321            classify_file_info(Path::new("main.ts"), &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85]);
3322
3323        assert!(classification.is_binary);
3324        assert!(!classification.is_text);
3325        assert!(!classification.is_source);
3326        assert_eq!(classification.programming_language, None);
3327    }
3328
3329    #[test]
3330    fn test_extract_text_for_detection_skips_unsupported_image_formats() {
3331        let gif_bytes = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";
3332
3333        let (text, kind) = extract_text_for_detection(Path::new("tiny.gif"), gif_bytes);
3334
3335        assert!(text.is_empty());
3336        assert_eq!(kind, ExtractedTextKind::None);
3337    }
3338
3339    #[test]
3340    fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
3341        let cases = [
3342            (
3343                Path::new("bin/run"),
3344                b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
3345                Some("JavaScript"),
3346                true,
3347                true,
3348            ),
3349            (
3350                Path::new("Dockerfile"),
3351                b"FROM scratch\n".as_slice(),
3352                Some("Dockerfile"),
3353                true,
3354                false,
3355            ),
3356            (
3357                Path::new("package.json"),
3358                br#"{"name":"demo"}"#.as_slice(),
3359                None,
3360                false,
3361                false,
3362            ),
3363            (
3364                Path::new("config.yaml"),
3365                b"key: value\n".as_slice(),
3366                None,
3367                false,
3368                false,
3369            ),
3370            (
3371                Path::new("Makefile"),
3372                b"all:\n\techo hi\n".as_slice(),
3373                None,
3374                false,
3375                false,
3376            ),
3377        ];
3378
3379        for (path, bytes, expected_language, expected_is_source, expected_is_script) in cases {
3380            let classification = classify_file_info(path, bytes);
3381
3382            assert_eq!(
3383                classification.programming_language.as_deref(),
3384                expected_language,
3385                "unexpected language for {}",
3386                path.display()
3387            );
3388            assert_eq!(
3389                classification.is_source,
3390                expected_is_source,
3391                "unexpected is_source for {}",
3392                path.display()
3393            );
3394            assert_eq!(
3395                classification.is_script,
3396                expected_is_script,
3397                "unexpected is_script for {}",
3398                path.display()
3399            );
3400        }
3401    }
3402}