Skip to main content

provenant/utils/
file.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use std::borrow::Cow;
5use std::collections::BTreeSet;
6use std::fs;
7use std::io::{BufReader, Cursor, Read};
8use std::panic::{AssertUnwindSafe, catch_unwind};
9use std::path::Path;
10
11use chrono::{TimeZone, Utc};
12use file_format::{FileFormat, Kind as FileFormatKind};
13use flate2::read::ZlibDecoder;
14use glob::Pattern;
15use image::{ImageDecoder, ImageFormat, ImageReader};
16use mime_guess::from_path;
17use quick_xml::events::Event;
18use quick_xml::reader::Reader as XmlReader;
19
20use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
21use crate::utils::font::extract_font_metadata_text;
22use crate::utils::language::detect_language;
23
/// Describes how the text returned by the text-extraction entry points was obtained.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No text was extracted; the accompanying string is empty.
    None,
    /// The bytes were decoded as text (this also covers text recovered from RTF input).
    Decoded,
    /// Font metadata text, optionally followed by printable strings found in the bytes.
    FontMetadata,
    /// Text extracted from a PDF document.
    Pdf,
    /// Printable strings scraped from otherwise binary content.
    BinaryStrings,
    /// Metadata text extracted from a supported image format.
    ImageMetadata,
    /// Windows executable metadata text, returned on its own when no other text was extracted.
    WindowsExecutableMetadata,
}
34
/// Classification of a single file as produced by `classify_file_info`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type as computed by `detect_mime_type`.
    pub mime_type: String,
    /// Human-readable file type description as computed by `detect_file_type`.
    pub file_type: String,
    /// Detected programming language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content is considered binary.
    pub is_binary: bool,
    /// Whether the content is considered text; always the negation of `is_binary`.
    pub is_text: bool,
    /// Whether the file is an archive, compressed file or package.
    pub is_archive: bool,
    /// Whether the file is an image, audio or video file.
    pub is_media: bool,
    /// Whether the file is source code.
    pub is_source: bool,
    /// Whether the file is a script (shebang, script extension or scripting language).
    pub is_script: bool,
}
47
// NOTE(review): the image, opaque-binary, XMP and PDF limits below are not used in the
// section of the file reviewed here; their descriptions are inferred from the names and
// should be confirmed against the code that consumes them.

/// Presumably the maximum number of metadata values collected per image — TODO confirm.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
/// Presumably the byte cap on extracted image metadata text (32 KiB) — TODO confirm.
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
/// A buffer is treated as binary when its control-byte count exceeds `len / this divisor`
/// (see `has_binary_control_chars`).
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
/// Presumably the size (512 KiB) from which opaque binaries get reduced text extraction —
/// TODO confirm.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
/// Inputs larger than this (4 MiB) are never parsed as JSON by `has_valid_json_text`.
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;
/// Presumably the byte cap on an XMP packet (256 KiB) — TODO confirm.
const MAX_XMP_PACKET_BYTES: usize = 256 * 1024;
/// Presumably the largest PDF (32 MiB) for which text extraction is attempted — TODO confirm.
const MAX_PDF_TEXT_EXTRACTION_BYTES: usize = 32 * 1024 * 1024;
/// Lowercase extensions that mark a path as plain text when deciding whether decoded
/// bytes should be accepted as text.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
/// Lowercase extensions that `detect_is_binary` treats as binary unless the sniffed
/// format is textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
/// Lowercase extensions that `detect_is_archive` accepts as archives for non-text files.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
65
66/// Get the last modified date of a file as a `YYYY-MM-DD` string.
67pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
68    metadata.modified().ok().map(|time: std::time::SystemTime| {
69        let seconds_since_epoch = time
70            .duration_since(std::time::UNIX_EPOCH)
71            .unwrap()
72            .as_secs() as i64;
73
74        Utc.timestamp_opt(seconds_since_epoch, 0)
75            .single()
76            .unwrap_or_else(Utc::now)
77            .format("%Y-%m-%d")
78            .to_string()
79    })
80}
81
82/// Check if a path should be excluded based on a list of glob patterns.
83pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
84    let path_str = path.to_string_lossy();
85    let file_name = path
86        .file_name()
87        .map(|name| name.to_string_lossy())
88        .unwrap_or_default();
89
90    for pattern in exclude_patterns {
91        // Match against full path
92        if pattern.matches(&path_str) {
93            return true;
94        }
95
96        // Match against just the file/directory name
97        if pattern.matches(&file_name) {
98            return true;
99        }
100    }
101
102    false
103}
104
105/// Decode a byte buffer to a String, trying UTF-16 first when the byte shape
106/// strongly suggests it, then UTF-8, then Latin-1.
107///
108/// Latin-1 (ISO-8859-1) maps bytes 0x00-0xFF directly to Unicode U+0000-U+00FF,
109/// so it can decode any byte sequence. This matches Python ScanCode's use of
110/// `UnicodeDammit` which auto-detects encoding with Latin-1 as fallback.
111pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
112    if let Some(decoded) = decode_utf16_text(bytes) {
113        return decoded;
114    }
115
116    match String::from_utf8(bytes.to_vec()) {
117        Ok(s) => s,
118        Err(e) => {
119            let bytes = e.into_bytes();
120            if has_binary_control_chars(&bytes) {
121                return String::new();
122            }
123            bytes.iter().map(|&b| b as char).collect()
124        }
125    }
126}
127
128pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
129    let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
130    (text, kind)
131}
132
133pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
134    let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
135        return Cow::Borrowed(text);
136    };
137    if !matches!(
138        extension.to_ascii_lowercase().as_str(),
139        "md" | "markdown" | "html" | "htm"
140    ) {
141        return Cow::Borrowed(text);
142    }
143
144    let mut hints = Vec::new();
145    let has_dual_license_notice = has_dual_license_notice_text(text);
146    if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
147        hints.push("Creative Commons Attribution 4.0 International License".to_string());
148    }
149    if !has_dual_license_notice
150        && (text.contains("Apache License (Version 2.0)")
151            || text.contains("Apache License, Version 2.0"))
152    {
153        hints.push(
154            "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
155                .to_string(),
156        );
157    }
158
159    if !has_dual_license_notice {
160        hints.extend(extract_shields_license_badge_hints(text));
161    }
162
163    if hints.is_empty() {
164        Cow::Borrowed(text)
165    } else {
166        let mut augmented =
167            String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
168        augmented.push_str(text);
169        augmented.push_str("\n\n");
170        for (index, hint) in hints.into_iter().enumerate() {
171            if index > 0 {
172                augmented.push('\n');
173            }
174            augmented.push_str(&hint);
175        }
176        Cow::Owned(augmented)
177    }
178}
179
180fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
181    let mut hints = Vec::new();
182    let mut rest = text;
183    let needle = "img.shields.io/badge/license-";
184
185    while let Some(index) = rest.find(needle) {
186        let start = index + needle.len();
187        let suffix = &rest[start..];
188        let end = suffix
189            .find([')', ']', '"', '\'', ' ', '\n'])
190            .unwrap_or(suffix.len());
191        let badge = &suffix[..end];
192        let Some(badge) = badge.strip_suffix(".svg") else {
193            rest = &suffix[end..];
194            continue;
195        };
196
197        let mut segments: Vec<_> = badge
198            .split('-')
199            .filter(|segment| !segment.is_empty())
200            .collect();
201        if segments.len() < 2 {
202            rest = &suffix[end..];
203            continue;
204        }
205        segments.pop();
206        let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
207        if !candidate.is_empty() {
208            hints.push(canonical_shields_license_hint(&candidate));
209        }
210
211        rest = &suffix[end..];
212    }
213
214    hints.sort();
215    hints.dedup();
216    hints
217}
218
/// Report whether `text` contains a dual-license notice (ASCII case-insensitive).
///
/// Recognised forms are "dual-licensed under", "dual licensed under", and the
/// combination of "licensed under either of" with "at your option".
fn has_dual_license_notice_text(text: &str) -> bool {
    let normalized = text.to_ascii_lowercase();
    let mentions = |phrase: &str| normalized.contains(phrase);

    mentions("dual-licensed under")
        || mentions("dual licensed under")
        || (mentions("licensed under either of") && mentions("at your option"))
}
225
/// Turn a badge label into a license phrase.
///
/// Surrounding whitespace is ignored. `MIT` and the two Apache 2.0 spellings map to
/// fixed phrases; any other label gets " License" appended.
fn canonical_shields_license_hint(candidate: &str) -> String {
    let label = candidate.trim();
    if label == "MIT" {
        String::from("The MIT License")
    } else if matches!(label, "Apache-2.0" | "Apache 2.0") {
        String::from("Apache License 2.0")
    } else {
        format!("{label} License")
    }
}
233
/// Extract text from `bytes` for detection and report how it was obtained.
///
/// Returns `(text, kind, diagnostic)`. In this function the diagnostic is only ever
/// populated on the PDF path, when no text could be extracted and the PDF extractor
/// reported an error.
///
/// Extractors are tried in a fixed order and the first applicable one wins:
/// RTF, PDF, image metadata, font metadata, then the generic path (Windows executable
/// metadata, decoded text, printable binary strings).
pub(crate) fn extract_text_for_detection_with_diagnostics(
    path: &Path,
    bytes: &[u8],
) -> (String, ExtractedTextKind, Option<String>) {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(|s| s.to_ascii_lowercase());
    let detected_format = detect_file_format(bytes);

    // RTF: reported as plain decoded text, or as nothing when only whitespace remains.
    if looks_like_rtf(bytes, ext.as_deref()) {
        let text = extract_rtf_text(bytes);
        return if text.trim().is_empty() {
            (String::new(), ExtractedTextKind::None, None)
        } else {
            (text, ExtractedTextKind::Decoded, None)
        };
    }

    // PDF: the scan error is only surfaced when no text was extracted.
    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
        let (text, scan_error) = extract_pdf_text(path, bytes);
        return if text.is_empty() {
            (String::new(), ExtractedTextKind::None, scan_error)
        } else {
            (text, ExtractedTextKind::Pdf, None)
        };
    }

    // Images: prefer metadata text. Without metadata, a genuine image container yields
    // nothing, while anything else falls back to plain decoding of the bytes.
    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
        let text = extract_image_metadata_text(bytes, format);
        return if text.is_empty() {
            if is_supported_image_container(bytes, format) {
                (String::new(), ExtractedTextKind::None, None)
            } else {
                let decoded = decode_bytes_to_string(bytes);
                if decoded.is_empty() {
                    (String::new(), ExtractedTextKind::None, None)
                } else {
                    (decoded, ExtractedTextKind::Decoded, None)
                }
            }
        } else {
            (text, ExtractedTextKind::ImageMetadata, None)
        };
    }

    // Fonts: metadata text first, followed by any printable strings in the file.
    if let Some(text) = extract_font_metadata_text(path, bytes) {
        let strings = extract_printable_strings(bytes);
        let combined = if strings.is_empty() {
            text
        } else {
            combine_extracted_text_fragments(Some(text), strings)
        };
        return (combined, ExtractedTextKind::FontMetadata, None);
    }

    // Generic path. A file only counts as a large opaque binary when it carries no
    // Windows executable metadata.
    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
    let large_opaque_binary = windows_executable_metadata_text.is_none()
        && is_large_opaque_binary_candidate(bytes, detected_format);

    if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
        return windows_metadata_or_empty_result(windows_executable_metadata_text);
    }

    // NOTE(review): this early return drops any Windows executable metadata found above.
    if should_skip_binary_string_extraction(path, bytes, detected_format) {
        return (String::new(), ExtractedTextKind::None, None);
    }

    let is_svg_text = lower_extension(path).as_deref() == Some("svg")
        || detected_format.media_type() == "image/svg+xml";
    let should_try_decoded_text = looks_like_textual_bytes(bytes) || is_svg_text;
    let decoded_is_utf8 = std::str::from_utf8(bytes).is_ok();
    let path_suggests_text = ext.as_deref().is_some_and(|extension| {
        PLAIN_TEXT_EXTENSIONS.contains(&extension) || detect_language(path, bytes).is_some()
    });

    // Accept the decoded form only when there is independent evidence that it is text:
    // SVG, valid UTF-8, a text-like path, or a text-like shape of the decoded string.
    if !large_opaque_binary && should_try_decoded_text {
        let decoded = decode_bytes_to_string(bytes);
        if !decoded.is_empty()
            && (is_svg_text
                || decoded_is_utf8
                || path_suggests_text
                || looks_like_decoded_text(&decoded))
        {
            let combined =
                combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
            return (combined, ExtractedTextKind::Decoded, None);
        }
    }

    // Last resort: printable strings, sampled rather than exhaustive for large opaque
    // binaries.
    let text = if large_opaque_binary {
        extract_sampled_printable_strings(bytes)
    } else {
        extract_printable_strings(bytes)
    };
    if text.is_empty() {
        windows_metadata_or_empty_result(windows_executable_metadata_text)
    } else {
        (
            combine_extracted_text_fragments(windows_executable_metadata_text, text),
            ExtractedTextKind::BinaryStrings,
            None,
        )
    }
}
339
/// Join an optional leading fragment and a trailing fragment with a newline.
///
/// A missing or empty `prefix` yields `suffix` unchanged; an empty `suffix` yields the
/// prefix alone, so no stray newline is ever produced.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(lead) = prefix.filter(|fragment| !fragment.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        lead
    } else {
        format!("{lead}\n{suffix}")
    }
}
347
348fn windows_metadata_or_empty_result(
349    windows_executable_metadata_text: Option<String>,
350) -> (String, ExtractedTextKind, Option<String>) {
351    if let Some(metadata_text) = windows_executable_metadata_text {
352        (
353            metadata_text,
354            ExtractedTextKind::WindowsExecutableMetadata,
355            None,
356        )
357    } else {
358        (String::new(), ExtractedTextKind::None, None)
359    }
360}
361
362pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
363    let detected_format = detect_file_format(bytes);
364    let detected_language = detect_language(path, bytes);
365    let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
366    let is_text = !is_binary;
367    let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
368    let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
369    let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
370    let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
371    let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
372    let programming_language = is_source.then(|| detected_language.clone()).flatten();
373    let file_type = detect_file_type(
374        path,
375        bytes,
376        detected_format,
377        &mime_type,
378        programming_language.as_deref(),
379        is_binary,
380        is_text,
381        is_archive,
382        is_media,
383        is_script,
384    );
385
386    FileInfoClassification {
387        mime_type,
388        file_type,
389        programming_language,
390        is_binary,
391        is_text,
392        is_archive,
393        is_media,
394        is_source,
395        is_script,
396    }
397}
398
399fn detect_file_format(bytes: &[u8]) -> FileFormat {
400    FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
401}
402
/// Two consecutive UTF-8 encodings of U+FFFD (the replacement character).
/// NOTE(review): presumably what is left of a UTF-16 BOM after a lossy UTF-8
/// conversion upstream — confirm against the inputs that motivated this.
const CORRUPTED_UTF16_BOM_PREFIX: &[u8] = &[0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD];
404
/// Report whether `bytes` form valid UTF-8 (the empty slice is valid).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
408
409fn strip_corrupted_utf16_bom_prefix(bytes: &[u8]) -> &[u8] {
410    bytes
411        .strip_prefix(CORRUPTED_UTF16_BOM_PREFIX)
412        .unwrap_or(bytes)
413}
414
415fn decode_utf16_units(bytes: &[u8], is_le: bool, require_text_shape: bool) -> Option<String> {
416    if bytes.is_empty() || !bytes.len().is_multiple_of(2) {
417        return None;
418    }
419
420    let code_units: Vec<u16> = bytes
421        .chunks_exact(2)
422        .map(|chunk| {
423            if is_le {
424                u16::from_le_bytes([chunk[0], chunk[1]])
425            } else {
426                u16::from_be_bytes([chunk[0], chunk[1]])
427            }
428        })
429        .collect();
430
431    let decoded = std::char::decode_utf16(code_units)
432        .collect::<Result<String, _>>()
433        .ok()?;
434
435    if !require_text_shape {
436        return (!decoded.contains('\0')).then_some(decoded);
437    }
438
439    if !looks_like_decoded_text(&decoded) {
440        return None;
441    }
442
443    Some(decoded)
444}
445
/// Heuristically decide whether a decoded string reads like text.
///
/// The string is rejected when it contains any control character other than newline,
/// carriage return or tab, or when it has fewer than three characters. Otherwise
/// letters, whitespace and a fixed set of structural punctuation must account for at
/// least roughly 80% of the characters, and there must be at least one letter or two
/// structural punctuation characters.
fn looks_like_decoded_text(decoded: &str) -> bool {
    const STRUCTURAL: &[char] = &[
        '{', '}', '[', ']', '<', '>', '(', ')', ':', ';', ',', '"', '\'', '/', '=', '-', '_',
        '#', '!',
    ];

    let mut total = 0usize;
    let mut letters = 0usize;
    let mut structural = 0usize;
    let mut spaces = 0usize;

    for ch in decoded.chars() {
        // Any control character besides the common line/indent ones (NUL included)
        // disqualifies the whole string.
        if ch.is_control() && !matches!(ch, '\n' | '\r' | '\t') {
            return false;
        }
        total += 1;
        if ch.is_alphabetic() {
            letters += 1;
        }
        if STRUCTURAL.contains(&ch) {
            structural += 1;
        }
        if ch.is_whitespace() {
            spaces += 1;
        }
    }

    if total < 3 {
        return false;
    }

    // Allow up to a fifth of the characters to be something else (digits, symbols).
    let textish = letters + structural + spaces;
    textish + total / 5 >= total && (letters > 0 || structural >= 2)
}
494
495fn detect_utf16_endianness(bytes: &[u8]) -> Option<bool> {
496    let stripped = strip_corrupted_utf16_bom_prefix(bytes);
497    if stripped.len() < 4 || !stripped.len().is_multiple_of(2) {
498        return None;
499    }
500
501    let pair_count = stripped.len() / 2;
502    let even_zero = stripped.iter().step_by(2).filter(|&&b| b == 0).count();
503    let odd_zero = stripped
504        .iter()
505        .skip(1)
506        .step_by(2)
507        .filter(|&&b| b == 0)
508        .count();
509
510    let looks_like_be = even_zero * 3 >= pair_count && odd_zero * 6 <= pair_count;
511    let looks_like_le = odd_zero * 3 >= pair_count && even_zero * 6 <= pair_count;
512
513    match (looks_like_le, looks_like_be) {
514        (true, false) => Some(true),
515        (false, true) => Some(false),
516        (true, true) => Some(true),
517        (false, false) => None,
518    }
519}
520
521fn decode_utf16_text(bytes: &[u8]) -> Option<String> {
522    if let Some(decoded) = decode_utf16_bom_text(bytes) {
523        return Some(decoded);
524    }
525
526    let stripped = strip_corrupted_utf16_bom_prefix(bytes);
527    match detect_utf16_endianness(bytes) {
528        Some(true) => decode_utf16_units(stripped, true, true),
529        Some(false) => decode_utf16_units(stripped, false, true),
530        None => None,
531    }
532}
533
534fn decode_utf16_json_text(bytes: &[u8]) -> Option<String> {
535    if bytes.len() >= 2 {
536        let (is_le, body) = match bytes {
537            [0xFF, 0xFE, rest @ ..] => (true, rest),
538            [0xFE, 0xFF, rest @ ..] => (false, rest),
539            _ => {
540                let stripped = strip_corrupted_utf16_bom_prefix(bytes);
541                return match detect_utf16_endianness(bytes) {
542                    Some(true) => decode_utf16_units(stripped, true, false),
543                    Some(false) => decode_utf16_units(stripped, false, false),
544                    None => None,
545                };
546            }
547        };
548
549        if body.is_empty() || !body.len().is_multiple_of(2) {
550            return None;
551        }
552
553        return decode_utf16_units(body, is_le, false);
554    }
555
556    None
557}
558
559fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
560    if bytes.len() < 2 || !bytes.len().is_multiple_of(2) {
561        return None;
562    }
563
564    let (is_le, body) = match bytes {
565        [0xFF, 0xFE, rest @ ..] => (true, rest),
566        [0xFE, 0xFF, rest @ ..] => (false, rest),
567        _ => return None,
568    };
569
570    if body.is_empty() || body.len() % 2 != 0 {
571        return None;
572    }
573
574    decode_utf16_units(body, is_le, true)
575}
576
577fn has_binary_control_chars(bytes: &[u8]) -> bool {
578    let control_count = bytes
579        .iter()
580        .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
581        .count();
582    control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
583}
584
585fn has_decodable_text(bytes: &[u8]) -> bool {
586    bytes.is_empty()
587        || is_utf8_text(bytes)
588        || decode_utf16_text(bytes).is_some()
589        || !has_binary_control_chars(bytes)
590}
591
592fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
593    if bytes.is_empty() || is_utf8_text(bytes) {
594        return true;
595    }
596    if let Some(decoded) = decode_utf16_text(bytes) {
597        return decoded
598            .chars()
599            .any(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'));
600    }
601
602    let printable_count = bytes
603        .iter()
604        .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
605        .count();
606    printable_count * 2 >= bytes.len()
607}
608
/// Report whether a media type denotes textual content: any `text/*` type, JSON or XML,
/// or a structured-syntax type ending in `+json` or `+xml`.
fn is_textual_media_type(media_type: &str) -> bool {
    const EXACT: [&str; 3] = ["application/json", "application/xml", "text/xml"];
    const SUFFIXES: [&str; 2] = ["+json", "+xml"];

    media_type.starts_with("text/")
        || EXACT.contains(&media_type)
        || SUFFIXES
            .iter()
            .any(|suffix| media_type.ends_with(*suffix))
}
618
619fn is_textual_format(detected_format: FileFormat) -> bool {
620    matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
621        || is_textual_media_type(detected_format.media_type())
622}
623
624fn is_known_binary_format(detected_format: FileFormat) -> bool {
625    !matches!(detected_format, FileFormat::ArbitraryBinaryData)
626        && !is_textual_format(detected_format)
627}
628
629pub fn detect_mime_type(
630    path: &Path,
631    bytes: &[u8],
632    detected_format: FileFormat,
633    programming_language: Option<&str>,
634) -> String {
635    if bytes.is_empty() {
636        return "inode/x-empty".to_string();
637    }
638
639    if lower_extension(path).as_deref() == Some("json") {
640        if let Some(is_binary) = json_binary_override(bytes) {
641            if is_binary {
642                return "application/octet-stream".to_string();
643            }
644            if has_valid_json_text(bytes) {
645                return "application/json".to_string();
646            }
647            return "text/plain".to_string();
648        }
649        if has_valid_json_text(bytes) {
650            return "application/json".to_string();
651        }
652        if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) {
653            return "text/plain".to_string();
654        }
655        return "application/octet-stream".to_string();
656    }
657
658    if is_zip_archive(bytes) {
659        return detect_zip_like_mime(path);
660    }
661
662    if looks_like_deb(bytes, path) {
663        return "application/vnd.debian.binary-package".to_string();
664    }
665
666    if looks_like_rpm(bytes, path) {
667        return "application/x-rpm".to_string();
668    }
669
670    let guessed_mime = from_path(path)
671        .first_or_octet_stream()
672        .essence_str()
673        .to_string();
674
675    let mime_type = match detected_format {
676        FileFormat::Empty => "inode/x-empty".to_string(),
677        FileFormat::PlainText => {
678            if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
679                "text/plain".to_string()
680            } else {
681                guessed_mime.clone()
682            }
683        }
684        _ => {
685            let detected_mime = detected_format.media_type();
686            if detected_mime == "application/octet-stream"
687                && guessed_mime != "application/octet-stream"
688            {
689                guessed_mime.clone()
690            } else {
691                detected_mime.to_string()
692            }
693        }
694    };
695
696    normalize_mime_type(path, bytes, programming_language, &mime_type)
697}
698
699fn normalize_mime_type(
700    path: &Path,
701    bytes: &[u8],
702    programming_language: Option<&str>,
703    mime_type: &str,
704) -> String {
705    if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
706        return "text/plain".to_string();
707    }
708
709    mime_type.to_string()
710}
711
712fn should_prefer_text_mime(
713    path: &Path,
714    bytes: &[u8],
715    programming_language: Option<&str>,
716    mime_type: &str,
717) -> bool {
718    has_decodable_text(bytes)
719        && looks_like_textual_bytes(bytes)
720        && is_textual_source_candidate(path, programming_language)
721        && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
722}
723
724fn has_valid_json_text(bytes: &[u8]) -> bool {
725    if bytes.len() > JSON_VALIDATION_MAX_BYTES {
726        return false;
727    }
728
729    serde_json::from_slice::<serde_json::Value>(bytes).is_ok()
730        || decode_utf16_json_text(bytes)
731            .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok())
732            .is_some()
733}
734
/// Recognise content shaped like a one-element JSON string array (`["…"]`): at least
/// eight bytes long, starting with `["`, ending with `"]`, and free of 0x00 and 0xFF
/// bytes.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    let has_wrapper = bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]");
    let has_forbidden_byte = bytes.iter().any(|byte| matches!(*byte, 0x00 | 0xFF));

    bytes.len() >= 8 && has_wrapper && !has_forbidden_byte
}
742
743fn json_binary_override(bytes: &[u8]) -> Option<bool> {
744    if has_valid_json_text(bytes) {
745        return Some(false);
746    }
747
748    if bytes.contains(&0) {
749        return Some(true);
750    }
751
752    if bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024) {
753        return Some(true);
754    }
755
756    if is_wrapped_invalid_json_string_text(bytes) {
757        return Some(false);
758    }
759
760    None
761}
762
763fn detect_is_binary(
764    path: &Path,
765    bytes: &[u8],
766    detected_format: FileFormat,
767    programming_language: Option<&str>,
768) -> bool {
769    if lower_extension(path).as_deref() == Some("json")
770        && let Some(is_binary) = json_binary_override(bytes)
771    {
772        return is_binary;
773    }
774
775    if is_textual_format(detected_format) {
776        return false;
777    }
778
779    if lower_extension(path)
780        .as_deref()
781        .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
782    {
783        return true;
784    }
785
786    if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
787        return false;
788    }
789
790    has_binary_control_chars(bytes)
791        || is_known_binary_format(detected_format)
792        || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
793            && !looks_like_textual_bytes(bytes))
794}
795
796fn should_treat_binary_bytes_as_text(
797    path: &Path,
798    bytes: &[u8],
799    programming_language: Option<&str>,
800) -> bool {
801    has_decodable_text(bytes)
802        && looks_like_textual_bytes(bytes)
803        && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
804}
805
806fn detect_is_archive(
807    path: &Path,
808    bytes: &[u8],
809    mime_type: &str,
810    is_text: bool,
811    detected_format: FileFormat,
812) -> bool {
813    if is_text {
814        return false;
815    }
816
817    lower_extension(path)
818        .as_deref()
819        .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
820        || matches!(
821            detected_format.kind(),
822            FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
823        )
824        || is_zip_archive(bytes)
825        || looks_like_gzip(bytes)
826        || looks_like_bzip2(bytes)
827        || looks_like_xz(bytes)
828        || looks_like_deb(bytes, path)
829        || looks_like_rpm(bytes, path)
830        || looks_like_squashfs(bytes, path)
831        || mime_type.contains("zip")
832        || mime_type.contains("compressed")
833        || mime_type.contains("tar")
834        || mime_type.contains("x-rpm")
835        || mime_type.contains("debian")
836}
837
838fn detect_is_media(
839    path: &Path,
840    bytes: &[u8],
841    mime_type: &str,
842    detected_format: FileFormat,
843) -> bool {
844    media_mime_from_content(bytes).is_some()
845        || matches!(
846            detected_format.kind(),
847            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
848        )
849        || mime_type.starts_with("image/")
850        || mime_type.starts_with("audio/")
851        || mime_type.starts_with("video/")
852        || (mime_type == "application/octet-stream"
853            && lower_extension(path).as_deref() == Some("tga")
854            && !has_binary_control_chars(bytes))
855}
856
857fn detect_is_script(
858    path: &Path,
859    bytes: &[u8],
860    programming_language: Option<&str>,
861    is_text: bool,
862) -> bool {
863    if !is_text || is_makefile(path) {
864        return false;
865    }
866
867    bytes.starts_with(b"#!")
868        || lower_extension(path).as_deref().is_some_and(|ext| {
869            matches!(
870                ext,
871                "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
872            )
873        })
874        || matches!(
875            programming_language,
876            Some(
877                "Shell"
878                    | "Bash"
879                    | "Zsh"
880                    | "Fish"
881                    | "Ksh"
882                    | "Python"
883                    | "Ruby"
884                    | "Perl"
885                    | "PHP"
886                    | "PowerShell"
887                    | "Awk"
888            )
889        )
890}
891
892fn detect_is_source(
893    path: &Path,
894    programming_language: Option<&str>,
895    is_text: bool,
896    is_script: bool,
897) -> bool {
898    if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
899        return false;
900    }
901
902    if is_c_like_source(path) || is_java_like_source(path) {
903        return true;
904    }
905
906    programming_language.is_some() || is_script
907}
908
909#[allow(clippy::too_many_arguments)]
910fn detect_file_type(
911    path: &Path,
912    bytes: &[u8],
913    detected_format: FileFormat,
914    mime_type: &str,
915    programming_language: Option<&str>,
916    is_binary: bool,
917    is_text: bool,
918    is_archive: bool,
919    is_media: bool,
920    is_script: bool,
921) -> String {
922    if bytes.is_empty() {
923        return "empty".to_string();
924    }
925
926    if looks_like_pdf(bytes) {
927        return "PDF document".to_string();
928    }
929
930    if let Some(file_type) = media_file_type_from_content(bytes) {
931        return file_type.to_string();
932    }
933
934    if is_archive {
935        return archive_file_type(path, bytes, detected_format);
936    }
937
938    if is_script {
939        return script_file_type(programming_language, bytes);
940    }
941
942    if is_text {
943        if lower_extension(path).as_deref() == Some("json") {
944            if has_valid_json_text(bytes) {
945                return "JSON text data".to_string();
946            }
947            return text_file_type(bytes);
948        }
949        if lower_extension(path).as_deref() == Some("xml") {
950            return "XML text data".to_string();
951        }
952        if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
953            return "YAML text data".to_string();
954        }
955        if lower_extension(path).as_deref() == Some("toml") {
956            return "TOML text data".to_string();
957        }
958        if matches!(
959            lower_extension(path).as_deref(),
960            Some("ini" | "cfg" | "conf")
961        ) {
962            return "INI text data".to_string();
963        }
964        if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
965            return "Git configuration text".to_string();
966        }
967        if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
968            return text_file_type(bytes);
969        }
970        if programming_language.is_some() && !is_media {
971            return source_file_type(programming_language, bytes);
972        }
973        return text_file_type(bytes);
974    }
975
976    if let Some(file_type) = format_based_file_type(detected_format) {
977        return file_type;
978    }
979
980    if is_binary && mime_type == "application/octet-stream" {
981        return "data".to_string();
982    }
983
984    mime_type.to_string()
985}
986
987fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
988    if matches!(programming_language, Some(language) if is_source_like_language(language)) {
989        return true;
990    }
991
992    if matches!(
993        lower_file_name(path).as_str(),
994        "dockerfile"
995            | "containerfile"
996            | "containerfile.core"
997            | "apkbuild"
998            | "podfile"
999            | "jamfile"
1000            | "jamroot"
1001            | "meson.build"
1002            | "build"
1003            | "workspace"
1004            | "buck"
1005            | "default.nix"
1006            | "flake.nix"
1007            | "shell.nix"
1008    ) {
1009        return true;
1010    }
1011
1012    path.extension()
1013        .and_then(|ext| ext.to_str())
1014        .is_some_and(|ext| {
1015            matches!(
1016                ext.to_ascii_lowercase().as_str(),
1017                "rs" | "py"
1018                    | "js"
1019                    | "mjs"
1020                    | "cjs"
1021                    | "jsx"
1022                    | "ts"
1023                    | "mts"
1024                    | "cts"
1025                    | "tsx"
1026                    | "c"
1027                    | "cpp"
1028                    | "cc"
1029                    | "cxx"
1030                    | "h"
1031                    | "hpp"
1032                    | "m"
1033                    | "mm"
1034                    | "s"
1035                    | "asm"
1036                    | "java"
1037                    | "go"
1038                    | "rb"
1039                    | "php"
1040                    | "pl"
1041                    | "swift"
1042                    | "sh"
1043                    | "bash"
1044                    | "zsh"
1045                    | "fish"
1046                    | "ksh"
1047                    | "ps1"
1048                    | "psm1"
1049                    | "psd1"
1050                    | "awk"
1051                    | "kt"
1052                    | "kts"
1053                    | "dart"
1054                    | "scala"
1055                    | "groovy"
1056                    | "gradle"
1057                    | "gvy"
1058                    | "gy"
1059                    | "gsh"
1060                    | "cs"
1061                    | "fs"
1062                    | "fsx"
1063                    | "r"
1064                    | "lua"
1065                    | "jl"
1066                    | "ex"
1067                    | "exs"
1068                    | "clj"
1069                    | "cljs"
1070                    | "cljc"
1071                    | "hs"
1072                    | "erl"
1073                    | "nix"
1074                    | "zig"
1075                    | "bzl"
1076                    | "bazel"
1077                    | "star"
1078                    | "sky"
1079                    | "ml"
1080                    | "mli"
1081                    | "tex"
1082            )
1083        })
1084}
1085
/// Reports whether `language` (matched case-sensitively) is one of the
/// language names this module treats as source code.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
1132
/// Returns the path's extension as UTF-8 text, or `None` when the path has
/// no extension or the extension is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    path.extension()?.to_str()
}
1136
1137fn lower_extension(path: &Path) -> Option<String> {
1138    extension(path).map(|ext| ext.to_ascii_lowercase())
1139}
1140
/// Returns the final path component in ASCII lowercase.
///
/// Yields an empty string when the path has no file name or the name is not
/// valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|raw| raw.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
1147
1148fn is_plain_text(path: &Path) -> bool {
1149    lower_extension(path)
1150        .as_deref()
1151        .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
1152}
1153
1154fn is_makefile(path: &Path) -> bool {
1155    matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
1156}
1157
/// Reports whether the path ends in `.js.map` or `.css.map`, ignoring ASCII
/// case. The whole (lossily decoded) path string is inspected.
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|&suffix| lowered.ends_with(suffix))
}
1162
1163fn is_c_like_source(path: &Path) -> bool {
1164    lower_extension(path).as_deref().is_some_and(|ext| {
1165        matches!(
1166            ext,
1167            "c" | "cc"
1168                | "cp"
1169                | "cpp"
1170                | "cxx"
1171                | "c++"
1172                | "h"
1173                | "hh"
1174                | "hpp"
1175                | "hxx"
1176                | "h++"
1177                | "i"
1178                | "ii"
1179                | "m"
1180                | "s"
1181                | "asm"
1182        )
1183    })
1184}
1185
1186fn is_java_like_source(path: &Path) -> bool {
1187    lower_extension(path)
1188        .as_deref()
1189        .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
1190}
1191
1192fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
1193    match detected_format {
1194        FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
1195        format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
1196        format => Some(match format.kind() {
1197            FileFormatKind::Image => short_name_or_name(&format, "image data"),
1198            FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
1199            FileFormatKind::Video => short_name_or_name(&format, "video data"),
1200            _ => format.name().to_string(),
1201        }),
1202    }
1203}
1204
1205fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
1206    format
1207        .short_name()
1208        .map(|short_name| format!("{short_name} {suffix}"))
1209        .unwrap_or_else(|| format!("{} {suffix}", format.name()))
1210}
1211
1212fn detect_zip_like_mime(path: &Path) -> String {
1213    match extension(path).map(|ext| ext.to_ascii_lowercase()) {
1214        Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
1215        Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
1216            "application/java-archive".to_string()
1217        }
1218        _ => "application/zip".to_string(),
1219    }
1220}
1221
/// Sniffs the MIME type of PNG, JPEG, TIFF and WebP images from their
/// leading signature bytes; returns `None` for anything else.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("image/png"),
        [0xff, 0xd8, 0xff, ..] => Some("image/jpeg"),
        // Little-endian ("II") and big-endian ("MM") TIFF headers.
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("image/tiff"),
        // RIFF container whose form type (bytes 8..12) is "WEBP".
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => Some("image/webp"),
        _ => None,
    }
}
1235
/// Sniffs a file-type label for PNG, JPEG, TIFF and WebP images from their
/// leading signature bytes; returns `None` for anything else.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PREFIX_LABELS: [(&[u8], &str); 4] = [
        (b"\x89PNG\r\n\x1a\n", "PNG image data"),
        (&[0xff, 0xd8, 0xff], "JPEG image data"),
        (b"II\x2a\x00", "TIFF image data"),
        (b"MM\x00\x2a", "TIFF image data"),
    ];

    if let Some(&(_, label)) = PREFIX_LABELS
        .iter()
        .find(|(magic, _)| bytes.starts_with(magic))
    {
        return Some(label);
    }

    // WebP is a RIFF container whose form type lives at bytes 8..12.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    if is_webp {
        Some("WebP image data")
    } else {
        None
    }
}
1249
/// Reports whether the content begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
1253
/// Reports whether a file should be handled as RTF: either the supplied
/// extension is exactly `rtf`, or the content starts with `{\rtf`.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if matches!(ext, Some("rtf")) {
        return true;
    }
    bytes.starts_with(b"{\\rtf")
}
1257
/// Converts RTF markup into approximate plain text.
///
/// The input is decoded lossily as UTF-8 and scanned character by character:
/// * group braces are dropped;
/// * `\\`, `\{` and `\}` emit the escaped character;
/// * `\'hh` emits the character whose code point equals the hex byte `hh`;
/// * control words (letters, optional signed numeric parameter, optional
///   single trailing space) are consumed; a handful are translated
///   (`\par`, `\line`, `\tab`, dashes, bullet, quotes) and `\uN` emits the
///   Unicode character `N` (negative values are offset by 65536) and then
///   skips one following fallback character;
/// * any other escaped character is discarded.
///
/// Finally carriage returns and form feeds become newlines and trailing
/// whitespace is trimmed from every line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    let chars: Vec<char> = String::from_utf8_lossy(bytes).chars().collect();
    let mut plain = String::new();
    let mut pos = 0usize;

    while let Some(&current) = chars.get(pos) {
        pos += 1;
        match current {
            // Group delimiters carry no text of their own.
            '{' | '}' => {}
            '\\' => {
                // A lone trailing backslash ends the scan.
                let Some(&escaped) = chars.get(pos) else {
                    break;
                };

                match escaped {
                    '\\' | '{' | '}' => {
                        plain.push(escaped);
                        pos += 1;
                    }
                    '\'' => {
                        // Two characters after the apostrophe form a hex byte.
                        let byte = chars
                            .get(pos + 1..pos + 3)
                            .map(|digits| digits.iter().collect::<String>())
                            .and_then(|hex| u8::from_str_radix(&hex, 16).ok());
                        match byte {
                            Some(byte) => {
                                plain.push(char::from(byte));
                                pos += 3;
                            }
                            // Not a valid escape: drop only the apostrophe.
                            None => pos += 1,
                        }
                    }
                    letter if letter.is_ascii_alphabetic() => {
                        let word_len = chars[pos..]
                            .iter()
                            .take_while(|ch| ch.is_ascii_alphabetic())
                            .count();
                        let control_word: String = chars[pos..pos + word_len].iter().collect();
                        pos += word_len;

                        // Optional parameter: a leading '-' or digit, then digits.
                        let has_parameter = chars
                            .get(pos)
                            .is_some_and(|&ch| ch == '-' || ch.is_ascii_digit());
                        let parameter_len = if has_parameter {
                            1 + chars[pos + 1..]
                                .iter()
                                .take_while(|ch| ch.is_ascii_digit())
                                .count()
                        } else {
                            0
                        };
                        let parameter: String = chars[pos..pos + parameter_len].iter().collect();
                        pos += parameter_len;

                        // A single space directly after a control word is its delimiter.
                        if chars.get(pos) == Some(&' ') {
                            pos += 1;
                        }

                        if control_word == "u" {
                            let decoded = parameter
                                .parse::<i32>()
                                .ok()
                                .map(|code| if code < 0 { code + 65_536 } else { code })
                                .and_then(|code| u32::try_from(code).ok())
                                .and_then(char::from_u32);
                            plain.extend(decoded);

                            // Skip the fallback character unless it starts new markup
                            // or is a line break.
                            let skip_fallback = chars
                                .get(pos)
                                .is_some_and(|&ch| !matches!(ch, '\\' | '{' | '}' | '\n' | '\r'));
                            if skip_fallback {
                                pos += 1;
                            }
                        } else {
                            let replacement = match control_word.as_str() {
                                "par" | "line" => Some('\n'),
                                "tab" => Some('\t'),
                                "emdash" => Some('—'),
                                "endash" => Some('–'),
                                "bullet" => Some('•'),
                                "lquote" | "rquote" => Some('\''),
                                "ldblquote" | "rdblquote" => Some('"'),
                                _ => None,
                            };
                            plain.extend(replacement);
                        }
                    }
                    // Any other control symbol is dropped.
                    _ => pos += 1,
                }
            }
            other => plain.push(other),
        }
    }

    let normalized = plain.replace(['\r', '\u{0c}'], "\n");
    let trimmed_lines: Vec<&str> = normalized.lines().map(str::trim_end).collect();
    trimmed_lines.join("\n")
}
1364
/// Reports whether the content begins with the two gzip magic bytes
/// `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1368
/// Reports whether the content begins with the bzip2 header `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1372
/// Reports whether the content begins with the six-byte XZ stream header
/// `fd '7' 'z' 'X' 'Z' 00`.
fn looks_like_xz(bytes: &[u8]) -> bool {
    matches!(bytes, [0xfd, b'7', b'z', b'X', b'Z', 0x00, ..])
}
1376
1377fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1378    lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1379}
1380
1381fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1382    lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1383}
1384
1385fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1386    lower_extension(path)
1387        .as_deref()
1388        .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1389        && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1390            || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1391}
1392
1393fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1394    if looks_like_deb(bytes, path) {
1395        "debian binary package (format 2.0)".to_string()
1396    } else if looks_like_rpm(bytes, path) {
1397        "RPM package".to_string()
1398    } else if looks_like_squashfs(bytes, path) {
1399        "Squashfs filesystem".to_string()
1400    } else if looks_like_gzip(bytes) {
1401        "gzip compressed data".to_string()
1402    } else if looks_like_bzip2(bytes) {
1403        "bzip2 compressed data".to_string()
1404    } else if looks_like_xz(bytes) {
1405        "XZ compressed data".to_string()
1406    } else if is_zip_archive(bytes) {
1407        "Zip archive data".to_string()
1408    } else if lower_extension(path).as_deref() == Some("gem") {
1409        "POSIX tar archive".to_string()
1410    } else if let Some(file_type) = format_based_file_type(detected_format) {
1411        file_type
1412    } else {
1413        "archive data".to_string()
1414    }
1415}
1416
1417fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1418    let suffix = text_executable_label(bytes);
1419
1420    match programming_language {
1421        Some("Python") => format!("python script, {suffix}"),
1422        Some("Ruby") => format!("ruby script, {suffix}"),
1423        Some("Perl") => format!("perl script, {suffix}"),
1424        Some("PHP") => format!("php script, {suffix}"),
1425        Some("Shell") => format!("shell script, {suffix}"),
1426        Some("Bash") => format!("bash script, {suffix}"),
1427        Some("Zsh") => format!("zsh script, {suffix}"),
1428        Some("Fish") => format!("fish script, {suffix}"),
1429        Some("Ksh") => format!("ksh script, {suffix}"),
1430        Some("JavaScript") => format!("javascript script, {suffix}"),
1431        Some("TypeScript") => format!("typescript script, {suffix}"),
1432        Some("PowerShell") => format!("powershell script, {suffix}"),
1433        Some("Awk") => format!("awk script, {suffix}"),
1434        _ => format!("script, {suffix}"),
1435    }
1436}
1437
1438fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1439    let suffix = text_label(bytes);
1440    match programming_language {
1441        Some("C") => format!("C source, {suffix}"),
1442        Some("C++") => format!("C++ source, {suffix}"),
1443        Some("Java") => format!("Java source, {suffix}"),
1444        Some("C#") => format!("C# source, {suffix}"),
1445        Some("F#") => format!("F# source, {suffix}"),
1446        Some("Go") => format!("Go source, {suffix}"),
1447        Some("Rust") => format!("Rust source, {suffix}"),
1448        Some("Starlark") => format!("Starlark source, {suffix}"),
1449        Some("CMake") => format!("CMake source, {suffix}"),
1450        Some("Meson") => format!("Meson source, {suffix}"),
1451        Some("Nix") => format!("Nix source, {suffix}"),
1452        Some("Groovy") => format!("Groovy source, {suffix}"),
1453        Some("Makefile") => format!("Makefile source, {suffix}"),
1454        Some("Dockerfile") => format!("Dockerfile source, {suffix}"),
1455        Some("Jamfile") => format!("Jamfile source, {suffix}"),
1456        Some("Batchfile") => format!("Batchfile source, {suffix}"),
1457        Some(language) => format!("{language} source, {suffix}"),
1458        None => text_file_type(bytes),
1459    }
1460}
1461
1462fn text_file_type(bytes: &[u8]) -> String {
1463    text_label(bytes).to_string()
1464}
1465
/// Describes text content by two properties: whether it is valid UTF-8 and
/// whether it contains at least one `\n` byte.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1479
/// Describes executable text content by two properties: whether it is valid
/// UTF-8 and whether it contains at least one `\n` byte.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1493
1494fn supported_image_metadata_format(
1495    ext: Option<&str>,
1496    detected_format: FileFormat,
1497) -> Option<ImageFormat> {
1498    match ext {
1499        Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1500        Some("png") => Some(ImageFormat::Png),
1501        Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1502        Some("webp") => Some(ImageFormat::WebP),
1503        _ => match detected_format.media_type() {
1504            "image/jpeg" => Some(ImageFormat::Jpeg),
1505            "image/png" => Some(ImageFormat::Png),
1506            "image/tiff" => Some(ImageFormat::Tiff),
1507            "image/webp" => Some(ImageFormat::WebP),
1508            _ => None,
1509        },
1510    }
1511}
1512
1513fn should_skip_binary_string_extraction(
1514    path: &Path,
1515    bytes: &[u8],
1516    detected_format: FileFormat,
1517) -> bool {
1518    matches!(lower_extension(path).as_deref(), Some("pdf"))
1519        || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1520            .is_some()
1521        || (matches!(
1522            detected_format.kind(),
1523            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1524        ) && !is_textual_format(detected_format))
1525        || media_mime_from_content(bytes).is_some()
1526        || is_zip_archive(bytes)
1527        || looks_like_gzip(bytes)
1528        || looks_like_bzip2(bytes)
1529        || looks_like_xz(bytes)
1530        || looks_like_deb(bytes, path)
1531        || looks_like_rpm(bytes, path)
1532        || looks_like_squashfs(bytes, path)
1533}
1534
1535fn should_skip_large_opaque_binary_text_extraction(
1536    _path: &Path,
1537    bytes: &[u8],
1538    detected_format: FileFormat,
1539) -> bool {
1540    is_large_opaque_binary_candidate(bytes, detected_format)
1541        && !sample_has_promising_printable_strings(bytes)
1542}
1543
1544fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1545    bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1546        && !is_textual_format(detected_format)
1547        && !matches!(
1548            detected_format.kind(),
1549            FileFormatKind::Archive
1550                | FileFormatKind::Compressed
1551                | FileFormatKind::Package
1552                | FileFormatKind::Audio
1553                | FileFormatKind::Image
1554                | FileFormatKind::Video
1555        )
1556}
1557
/// Returns up to three `(start, end)` byte ranges to sample from a buffer of
/// `len` bytes: the first 64 KiB, a 64 KiB window centred on the midpoint
/// (only when `len` exceeds 128 KiB) and the last 64 KiB (only when `len`
/// exceeds 64 KiB). Empty and duplicate ranges are omitted.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = Some((0, len.min(SAMPLE_WINDOW_BYTES)));
    let middle = (len > SAMPLE_WINDOW_BYTES * 2).then(|| {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        (start, (start + SAMPLE_WINDOW_BYTES).min(len))
    });
    let tail = (len > SAMPLE_WINDOW_BYTES).then(|| (len - SAMPLE_WINDOW_BYTES, len));

    let mut windows: Vec<(usize, usize)> = Vec::new();
    for window in head.into_iter().chain(middle).chain(tail) {
        let (start, end) = window;
        if start < end && !windows.contains(&window) {
            windows.push(window);
        }
    }
    windows
}
1580
1581fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1582    let mut structured_signal_seen = false;
1583    let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1584        .into_iter()
1585        .filter(|&(start, end)| {
1586            let window = &bytes[start..end];
1587            if has_strong_structured_text_signal(window) {
1588                structured_signal_seen = true;
1589            }
1590            has_license_or_notice_signal(window)
1591        })
1592        .count();
1593
1594    structured_signal_seen || promising_license_windows >= 2
1595}
1596
1597fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1598    let mut combined_lines = BTreeSet::new();
1599
1600    for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1601        let window_text = extract_printable_strings(&bytes[start..end]);
1602        for line in window_text
1603            .lines()
1604            .map(str::trim)
1605            .filter(|line| !line.is_empty())
1606        {
1607            combined_lines.insert(line.to_string());
1608        }
1609    }
1610
1611    combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1612}
1613
1614fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1615    let strings = extract_printable_strings(bytes);
1616    if strings.is_empty() {
1617        return false;
1618    }
1619
1620    let lower = strings.to_ascii_lowercase();
1621    [
1622        "copyright",
1623        "license",
1624        "licensed under",
1625        "all rights reserved",
1626        "permission is hereby granted",
1627        "redistribution and use",
1628        "spdx-license-identifier",
1629    ]
1630    .iter()
1631    .any(|marker| lower.contains(marker))
1632}
1633
1634fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1635    let strings = extract_printable_strings(bytes);
1636    if strings.is_empty() {
1637        return false;
1638    }
1639
1640    let email_markers = strings.matches('@').count();
1641    let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1642
1643    email_markers + url_markers >= 3
1644}
1645
1646fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1647    match format {
1648        ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1649        ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1650        ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1651        ImageFormat::WebP => {
1652            bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1653        }
1654        _ => false,
1655    }
1656}
1657
1658fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1659    let mut values = Vec::new();
1660    values.extend(extract_exif_metadata_values(bytes));
1661    values.extend(extract_xmp_metadata_values(bytes, format));
1662    values_to_text(values)
1663}
1664
1665fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1666    let mut cursor = BufReader::new(Cursor::new(bytes));
1667    let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1668        Ok(exif) => exif,
1669        Err(_) => return Vec::new(),
1670    };
1671
1672    let mut values = Vec::new();
1673    for field in exif.fields() {
1674        let rendered = match field.tag {
1675            exif::Tag::ImageDescription => Some(format_metadata_field(
1676                "Description",
1677                &field.display_value().with_unit(&exif).to_string(),
1678            )),
1679            exif::Tag::Copyright => Some(format_metadata_field(
1680                "Copyright",
1681                &field.display_value().with_unit(&exif).to_string(),
1682            )),
1683            exif::Tag::UserComment => Some(format_metadata_field(
1684                "Comment",
1685                &field.display_value().with_unit(&exif).to_string(),
1686            )),
1687            exif::Tag::Artist => Some(format_metadata_field(
1688                "Author",
1689                &field.display_value().with_unit(&exif).to_string(),
1690            )),
1691            _ => None,
1692        };
1693
1694        if let Some(rendered) = rendered {
1695            values.push(rendered);
1696        }
1697    }
1698
1699    values
1700}
1701
1702fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1703    let xmp = match extract_raw_xmp_packet(bytes, format) {
1704        Some(xmp) => xmp,
1705        None => return Vec::new(),
1706    };
1707
1708    parse_xmp_values(&xmp)
1709}
1710
1711fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1712    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1713    if let Ok(mut decoder) = reader.into_decoder()
1714        && let Ok(Some(xmp)) = decoder.xmp_metadata()
1715    {
1716        return (xmp.len() <= MAX_XMP_PACKET_BYTES).then_some(xmp);
1717    }
1718
1719    match format {
1720        ImageFormat::Png => extract_png_xmp_packet(bytes),
1721        _ => None,
1722    }
1723}
1724
1725fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1726    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1727
1728    if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1729        return None;
1730    }
1731
1732    let mut offset = PNG_SIGNATURE.len();
1733    while offset + 12 <= bytes.len() {
1734        let length = u32::from_be_bytes([
1735            bytes[offset],
1736            bytes[offset + 1],
1737            bytes[offset + 2],
1738            bytes[offset + 3],
1739        ]) as usize;
1740        let chunk_start = offset + 8;
1741        let chunk_end = chunk_start + length;
1742        if chunk_end + 4 > bytes.len() {
1743            return None;
1744        }
1745
1746        let chunk_type = &bytes[offset + 4..offset + 8];
1747        if chunk_type == b"iTXt" {
1748            let data = &bytes[chunk_start..chunk_end];
1749            if let Some(xmp) = parse_png_itxt_xmp(data) {
1750                return Some(xmp);
1751            }
1752        }
1753
1754        offset = chunk_end + 4;
1755    }
1756
1757    None
1758}
1759
1760fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1761    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1762
1763    let keyword_end = data.iter().position(|&b| b == 0)?;
1764    if &data[..keyword_end] != XMP_KEYWORD {
1765        return None;
1766    }
1767
1768    let mut cursor = keyword_end + 1;
1769    let compression_flag = *data.get(cursor)?;
1770    cursor += 1;
1771    let compression_method = *data.get(cursor)?;
1772    cursor += 1;
1773    if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1774        return None;
1775    }
1776
1777    let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1778    cursor = language_end + 1;
1779
1780    let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1781    cursor = translated_end + 1;
1782
1783    let text_bytes = &data[cursor..];
1784    if compression_flag == 1 {
1785        let decoder = ZlibDecoder::new(text_bytes);
1786        let mut decoded = Vec::new();
1787        decoder
1788            .take((MAX_XMP_PACKET_BYTES + 1) as u64)
1789            .read_to_end(&mut decoded)
1790            .ok()?;
1791        (decoded.len() <= MAX_XMP_PACKET_BYTES).then_some(decoded)
1792    } else {
1793        (text_bytes.len() <= MAX_XMP_PACKET_BYTES).then(|| text_bytes.to_vec())
1794    }
1795}
1796
1797fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1798    let mut reader = XmlReader::from_reader(xmp);
1799    reader.config_mut().trim_text(true);
1800
1801    let mut buf = Vec::new();
1802    let mut stack: Vec<String> = Vec::new();
1803    let mut values = Vec::new();
1804
1805    loop {
1806        match reader.read_event_into(&mut buf) {
1807            Ok(Event::Start(e)) => {
1808                stack.push(local_xml_name(e.name().as_ref()));
1809            }
1810            Ok(Event::End(_)) => {
1811                stack.pop();
1812            }
1813            Ok(Event::Empty(_)) => {}
1814            Ok(Event::Text(text)) => {
1815                if let Some(field) = stack
1816                    .iter()
1817                    .rev()
1818                    .find_map(|name| allowed_xmp_field(name.as_str()))
1819                    && let Ok(decoded) = text.decode()
1820                {
1821                    let decoded = decoded.into_owned();
1822                    if !decoded.trim().is_empty() {
1823                        values.push(format_xmp_value(field, &decoded));
1824                    }
1825                }
1826            }
1827            Ok(Event::CData(text)) => {
1828                if let Some(field) = stack
1829                    .iter()
1830                    .rev()
1831                    .find_map(|name| allowed_xmp_field(name.as_str()))
1832                    && let Ok(decoded) = text.decode()
1833                {
1834                    let decoded = decoded.into_owned();
1835                    if !decoded.trim().is_empty() {
1836                        values.push(format_xmp_value(field, &decoded));
1837                    }
1838                }
1839            }
1840            Ok(Event::Eof) | Err(_) => break,
1841            _ => {}
1842        }
1843        buf.clear();
1844    }
1845
1846    values
1847}
1848
/// Returns the local part of a possibly namespace-prefixed XML name.
///
/// Everything up to and including the last `:` is dropped, so `dc:creator`
/// becomes `creator`. A name without a colon is returned unchanged, and a name
/// that is not valid UTF-8 yields an empty string.
fn local_xml_name(name: &[u8]) -> String {
    let Ok(qualified) = std::str::from_utf8(name) else {
        return String::new();
    };

    match qualified.rsplit_once(':') {
        Some((_prefix, local)) => local.to_owned(),
        None => qualified.to_owned(),
    }
}
1853
/// Maps an XMP element local name to the internal key of an allow-listed field.
///
/// Matching is exact and case-sensitive. Returns `None` for every element that
/// is not part of the allow-list.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    // (element local name, internal field key)
    const ALLOWED_FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    ALLOWED_FIELDS
        .iter()
        .find(|(local_name, _)| *local_name == name)
        .map(|&(_, key)| key)
}
1866
1867fn format_xmp_value(field: &str, value: &str) -> String {
1868    match field {
1869        "creator" => format_metadata_field("Author", value),
1870        "rights" => format_metadata_field("Copyright", value),
1871        "description" => format_metadata_field("Description", value),
1872        "title" => format_metadata_field("Title", value),
1873        "subject" => format_metadata_field("Subject", value),
1874        "usage_terms" => format_metadata_field("UsageTerms", value),
1875        "web_statement" => format_metadata_field("WebStatement", value),
1876        _ => value.to_string(),
1877    }
1878}
1879
/// Builds a single `label: value` metadata line.
fn format_metadata_field(label: &str, value: &str) -> String {
    const SEPARATOR: &str = ": ";

    let mut line = String::with_capacity(label.len() + SEPARATOR.len() + value.len());
    line.push_str(label);
    line.push_str(SEPARATOR);
    line.push_str(value);
    line
}
1883
1884fn values_to_text(values: Vec<String>) -> String {
1885    let mut seen = BTreeSet::new();
1886    let mut normalized_lines = Vec::new();
1887
1888    for value in values {
1889        let normalized = normalize_metadata_value(&value);
1890        if normalized.is_empty() || !seen.insert(normalized.clone()) {
1891            continue;
1892        }
1893
1894        normalized_lines.push(normalized);
1895    }
1896
1897    let author_values: BTreeSet<String> = normalized_lines
1898        .iter()
1899        .filter_map(|line| split_metadata_field(line))
1900        .filter(|(label, _)| label.eq_ignore_ascii_case("Author"))
1901        .map(|(_, value)| value.to_string())
1902        .collect();
1903
1904    let mut lines = Vec::new();
1905    let mut total_bytes = 0usize;
1906
1907    for normalized in normalized_lines {
1908        if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1909            break;
1910        }
1911
1912        if should_suppress_bare_copyright_metadata_line(&normalized, &author_values) {
1913            continue;
1914        }
1915
1916        let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1917        if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1918            break;
1919        }
1920
1921        total_bytes += added_bytes;
1922        lines.push(normalized);
1923    }
1924
1925    lines.join("\n")
1926}
1927
/// Splits a `label: value` line at its first colon.
///
/// Both halves are returned with surrounding whitespace trimmed. Returns
/// `None` when the line contains no colon at all.
fn split_metadata_field(line: &str) -> Option<(&str, &str)> {
    line.split_once(':')
        .map(|(label, value)| (label.trim(), value.trim()))
}
1932
1933fn should_suppress_bare_copyright_metadata_line(
1934    line: &str,
1935    author_values: &BTreeSet<String>,
1936) -> bool {
1937    let Some((label, value)) = split_metadata_field(line) else {
1938        return false;
1939    };
1940    if !label.eq_ignore_ascii_case("Copyright")
1941        || value.is_empty()
1942        || !author_values.contains(value)
1943    {
1944        return false;
1945    }
1946
1947    let lower = value.to_ascii_lowercase();
1948    !lower.contains("copyright")
1949        && !lower.contains("(c)")
1950        && !lower.contains('©')
1951        && !lower.contains("all rights")
1952        && !value.chars().any(|ch| ch.is_ascii_digit())
1953}
1954
/// Normalizes a metadata value for comparison and output.
///
/// NUL characters are removed first, then every run of whitespace is collapsed
/// into a single space and leading/trailing whitespace is dropped.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");

    let mut collapsed = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
1966
1967fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1968    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1969        return (String::new(), None);
1970    }
1971
1972    if bytes.len() > MAX_PDF_TEXT_EXTRACTION_BYTES {
1973        return (
1974            String::new(),
1975            Some(format!(
1976                "PDF text extraction skipped because file exceeds {} bytes",
1977                MAX_PDF_TEXT_EXTRACTION_BYTES
1978            )),
1979        );
1980    }
1981
1982    let mut failures = Vec::new();
1983    let mut saw_success = false;
1984
1985    let extracted = catch_unwind(AssertUnwindSafe(
1986        || -> Result<String, Box<dyn std::error::Error>> {
1987            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1988            extract_first_pdf_page_text(&mut document)
1989        },
1990    ));
1991    match extracted {
1992        Ok(Ok(text)) => {
1993            saw_success = true;
1994            if let Some(normalized) = normalize_pdf_text(text) {
1995                return (normalized, None);
1996            }
1997        }
1998        Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1999        Err(payload) => failures.push(format!(
2000            "from-bytes first-page panic: {}",
2001            panic_payload_to_string(payload.as_ref())
2002        )),
2003    }
2004
2005    let extracted = catch_unwind(AssertUnwindSafe(
2006        || -> Result<String, Box<dyn std::error::Error>> {
2007            let mut document = pdf_oxide::document::PdfDocument::open(path)?;
2008            extract_pdf_text_from_document(&mut document)
2009        },
2010    ));
2011    match extracted {
2012        Ok(Ok(text)) => {
2013            saw_success = true;
2014            if let Some(normalized) = normalize_pdf_text(text) {
2015                return (normalized, None);
2016            }
2017        }
2018        Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
2019        Err(payload) => failures.push(format!(
2020            "open full-document panic: {}",
2021            panic_payload_to_string(payload.as_ref())
2022        )),
2023    }
2024
2025    let extracted = catch_unwind(AssertUnwindSafe(
2026        || -> Result<String, Box<dyn std::error::Error>> {
2027            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
2028            extract_pdf_text_from_document(&mut document)
2029        },
2030    ));
2031    match extracted {
2032        Ok(Ok(text)) => {
2033            saw_success = true;
2034            if let Some(normalized) = normalize_pdf_text(text) {
2035                return (normalized, None);
2036            }
2037        }
2038        Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
2039        Err(payload) => failures.push(format!(
2040            "from-bytes full-document panic: {}",
2041            panic_payload_to_string(payload.as_ref())
2042        )),
2043    }
2044
2045    if saw_success || is_non_actionable_pdf_failure(&failures) {
2046        (String::new(), None)
2047    } else {
2048        (
2049            String::new(),
2050            Some(format!(
2051                "PDF text extraction failed after {} attempts: {}",
2052                failures.len(),
2053                failures.join("; ")
2054            )),
2055        )
2056    }
2057}
2058
/// Reports whether every recorded PDF failure is one that is not surfaced as
/// a scan error.
///
/// A failure counts as non-actionable when its message contains one of the
/// known markers for password-protected or encrypted documents, or for an
/// invalid cross-reference table. An empty failure list returns `false`.
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(*marker))
    })
}
2069
/// Renders a panic payload as text.
///
/// Payloads of type `&str` and `String` are returned as their message; any
/// other payload type yields the fixed text `unknown panic payload`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
2079
2080fn extract_first_pdf_page_text(
2081    document: &mut pdf_oxide::document::PdfDocument,
2082) -> Result<String, Box<dyn std::error::Error>> {
2083    if document.page_count()? == 0 {
2084        return Ok(String::new());
2085    }
2086
2087    let extracted_text = document.extract_text(0)?;
2088    let markdown_text =
2089        document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
2090    if pdf_markdown_heading_lines(&markdown_text).is_empty() {
2091        return Ok(extracted_text);
2092    }
2093
2094    let pipeline_text =
2095        document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
2096
2097    Ok(merge_pdf_first_page_text(
2098        &extracted_text,
2099        &markdown_text,
2100        &pipeline_text,
2101    ))
2102}
2103
2104fn extract_pdf_text_from_document(
2105    document: &mut pdf_oxide::document::PdfDocument,
2106) -> Result<String, Box<dyn std::error::Error>> {
2107    Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
2108}
2109
/// Normalizes line breaks in extracted PDF text.
///
/// Carriage returns and form feeds are each replaced by a newline. Returns
/// `None` when the result contains nothing but whitespace.
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized: String = text
        .chars()
        .map(|ch| match ch {
            '\r' | '\u{0c}' => '\n',
            other => other,
        })
        .collect();

    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
2114
2115fn merge_pdf_first_page_text(
2116    _extracted_text: &str,
2117    markdown_text: &str,
2118    pipeline_text: &str,
2119) -> String {
2120    let pipeline = pipeline_text.trim();
2121    if pipeline.is_empty() {
2122        return String::new();
2123    }
2124
2125    let prefix = pdf_first_page_heading_prefix(markdown_text);
2126    let Some(prefix) = prefix else {
2127        return pipeline_text.to_string();
2128    };
2129
2130    if pdf_text_contains_heading_prefix(pipeline, &prefix) {
2131        pipeline_text.to_string()
2132    } else {
2133        format!("{prefix}\n\n{pipeline}")
2134    }
2135}
2136
2137fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
2138    normalize_pdf_heading_comparison_text(text)
2139        .contains(&normalize_pdf_heading_comparison_text(prefix))
2140}
2141
/// Produces a comparison key for heading matching.
///
/// The text is split on whitespace, each word is ASCII-lower-cased, and the
/// words are re-joined with single spaces.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut key = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !key.is_empty() {
            key.push(' ');
        }
        key.push_str(&word.to_ascii_lowercase());
    }
    key
}
2148
2149fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
2150    let mut lines = Vec::new();
2151
2152    for line in pdf_markdown_heading_lines(markdown_text) {
2153        push_unique_line(&mut lines, line);
2154    }
2155
2156    (!lines.is_empty()).then(|| lines.join("\n"))
2157}
2158
2159fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
2160    text.lines()
2161        .map(str::trim)
2162        .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
2163        .map(|line| line.trim_matches('#').trim())
2164        .filter(|line| !line.is_empty())
2165        .filter(|line| !looks_like_numbered_section_heading(line))
2166        .take(4)
2167        .map(ToOwned::to_owned)
2168        .collect()
2169}
2170
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
2176
/// Reports whether a heading starts like a numbered section (`1.`, `10.`,
/// `2.3 ...`).
///
/// The heading must begin with one or more ASCII digits immediately followed
/// by a `.`. Section numbers of any length are recognised, so `10. Termination`
/// is treated the same way as `1. Definitions`. Headings that start with
/// digits but have no dot after them (e.g. `2024 Annual Report`) are not
/// considered numbered sections.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // At least one digit must have been consumed before the dot.
    let consumed_digits = after_digits.len() < line.len();
    consumed_digits && after_digits.starts_with('.')
}
2189
/// Reports whether `bytes` starts with one of three ZIP signatures:
/// `PK\x03\x04`, `PK\x05\x06` or `PK\x07\x08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
2195
/// Extracts printable strings from arbitrary binary data.
///
/// Two kinds of strings are collected, each at least four characters long and
/// made up of printable ASCII (`0x20..=0x7E`):
/// - plain single-byte runs;
/// - "wide" runs in which every character is paired with a zero byte
///   (UTF-16-style text restricted to ASCII), in either byte order and at
///   either byte alignment.
///
/// Strings are joined with newlines, single-byte strings first. The output is
/// capped at the input length clamped to the range 2,000,000..=16,000,000
/// bytes. The cap is checked each time a run ends, so the result may exceed it
/// by the length of the last string appended.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;
    // (start offset, character byte comes first?) for each wide-string pass.
    // The first two entries are the passes this function has always run; the
    // last two cover text whose zero bytes fall on even offsets, which the
    // first two can never match. Keeping the old passes first means earlier
    // output stays a prefix of the new output.
    const WIDE_PASSES: [(usize, bool); 4] = [(0, true), (1, false), (1, true), (0, false)];

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    /// Appends `run` to `out` as its own line when it is long enough, then
    /// resets `run` for the next string.
    fn flush_run(out: &mut String, run: &mut Vec<u8>) {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            out.push_str(&String::from_utf8_lossy(run));
        }
        run.clear();
    }

    let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    // Pass 1: single-byte printable runs.
    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= max_output_bytes {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= max_output_bytes {
        return out;
    }

    // Wide passes: walk the data two bytes at a time from each start offset.
    for (start, char_first) in WIDE_PASSES {
        run.clear();
        let Some(aligned) = bytes.get(start..) else {
            // Input is shorter than the start offset; nothing to scan.
            continue;
        };

        for pair in aligned.chunks_exact(2) {
            let (ch, zero) = if char_first {
                (pair[0], pair[1])
            } else {
                (pair[1], pair[0])
            };

            if is_printable_ascii(ch) && zero == 0 {
                run.push(ch);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= max_output_bytes {
                    return out;
                }
            }
        }

        flush_run(&mut out, &mut run);
        if out.len() >= max_output_bytes {
            return out;
        }
    }

    out
}
2260
2261#[cfg(test)]
2262mod tests {
2263    use image::ImageFormat;
2264    use std::path::Path;
2265
2266    use crate::copyright::detect_copyrights;
2267
2268    use super::{
2269        ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, MAX_PDF_TEXT_EXTRACTION_BYTES,
2270        MAX_XMP_PACKET_BYTES, classify_file_info, extract_printable_strings,
2271        extract_raw_xmp_packet, extract_text_for_detection,
2272        extract_text_for_detection_with_diagnostics, format_metadata_field, format_xmp_value,
2273        is_non_actionable_pdf_failure, normalize_mime_type, normalize_pdf_heading_comparison_text,
2274        values_to_text, windows_metadata_or_empty_result,
2275    };
2276
2277    fn png_chunk(chunk_type: &[u8; 4], data: &[u8]) -> Vec<u8> {
2278        let mut out = Vec::new();
2279        out.extend_from_slice(&(data.len() as u32).to_be_bytes());
2280        out.extend_from_slice(chunk_type);
2281        out.extend_from_slice(data);
2282        out.extend_from_slice(&0u32.to_be_bytes());
2283        out
2284    }
2285
2286    fn build_png_with_xmp(xmp: &str) -> Vec<u8> {
2287        let mut bytes = Vec::new();
2288        bytes.extend_from_slice(b"\x89PNG\r\n\x1a\n");
2289
2290        let ihdr = [
2291            0, 0, 0, 1, // width
2292            0, 0, 0, 1, // height
2293            8, // bit depth
2294            2, // color type
2295            0, // compression
2296            0, // filter
2297            0, // interlace
2298        ];
2299        bytes.extend_from_slice(&png_chunk(b"IHDR", &ihdr));
2300
2301        let mut itxt = Vec::new();
2302        itxt.extend_from_slice(b"XML:com.adobe.xmp");
2303        itxt.push(0); // keyword terminator
2304        itxt.push(0); // compression flag
2305        itxt.push(0); // compression method
2306        itxt.push(0); // language tag terminator
2307        itxt.push(0); // translated keyword terminator
2308        itxt.extend_from_slice(xmp.as_bytes());
2309        bytes.extend_from_slice(&png_chunk(b"iTXt", &itxt));
2310
2311        bytes.extend_from_slice(&png_chunk(b"IEND", &[]));
2312        bytes
2313    }
2314
2315    #[test]
2316    fn test_extract_text_for_detection_skips_jar_archives() {
2317        let path = Path::new(
2318            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
2319        );
2320        let bytes = std::fs::read(path).expect("failed to read jar fixture");
2321
2322        let (text, kind) = extract_text_for_detection(path, &bytes);
2323
2324        assert!(text.is_empty());
2325        assert_eq!(kind, ExtractedTextKind::None);
2326    }
2327
2328    #[test]
2329    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
2330        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2331        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2332
2333        let (text, kind) = extract_text_for_detection(path, &bytes);
2334
2335        assert_eq!(kind, ExtractedTextKind::Pdf);
2336        assert!(text.contains("Redistribution and use in source and binary forms"));
2337    }
2338
2339    #[test]
2340    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
2341        let path =
2342            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2343        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2344
2345        let (text, kind) = extract_text_for_detection(path, &bytes);
2346
2347        assert_eq!(kind, ExtractedTextKind::Pdf);
2348        assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
2349        assert!(!text.contains("DISCLAIMER OF WARRANTY"));
2350    }
2351
2352    #[test]
2353    fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
2354        let path =
2355            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2356        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2357
2358        let (text, kind) = extract_text_for_detection(path, &bytes);
2359
2360        assert_eq!(kind, ExtractedTextKind::Pdf);
2361
2362        let normalized = normalize_pdf_heading_comparison_text(&text);
2363        let heading =
2364            normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
2365        assert_eq!(normalized.matches(&heading).count(), 1);
2366    }
2367
2368    #[test]
2369    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
2370        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2371        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2372
2373        let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
2374
2375        assert_eq!(kind, ExtractedTextKind::Pdf);
2376        assert!(text.contains("Redistribution and use in source and binary forms"));
2377    }
2378
2379    #[test]
2380    fn test_extract_text_for_detection_skips_oversized_pdf_payload() {
2381        let mut bytes = b"%PDF-1.7\n".to_vec();
2382        bytes.resize(MAX_PDF_TEXT_EXTRACTION_BYTES + 1, b'0');
2383
2384        let (text, kind, scan_error) =
2385            extract_text_for_detection_with_diagnostics(Path::new("oversized.pdf"), &bytes);
2386
2387        assert!(text.is_empty());
2388        assert_eq!(kind, ExtractedTextKind::None);
2389        assert!(
2390            scan_error
2391                .as_deref()
2392                .is_some_and(|message| message.contains("PDF text extraction skipped"))
2393        );
2394    }
2395
2396    #[test]
2397    fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
2398        let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";
2399
2400        let (text, kind, scan_error) =
2401            extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);
2402
2403        assert!(text.is_empty());
2404        assert_eq!(kind, ExtractedTextKind::None);
2405        let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
2406        assert!(scan_error.contains("PDF text extraction failed after"));
2407    }
2408
2409    #[test]
2410    fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
2411        let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2412
2413        let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
2414
2415        assert!(text.is_empty());
2416        assert_eq!(kind, ExtractedTextKind::None);
2417    }
2418
2419    #[test]
2420    fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
2421        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2422        let text = b"Copyright 2026 Example Project!!!";
2423        bytes[..text.len()].copy_from_slice(text);
2424        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2425        bytes[second_offset..second_offset + text.len()].copy_from_slice(text);
2426
2427        let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);
2428
2429        assert_ne!(kind, ExtractedTextKind::None);
2430        assert!(text.contains("Copyright 2026 Example Project"));
2431    }
2432
2433    #[test]
2434    fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
2435        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2436        let noise = b"(c) $1234567890ABCDEF[]{}--==++";
2437        bytes[..noise.len()].copy_from_slice(noise);
2438        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2439        bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);
2440
2441        let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);
2442
2443        assert!(text.is_empty());
2444        assert_eq!(kind, ExtractedTextKind::None);
2445    }
2446
2447    #[test]
2448    fn test_extract_text_for_detection_uses_windows_executable_metadata() {
2449        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2450        let bytes = std::fs::read(path).expect("read PE fixture");
2451
2452        let (text, kind) = extract_text_for_detection(path, &bytes);
2453
2454        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2455        assert!(text.contains("License: This program is free software"));
2456        assert!(text.contains("LegalCopyright:"));
2457    }
2458
2459    #[test]
2460    fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
2461    {
2462        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2463        let mut bytes = std::fs::read(path).expect("read PE fixture");
2464        bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);
2465
2466        let (text, kind) = extract_text_for_detection(path, &bytes);
2467
2468        assert_ne!(kind, ExtractedTextKind::None);
2469        assert!(!text.trim().is_empty());
2470    }
2471
2472    #[test]
2473    fn test_windows_metadata_or_empty_result_preserves_metadata() {
2474        let (text, kind, scan_error) =
2475            windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));
2476
2477        assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
2478        assert_eq!(text, "LegalCopyright: Example Corp");
2479        assert!(scan_error.is_none());
2480    }
2481
2482    #[test]
2483    fn test_format_xmp_value_labels_creator_and_title_fields() {
2484        assert_eq!(
2485            format_xmp_value("creator", "Chinmay Garde"),
2486            "Author: Chinmay Garde"
2487        );
2488        assert_eq!(
2489            format_xmp_value("title", "Bay Bridge At Night"),
2490            "Title: Bay Bridge At Night"
2491        );
2492        assert_eq!(
2493            format_xmp_value("description", "Embarcadero in the evening on Delta 3200"),
2494            "Description: Embarcadero in the evening on Delta 3200"
2495        );
2496    }
2497
2498    #[test]
2499    fn test_format_metadata_field_prefixes_exif_text() {
2500        assert_eq!(
2501            format_metadata_field("Author", "Chinmay Garde"),
2502            "Author: Chinmay Garde"
2503        );
2504        assert_eq!(
2505            format_metadata_field("Description", "Bay Bridge At Night"),
2506            "Description: Bay Bridge At Night"
2507        );
2508    }
2509
2510    #[test]
2511    fn test_extract_text_for_detection_keeps_image_author_separate_from_title_and_description() {
2512        let xmp = r#"<x:xmpmeta xmlns:x="adobe:ns:meta/"><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:creator>Chinmay Garde</dc:creator><dc:title>Bay Bridge At Night</dc:title><dc:description>Embarcadero in the evening on Delta 3200</dc:description></rdf:Description></rdf:RDF></x:xmpmeta>"#;
2513        let bytes = build_png_with_xmp(xmp);
2514
2515        let (text, kind) = extract_text_for_detection(Path::new("fixture.png"), &bytes);
2516
2517        assert_eq!(kind, ExtractedTextKind::ImageMetadata);
2518        assert!(text.contains("Author: Chinmay Garde"), "text: {text:?}");
2519        assert!(
2520            text.contains("Title: Bay Bridge At Night"),
2521            "text: {text:?}"
2522        );
2523        assert!(
2524            text.contains("Description: Embarcadero in the evening on Delta 3200"),
2525            "text: {text:?}"
2526        );
2527
2528        let (_copyrights, _holders, authors) = detect_copyrights(&text, None);
2529        assert_eq!(
2530            authors
2531                .iter()
2532                .map(|a| a.author.as_str())
2533                .collect::<Vec<_>>(),
2534            vec!["Chinmay Garde"],
2535            "authors: {authors:?}; text: {text:?}"
2536        );
2537    }
2538
2539    #[test]
2540    fn test_values_to_text_suppresses_bare_copyright_duplicate_of_author() {
2541        let text = values_to_text(vec![
2542            "Author: Chinmay Garde".to_string(),
2543            "Copyright: Chinmay Garde".to_string(),
2544            "Title: Bay Bridge At Night".to_string(),
2545        ]);
2546
2547        assert!(text.contains("Author: Chinmay Garde"), "text: {text:?}");
2548        assert!(
2549            text.contains("Title: Bay Bridge At Night"),
2550            "text: {text:?}"
2551        );
2552        assert!(!text.contains("Copyright: Chinmay Garde"), "text: {text:?}");
2553    }
2554
2555    #[test]
2556    fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
2557        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2558        let text = b"Copyright 2026 Example Project!!!";
2559        bytes[..text.len()].copy_from_slice(text);
2560
2561        let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);
2562
2563        assert!(text.is_empty());
2564        assert_eq!(kind, ExtractedTextKind::None);
2565    }
2566
2567    #[test]
2568    fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
2569        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2570        let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
2571        bytes[..text.len()].copy_from_slice(text);
2572
2573        let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);
2574
2575        assert_ne!(kind, ExtractedTextKind::None);
2576        assert!(text.contains("asn@redhat.com"));
2577        assert!(text.contains("https://publicsuffix.org/"));
2578    }
2579
2580    #[test]
2581    fn test_extract_text_for_detection_avoids_latin1_decode_for_binary_blob_noise() {
2582        let bytes = vec![
2583            0x28, 0x63, 0x29, 0x20, 0x4b, 0x30, 0x0e, 0x71, 0x86, 0x20, 0x62, 0x24, 0x4c,
2584        ];
2585
2586        let (text, kind) = extract_text_for_detection(Path::new("fixture.blb"), &bytes);
2587
2588        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2589        assert_eq!(text, "(c) K0\n b$L");
2590    }
2591
2592    #[test]
2593    fn test_extract_raw_xmp_packet_rejects_oversized_png_itxt_payload() {
2594        let xmp = "A".repeat(MAX_XMP_PACKET_BYTES + 1);
2595        let bytes = build_png_with_xmp(&xmp);
2596
2597        assert!(extract_raw_xmp_packet(&bytes, ImageFormat::Png).is_none());
2598    }
2599
2600    #[test]
2601    fn test_non_actionable_pdf_failures_are_suppressed() {
2602        assert!(is_non_actionable_pdf_failure(&[
2603            "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
2604            "open full-document: PDF is encrypted and requires a password".to_string(),
2605        ]));
2606        assert!(is_non_actionable_pdf_failure(&[
2607            "from-bytes first-page: Invalid cross-reference table".to_string(),
2608            "open full-document: Invalid cross-reference table".to_string(),
2609        ]));
2610        assert!(is_non_actionable_pdf_failure(&[
2611            "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O".to_string(),
2612            "open full-document: Invalid PDF: security handler cannot be found".to_string(),
2613        ]));
2614        assert!(!is_non_actionable_pdf_failure(&[
2615            "from-bytes first-page: some other parser failure".to_string(),
2616        ]));
2617    }
2618
2619    #[test]
2620    fn test_extract_text_for_detection_skips_zip_like_archives() {
2621        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";
2622
2623        let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
2624        let (crate_text, crate_kind) =
2625            extract_text_for_detection(Path::new("demo.crate"), zip_bytes);
2626
2627        assert!(whl_text.is_empty());
2628        assert_eq!(whl_kind, ExtractedTextKind::None);
2629        assert!(crate_text.is_empty());
2630        assert_eq!(crate_kind, ExtractedTextKind::None);
2631    }
2632
2633    #[test]
2634    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
2635        let path =
2636            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
2637        let bytes = std::fs::read(path).expect("failed to read lib fixture");
2638
2639        let (text, kind) = extract_text_for_detection(path, &bytes);
2640
2641        assert_ne!(kind, ExtractedTextKind::None);
2642        assert!(text.contains("Copyright nexB and others (c) 2012"));
2643    }
2644
2645    #[test]
2646    fn test_extract_text_for_detection_reads_font_metadata() {
2647        let path = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
2648        let bytes = std::fs::read(path).expect("failed to read font fixture");
2649
2650        let (text, kind) = extract_text_for_detection(path, &bytes);
2651
2652        assert_eq!(kind, ExtractedTextKind::FontMetadata);
2653        assert!(text.contains("License Description:"), "{text}");
2654        assert!(
2655            text.contains("Open Font License") || text.contains("OFL"),
2656            "{text}"
2657        );
2658        assert!(text.contains("Lato"), "{text}");
2659    }
2660
2661    #[test]
2662    fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
2663        let bytes = b"abcd\0".repeat(525_000);
2664
2665        let text = extract_printable_strings(&bytes);
2666
2667        assert!(
2668            text.len() > 2_000_000,
2669            "unexpected truncation at {}",
2670            text.len()
2671        );
2672        assert!(text.ends_with("abcd"));
2673    }
2674
2675    #[test]
2676    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
2677        let path = Path::new(
2678            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
2679        );
2680        let bytes = std::fs::read(path).expect("failed to read svg fixture");
2681
2682        let (text, kind) = extract_text_for_detection(path, &bytes);
2683
2684        assert_eq!(kind, ExtractedTextKind::Decoded);
2685        assert!(text.contains("creativecommons.org/licenses/publicdomain"));
2686    }
2687
2688    #[test]
2689    fn test_extract_text_for_detection_preserves_blank_comment_lines_in_utf8_source() {
2690        let path = Path::new("testdata/plugin_email_url/files/IMarkerActionFilter.java");
2691        let bytes = std::fs::read(path).expect("failed to read java fixture");
2692
2693        let (text, kind) = extract_text_for_detection(path, &bytes);
2694
2695        assert_eq!(kind, ExtractedTextKind::Decoded);
2696        let lines: Vec<_> = text.lines().collect();
2697        assert_eq!(lines.get(2).copied(), Some(" *"));
2698        assert_eq!(
2699            lines.get(3).copied(),
2700            Some(" *https://github.com/rpm-software-management")
2701        );
2702        assert_eq!(lines.get(5).copied(), Some("https://gitlab.com/Conan_Kudo"));
2703    }
2704
2705    #[test]
2706    fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
2707        let path = Path::new(
2708            "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
2709        );
2710        let bytes = std::fs::read(path).expect("failed to read rtf fixture");
2711
2712        let (text, kind) = extract_text_for_detection(path, &bytes);
2713
2714        assert_eq!(kind, ExtractedTextKind::Decoded);
2715        assert!(text.contains("GNU Lesser General Public"));
2716        assert!(text.contains("version"));
2717        assert!(text.contains("2.1 of the License"));
2718    }
2719
2720    #[test]
2721    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
2722        assert_eq!(
2723            normalize_mime_type(
2724                Path::new("main.ts"),
2725                b"export const answer = 42;\n",
2726                Some("TypeScript"),
2727                "video/mp2t",
2728            ),
2729            "text/plain"
2730        );
2731    }
2732
2733    #[test]
2734    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
2735        assert_eq!(
2736            normalize_mime_type(
2737                Path::new("main.js"),
2738                b"console.log('hello');\n",
2739                Some("JavaScript"),
2740                "application/octet-stream",
2741            ),
2742            "text/plain"
2743        );
2744    }
2745
2746    #[test]
2747    fn test_normalize_mime_type_preserves_binary_video_guess() {
2748        assert_eq!(
2749            normalize_mime_type(
2750                Path::new("main.ts"),
2751                &[0, 159, 146, 150, 0, 1, 2, 3],
2752                Some("TypeScript"),
2753                "video/mp2t",
2754            ),
2755            "video/mp2t"
2756        );
2757    }
2758
2759    #[test]
2760    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
2761        assert_eq!(
2762            normalize_mime_type(
2763                Path::new("main.ts"),
2764                &[0, 159, 146, 150],
2765                Some("TypeScript"),
2766                "application/octet-stream",
2767            ),
2768            "application/octet-stream"
2769        );
2770    }
2771
2772    #[test]
2773    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
2774        let classification = classify_file_info(Path::new("test.txt"), b"");
2775
2776        assert_eq!(classification.mime_type, "inode/x-empty");
2777        assert_eq!(classification.file_type, "empty");
2778        assert!(!classification.is_binary);
2779        assert!(classification.is_text);
2780        assert!(!classification.is_source);
2781        assert_eq!(classification.programming_language, None);
2782    }
2783
2784    #[test]
2785    fn test_classify_file_info_keeps_json_out_of_programming_language() {
2786        let classification = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);
2787
2788        assert_eq!(classification.mime_type, "application/json");
2789        assert_eq!(classification.file_type, "JSON text data");
2790        assert!(classification.is_text);
2791        assert!(!classification.is_source);
2792        assert_eq!(classification.programming_language, None);
2793    }
2794
2795    #[test]
2796    fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
2797        let classification =
2798            classify_file_info(Path::new("broken.json"), b"{ definitely not json\n");
2799
2800        assert_eq!(classification.mime_type, "text/plain");
2801        assert_eq!(classification.file_type, "UTF-8 Unicode text");
2802        assert!(classification.is_text);
2803        assert!(!classification.is_binary);
2804    }
2805
2806    #[test]
2807    fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
2808        let classification =
2809            classify_file_info(Path::new("broken.json"), &[0xff, 0x00, 0x01, 0x02]);
2810
2811        assert_eq!(classification.mime_type, "application/octet-stream");
2812        assert_eq!(classification.file_type, "data");
2813        assert!(classification.is_binary);
2814        assert!(!classification.is_text);
2815    }
2816
2817    #[test]
2818    fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
2819        let classification = classify_file_info(
2820            Path::new("utf16.json"),
2821            &[
2822                0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
2823            ],
2824        );
2825
2826        assert!(!classification.is_binary);
2827        assert!(classification.is_text);
2828        assert_eq!(classification.mime_type, "application/json");
2829        assert_eq!(classification.file_type, "JSON text data");
2830    }
2831
2832    #[test]
2833    fn test_classify_file_info_treats_valid_utf16be_json_without_bom_as_text() {
2834        let classification = classify_file_info(
2835            Path::new("utf16be.json"),
2836            &[0x00, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D],
2837        );
2838
2839        assert!(!classification.is_binary);
2840        assert!(classification.is_text);
2841        assert_eq!(classification.mime_type, "application/json");
2842        assert_eq!(classification.file_type, "JSON text data");
2843    }
2844
2845    #[test]
2846    fn test_classify_file_info_treats_small_valid_utf16be_json_literal_as_text() {
2847        let classification =
2848            classify_file_info(Path::new("utf16be-literal.json"), &[0x00, 0x5B, 0x00, 0x5D]);
2849
2850        assert!(!classification.is_binary);
2851        assert!(classification.is_text);
2852        assert_eq!(classification.mime_type, "application/json");
2853        assert_eq!(classification.file_type, "JSON text data");
2854    }
2855
2856    #[test]
2857    fn test_extract_text_for_detection_decodes_utf16be_text_with_corrupted_bom_prefix() {
2858        let mut bytes = super::CORRUPTED_UTF16_BOM_PREFIX.to_vec();
2859        for code_unit in
2860            "Licensed to the Apache Software Foundation\nApache License, Version 2.0".encode_utf16()
2861        {
2862            bytes.extend_from_slice(&code_unit.to_be_bytes());
2863        }
2864
2865        let (text, kind) = extract_text_for_detection(Path::new("notice.ftl"), &bytes);
2866
2867        assert_eq!(kind, ExtractedTextKind::Decoded);
2868        assert!(text.contains("Apache Software Foundation"), "{text}");
2869        assert!(text.contains("Apache License, Version 2.0"), "{text}");
2870    }
2871
2872    #[test]
2873    fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
2874        let classification = classify_file_info(Path::new("true.json"), b"true");
2875
2876        assert!(!classification.is_binary);
2877        assert!(classification.is_text);
2878        assert_eq!(classification.mime_type, "application/json");
2879        assert_eq!(classification.file_type, "JSON text data");
2880    }
2881
2882    #[test]
2883    fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
2884        let classification = classify_file_info(
2885            Path::new("wrapped.json"),
2886            &[0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D],
2887        );
2888
2889        assert!(!classification.is_binary);
2890        assert!(classification.is_text);
2891        assert_eq!(classification.mime_type, "text/plain");
2892        assert_eq!(classification.file_type, "text, with no line terminators");
2893    }
2894
2895    #[test]
2896    fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
2897        let classification =
2898            classify_file_info(Path::new("lone-ff.json"), &[0x5B, 0x22, 0xFF, 0x22, 0x5D]);
2899
2900        assert!(classification.is_binary);
2901        assert!(!classification.is_text);
2902        assert_eq!(classification.mime_type, "application/octet-stream");
2903        assert_eq!(classification.file_type, "data");
2904    }
2905
2906    #[test]
2907    fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
2908        let classification = classify_file_info(
2909            Path::new("crash.json"),
2910            &[
2911                0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
2912            ],
2913        );
2914
2915        assert!(classification.is_binary);
2916        assert!(!classification.is_text);
2917        assert_eq!(classification.mime_type, "application/octet-stream");
2918    }
2919
2920    #[test]
2921    fn test_classify_file_info_treats_dockerfile_as_source() {
2922        let classification = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");
2923
2924        assert_eq!(
2925            classification.programming_language.as_deref(),
2926            Some("Dockerfile")
2927        );
2928        assert!(classification.is_source);
2929        assert!(!classification.is_script);
2930        assert_eq!(
2931            classification.file_type,
2932            "Dockerfile source, UTF-8 Unicode text"
2933        );
2934    }
2935
2936    #[test]
2937    fn test_classify_file_info_treats_makefile_as_text_not_source() {
2938        let classification = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");
2939
2940        assert_eq!(classification.programming_language, None);
2941        assert!(classification.is_text);
2942        assert!(!classification.is_source);
2943        assert!(!classification.is_script);
2944        assert_eq!(classification.file_type, "UTF-8 Unicode text");
2945    }
2946
2947    #[test]
2948    fn test_classify_file_info_marks_supported_package_archives() {
2949        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";
2950
2951        let egg = classify_file_info(Path::new("demo.egg"), zip_bytes);
2952        let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_bytes);
2953
2954        assert!(egg.is_archive);
2955        assert_eq!(egg.mime_type, "application/zip");
2956        assert_eq!(egg.file_type, "Zip archive data");
2957        assert!(nupkg.is_archive);
2958        assert_eq!(nupkg.mime_type, "application/zip");
2959        assert_eq!(nupkg.file_type, "Zip archive data");
2960    }
2961
2962    #[test]
2963    fn test_classify_file_info_marks_png_as_binary_media() {
2964        let png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";
2965
2966        let classification = classify_file_info(Path::new("logo.png"), png_bytes);
2967
2968        assert_eq!(classification.mime_type, "image/png");
2969        assert_eq!(classification.file_type, "PNG image data");
2970        assert!(classification.is_binary);
2971        assert!(!classification.is_text);
2972        assert!(classification.is_media);
2973        assert!(!classification.is_archive);
2974        assert!(!classification.is_source);
2975    }
2976
2977    #[test]
2978    fn test_classify_file_info_marks_pdf_as_binary_document() {
2979        let pdf_bytes = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";
2980
2981        let classification = classify_file_info(Path::new("report.pdf"), pdf_bytes);
2982
2983        assert_eq!(classification.mime_type, "application/pdf");
2984        assert_eq!(classification.file_type, "PDF document");
2985        assert!(classification.is_binary);
2986        assert!(!classification.is_text);
2987        assert!(!classification.is_archive);
2988        assert!(!classification.is_media);
2989    }
2990
2991    #[test]
2992    fn test_classify_file_info_marks_binary_blobs_as_binary() {
2993        let classification =
2994            classify_file_info(Path::new("blob.bin"), &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5]);
2995
2996        assert!(classification.is_binary);
2997        assert!(!classification.is_text);
2998        assert!(!classification.is_source);
2999        assert_eq!(classification.programming_language, None);
3000    }
3001
3002    #[test]
3003    fn test_classify_file_info_treats_yaml_as_text_not_source() {
3004        let classification = classify_file_info(Path::new("config.yaml"), b"key: value\n");
3005
3006        assert_eq!(classification.programming_language, None);
3007        assert!(classification.is_text);
3008        assert!(!classification.is_source);
3009        assert_eq!(classification.file_type, "YAML text data");
3010    }
3011
3012    #[test]
3013    fn test_classify_file_info_classifies_common_build_manifests() {
3014        let gradle = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
3015        let flake = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
3016        let cmake = classify_file_info(
3017            Path::new("toolchain.cmake"),
3018            b"set(CMAKE_CXX_STANDARD 20)\n",
3019        );
3020        let gitmodules = classify_file_info(
3021            Path::new(".gitmodules"),
3022            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
3023        );
3024
3025        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
3026        assert!(gradle.is_source);
3027        assert_eq!(gradle.mime_type, "text/plain");
3028        assert_eq!(gradle.file_type, "Groovy source, UTF-8 Unicode text");
3029
3030        assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
3031        assert!(flake.is_source);
3032        assert_eq!(flake.mime_type, "text/plain");
3033        assert_eq!(flake.file_type, "Nix source, UTF-8 Unicode text");
3034
3035        assert_eq!(cmake.programming_language.as_deref(), Some("CMake"));
3036        assert!(cmake.is_source);
3037        assert_eq!(cmake.file_type, "CMake source, UTF-8 Unicode text");
3038
3039        assert_eq!(gitmodules.programming_language, None);
3040        assert!(gitmodules.is_text);
3041        assert!(!gitmodules.is_source);
3042        assert_eq!(gitmodules.file_type, "Git configuration text");
3043    }
3044
3045    #[test]
3046    fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
3047        let header = classify_file_info(
3048            Path::new("include/demo.hpp"),
3049            b"#pragma once\nclass Demo {};\n",
3050        );
3051        let ipp = classify_file_info(
3052            Path::new("include/detail/demo.ipp"),
3053            b"template <class T> void parse() {}\n",
3054        );
3055
3056        assert_eq!(header.programming_language.as_deref(), Some("C++"));
3057        assert!(header.is_source);
3058        assert!(!header.is_script);
3059        assert_eq!(header.file_type, "C++ source, UTF-8 Unicode text");
3060
3061        assert_eq!(ipp.programming_language, None);
3062        assert!(!ipp.is_source);
3063        assert!(!ipp.is_script);
3064        assert_eq!(ipp.file_type, "UTF-8 Unicode text");
3065    }
3066
3067    #[test]
3068    fn test_classify_file_info_preserves_specific_shell_family_labels() {
3069        let bash = classify_file_info(Path::new("bin/run"), b"#!/usr/bin/env bash\necho hi\n");
3070
3071        assert_eq!(bash.programming_language.as_deref(), Some("Bash"));
3072        assert!(bash.is_script);
3073        assert_eq!(bash.file_type, "bash script, UTF-8 Unicode text executable");
3074    }
3075
3076    #[test]
3077    fn test_classify_file_info_marks_jamfile_as_source() {
3078        let jamfile = classify_file_info(Path::new("Jamfile"), b"lib boost_json ;\n");
3079
3080        assert_eq!(jamfile.programming_language.as_deref(), Some("Jamfile"));
3081        assert!(jamfile.is_source);
3082        assert!(!jamfile.is_script);
3083        assert_eq!(jamfile.file_type, "Jamfile source, UTF-8 Unicode text");
3084    }
3085
3086    #[test]
3087    fn test_classify_file_info_labels_javascript_shebang_scripts() {
3088        let classification = classify_file_info(
3089            Path::new("bin/run"),
3090            b"#!/usr/bin/env node\nconsole.log('hello');\n",
3091        );
3092
3093        assert_eq!(
3094            classification.programming_language.as_deref(),
3095            Some("JavaScript")
3096        );
3097        assert!(classification.is_script);
3098        assert_eq!(
3099            classification.file_type,
3100            "javascript script, UTF-8 Unicode text executable"
3101        );
3102    }
3103
3104    #[test]
3105    fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
3106        let classification = classify_file_info(
3107            Path::new("script.py"),
3108            b"# coding: latin-1\nprint(\"caf\xe9\")\n",
3109        );
3110
3111        assert_eq!(
3112            classification.programming_language.as_deref(),
3113            Some("Python")
3114        );
3115        assert!(classification.is_script);
3116        assert_eq!(classification.file_type, "python script, text executable");
3117    }
3118
3119    #[test]
3120    fn test_classify_file_info_treats_textual_tga_as_media() {
3121        let classification = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");
3122
3123        assert!(classification.is_media);
3124        assert!(classification.is_text);
3125        assert!(!classification.is_binary);
3126    }
3127
3128    #[test]
3129    fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
3130        let classification =
3131            classify_file_info(Path::new("main.ts"), &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85]);
3132
3133        assert!(classification.is_binary);
3134        assert!(!classification.is_text);
3135        assert!(!classification.is_source);
3136        assert_eq!(classification.programming_language, None);
3137    }
3138
3139    #[test]
3140    fn test_extract_text_for_detection_skips_unsupported_image_formats() {
3141        let gif_bytes = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";
3142
3143        let (text, kind) = extract_text_for_detection(Path::new("tiny.gif"), gif_bytes);
3144
3145        assert!(text.is_empty());
3146        assert_eq!(kind, ExtractedTextKind::None);
3147    }
3148
3149    #[test]
3150    fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
3151        let cases = [
3152            (
3153                Path::new("bin/run"),
3154                b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
3155                Some("JavaScript"),
3156                true,
3157                true,
3158            ),
3159            (
3160                Path::new("Dockerfile"),
3161                b"FROM scratch\n".as_slice(),
3162                Some("Dockerfile"),
3163                true,
3164                false,
3165            ),
3166            (
3167                Path::new("package.json"),
3168                br#"{"name":"demo"}"#.as_slice(),
3169                None,
3170                false,
3171                false,
3172            ),
3173            (
3174                Path::new("config.yaml"),
3175                b"key: value\n".as_slice(),
3176                None,
3177                false,
3178                false,
3179            ),
3180            (
3181                Path::new("Makefile"),
3182                b"all:\n\techo hi\n".as_slice(),
3183                None,
3184                false,
3185                false,
3186            ),
3187        ];
3188
3189        for (path, bytes, expected_language, expected_is_source, expected_is_script) in cases {
3190            let classification = classify_file_info(path, bytes);
3191
3192            assert_eq!(
3193                classification.programming_language.as_deref(),
3194                expected_language,
3195                "unexpected language for {}",
3196                path.display()
3197            );
3198            assert_eq!(
3199                classification.is_source,
3200                expected_is_source,
3201                "unexpected is_source for {}",
3202                path.display()
3203            );
3204            assert_eq!(
3205                classification.is_script,
3206                expected_is_script,
3207                "unexpected is_script for {}",
3208                path.display()
3209            );
3210        }
3211    }
3212}