Skip to main content

provenant/utils/
file.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use std::borrow::Cow;
5use std::collections::BTreeSet;
6use std::fs;
7use std::io::{BufReader, Cursor, Read};
8use std::panic::{AssertUnwindSafe, catch_unwind};
9use std::path::Path;
10
11use chrono::{TimeZone, Utc};
12use file_format::{FileFormat, Kind as FileFormatKind};
13use flate2::read::ZlibDecoder;
14use glob::Pattern;
15use image::{ImageDecoder, ImageFormat, ImageReader};
16use mime_guess::from_path;
17use quick_xml::events::Event;
18use quick_xml::reader::Reader as XmlReader;
19
20use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
21use crate::utils::font::extract_font_metadata_text;
22use crate::utils::language::detect_language;
23
/// Identifies which extraction strategy produced the text handed to detection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No usable text was extracted.
    None,
    /// The bytes were decoded directly as text (this also covers RTF content).
    Decoded,
    /// Text came from font metadata, optionally followed by printable strings.
    FontMetadata,
    /// Text was extracted from a PDF document.
    Pdf,
    /// Text is made of printable strings pulled out of binary content.
    BinaryStrings,
    /// Text came from image metadata.
    ImageMetadata,
    /// Only Windows executable metadata text was available.
    WindowsExecutableMetadata,
}
34
/// Classification summary for a single file, as produced by `classify_file_info`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type resolved from the file content and path.
    pub mime_type: String,
    /// Human-readable description of the file type.
    pub file_type: String,
    /// Detected language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content is treated as binary.
    pub is_binary: bool,
    /// Whether the content is treated as text (the negation of `is_binary`).
    pub is_text: bool,
    /// Whether the file is an archive, compressed file or package.
    pub is_archive: bool,
    /// Whether the file is image, audio or video content.
    pub is_media: bool,
    /// Whether the file is considered source code.
    pub is_source: bool,
    /// Whether the file is considered a script.
    pub is_script: bool,
}
47
/// Cap on image metadata values.
/// NOTE(review): the consumer is outside this part of the file — confirm exact semantics.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
/// Cap, in bytes, on image metadata text (32 KiB).
/// NOTE(review): the consumer is outside this part of the file — confirm exact semantics.
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
/// Content counts as binary when its control bytes outnumber
/// `len / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR`.
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
/// Size threshold (512 KiB) for large opaque binaries.
/// NOTE(review): the consumer is outside this part of the file — confirm exact semantics.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
/// Inputs larger than this (4 MiB) are never parsed to validate JSON.
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;
/// Lowercase extensions of plain-text files that are never reported as source code.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
/// Lowercase extensions that force a binary classification for non-textual formats.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
/// Lowercase extensions that mark a non-text file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
63
64/// Get the last modified date of a file as a `YYYY-MM-DD` string.
65pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
66    metadata.modified().ok().map(|time: std::time::SystemTime| {
67        let seconds_since_epoch = time
68            .duration_since(std::time::UNIX_EPOCH)
69            .unwrap()
70            .as_secs() as i64;
71
72        Utc.timestamp_opt(seconds_since_epoch, 0)
73            .single()
74            .unwrap_or_else(Utc::now)
75            .format("%Y-%m-%d")
76            .to_string()
77    })
78}
79
80/// Check if a path should be excluded based on a list of glob patterns.
81pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
82    let path_str = path.to_string_lossy();
83    let file_name = path
84        .file_name()
85        .map(|name| name.to_string_lossy())
86        .unwrap_or_default();
87
88    for pattern in exclude_patterns {
89        // Match against full path
90        if pattern.matches(&path_str) {
91            return true;
92        }
93
94        // Match against just the file/directory name
95        if pattern.matches(&file_name) {
96            return true;
97        }
98    }
99
100    false
101}
102
103/// Decode a byte buffer to a String, trying UTF-8 first, then Latin-1.
104///
105/// Latin-1 (ISO-8859-1) maps bytes 0x00-0xFF directly to Unicode U+0000-U+00FF,
106/// so it can decode any byte sequence. This matches Python ScanCode's use of
107/// `UnicodeDammit` which auto-detects encoding with Latin-1 as fallback.
108pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
109    match String::from_utf8(bytes.to_vec()) {
110        Ok(s) => s,
111        Err(e) => {
112            let bytes = e.into_bytes();
113            if has_binary_control_chars(&bytes) {
114                return String::new();
115            }
116            bytes.iter().map(|&b| b as char).collect()
117        }
118    }
119}
120
121pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
122    let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
123    (text, kind)
124}
125
126pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
127    let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
128        return Cow::Borrowed(text);
129    };
130    if !matches!(
131        extension.to_ascii_lowercase().as_str(),
132        "md" | "markdown" | "html" | "htm"
133    ) {
134        return Cow::Borrowed(text);
135    }
136
137    let mut hints = Vec::new();
138    if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
139        hints.push("Creative Commons Attribution 4.0 International License".to_string());
140    }
141    if text.contains("Apache License (Version 2.0)") || text.contains("Apache License, Version 2.0")
142    {
143        hints.push(
144            "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
145                .to_string(),
146        );
147    }
148
149    hints.extend(extract_shields_license_badge_hints(text));
150
151    if hints.is_empty() {
152        Cow::Borrowed(text)
153    } else {
154        let mut augmented =
155            String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
156        augmented.push_str(text);
157        augmented.push_str("\n\n");
158        for (index, hint) in hints.into_iter().enumerate() {
159            if index > 0 {
160                augmented.push('\n');
161            }
162            augmented.push_str(&hint);
163        }
164        Cow::Owned(augmented)
165    }
166}
167
168fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
169    let mut hints = Vec::new();
170    let mut rest = text;
171    let needle = "img.shields.io/badge/license-";
172
173    while let Some(index) = rest.find(needle) {
174        let start = index + needle.len();
175        let suffix = &rest[start..];
176        let end = suffix
177            .find([')', ']', '"', '\'', ' ', '\n'])
178            .unwrap_or(suffix.len());
179        let badge = &suffix[..end];
180        let Some(badge) = badge.strip_suffix(".svg") else {
181            rest = &suffix[end..];
182            continue;
183        };
184
185        let mut segments: Vec<_> = badge
186            .split('-')
187            .filter(|segment| !segment.is_empty())
188            .collect();
189        if segments.len() < 2 {
190            rest = &suffix[end..];
191            continue;
192        }
193        segments.pop();
194        let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
195        if !candidate.is_empty() {
196            hints.push(canonical_shields_license_hint(&candidate));
197        }
198
199        rest = &suffix[end..];
200    }
201
202    hints.sort();
203    hints.dedup();
204    hints
205}
206
/// Turn a badge license label into a phrase suitable for license detection.
///
/// The label is trimmed first. `MIT` and the two Apache 2.0 spellings map to
/// fixed phrases; any other label gets ` License` appended.
fn canonical_shields_license_hint(candidate: &str) -> String {
    let label = candidate.trim();
    let canonical = match label {
        "MIT" => "The MIT License",
        "Apache-2.0" | "Apache 2.0" => "Apache License 2.0",
        _ => return format!("{label} License"),
    };
    canonical.to_string()
}
214
215pub(crate) fn extract_text_for_detection_with_diagnostics(
216    path: &Path,
217    bytes: &[u8],
218) -> (String, ExtractedTextKind, Option<String>) {
219    let ext = path
220        .extension()
221        .and_then(|e| e.to_str())
222        .map(|s| s.to_ascii_lowercase());
223    let detected_format = detect_file_format(bytes);
224
225    if looks_like_rtf(bytes, ext.as_deref()) {
226        let text = extract_rtf_text(bytes);
227        return if text.trim().is_empty() {
228            (String::new(), ExtractedTextKind::None, None)
229        } else {
230            (text, ExtractedTextKind::Decoded, None)
231        };
232    }
233
234    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
235        let (text, scan_error) = extract_pdf_text(path, bytes);
236        return if text.is_empty() {
237            (String::new(), ExtractedTextKind::None, scan_error)
238        } else {
239            (text, ExtractedTextKind::Pdf, None)
240        };
241    }
242
243    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
244        let text = extract_image_metadata_text(bytes, format);
245        return if text.is_empty() {
246            if is_supported_image_container(bytes, format) {
247                (String::new(), ExtractedTextKind::None, None)
248            } else {
249                let decoded = decode_bytes_to_string(bytes);
250                if decoded.is_empty() {
251                    (String::new(), ExtractedTextKind::None, None)
252                } else {
253                    (decoded, ExtractedTextKind::Decoded, None)
254                }
255            }
256        } else {
257            (text, ExtractedTextKind::ImageMetadata, None)
258        };
259    }
260
261    if let Some(text) = extract_font_metadata_text(path, bytes) {
262        let strings = extract_printable_strings(bytes);
263        let combined = if strings.is_empty() {
264            text
265        } else {
266            combine_extracted_text_fragments(Some(text), strings)
267        };
268        return (combined, ExtractedTextKind::FontMetadata, None);
269    }
270
271    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
272    let large_opaque_binary = windows_executable_metadata_text.is_none()
273        && is_large_opaque_binary_candidate(bytes, detected_format);
274
275    if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
276        return windows_metadata_or_empty_result(windows_executable_metadata_text);
277    }
278
279    if should_skip_binary_string_extraction(path, bytes, detected_format) {
280        return (String::new(), ExtractedTextKind::None, None);
281    }
282
283    if !large_opaque_binary {
284        let decoded = decode_bytes_to_string(bytes);
285        if !decoded.is_empty() {
286            let combined =
287                combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
288            return (combined, ExtractedTextKind::Decoded, None);
289        }
290    }
291
292    let text = if large_opaque_binary {
293        extract_sampled_printable_strings(bytes)
294    } else {
295        extract_printable_strings(bytes)
296    };
297    if text.is_empty() {
298        windows_metadata_or_empty_result(windows_executable_metadata_text)
299    } else {
300        (
301            combine_extracted_text_fragments(windows_executable_metadata_text, text),
302            ExtractedTextKind::BinaryStrings,
303            None,
304        )
305    }
306}
307
/// Join an optional leading fragment and a trailing fragment with a newline.
///
/// A missing or empty prefix yields the suffix unchanged, and an empty suffix
/// yields the prefix unchanged, so no dangling newline is ever produced.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let lead = prefix.unwrap_or_default();
    if lead.is_empty() {
        return suffix;
    }
    if suffix.is_empty() {
        return lead;
    }
    format!("{lead}\n{suffix}")
}
315
316fn windows_metadata_or_empty_result(
317    windows_executable_metadata_text: Option<String>,
318) -> (String, ExtractedTextKind, Option<String>) {
319    if let Some(metadata_text) = windows_executable_metadata_text {
320        (
321            metadata_text,
322            ExtractedTextKind::WindowsExecutableMetadata,
323            None,
324        )
325    } else {
326        (String::new(), ExtractedTextKind::None, None)
327    }
328}
329
330pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
331    let detected_format = detect_file_format(bytes);
332    let detected_language = detect_language(path, bytes);
333    let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
334    let is_text = !is_binary;
335    let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
336    let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
337    let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
338    let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
339    let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
340    let programming_language = is_source.then(|| detected_language.clone()).flatten();
341    let file_type = detect_file_type(
342        path,
343        bytes,
344        detected_format,
345        &mime_type,
346        programming_language.as_deref(),
347        is_binary,
348        is_text,
349        is_archive,
350        is_media,
351        is_script,
352    );
353
354    FileInfoClassification {
355        mime_type,
356        file_type,
357        programming_language,
358        is_binary,
359        is_text,
360        is_archive,
361        is_media,
362        is_source,
363        is_script,
364    }
365}
366
367fn detect_file_format(bytes: &[u8]) -> FileFormat {
368    FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
369}
370
/// Report whether `bytes` is entirely valid UTF-8 (an empty slice is valid).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
374
/// Decode UTF-16 text that starts with a byte-order mark.
///
/// Returns `None` when there is no `FF FE` (little-endian) or `FE FF`
/// (big-endian) BOM, when nothing follows the BOM, when the payload has an
/// odd number of bytes, or when it contains an unpaired surrogate.
fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
    let (little_endian, payload) = if let Some(tail) = bytes.strip_prefix(&[0xFF, 0xFE]) {
        (true, tail)
    } else if let Some(tail) = bytes.strip_prefix(&[0xFE, 0xFF]) {
        (false, tail)
    } else {
        return None;
    };

    // The BOM is two bytes, so an odd payload means an odd overall length.
    if payload.is_empty() || payload.len() % 2 != 0 {
        return None;
    }

    let units = payload.chunks_exact(2).map(|pair| {
        let raw = [pair[0], pair[1]];
        if little_endian {
            u16::from_le_bytes(raw)
        } else {
            u16::from_be_bytes(raw)
        }
    });

    char::decode_utf16(units).collect::<Result<String, _>>().ok()
}
405
406fn has_binary_control_chars(bytes: &[u8]) -> bool {
407    let control_count = bytes
408        .iter()
409        .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
410        .count();
411    control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
412}
413
414fn has_decodable_text(bytes: &[u8]) -> bool {
415    bytes.is_empty()
416        || is_utf8_text(bytes)
417        || decode_utf16_bom_text(bytes).is_some()
418        || !has_binary_control_chars(bytes)
419}
420
421fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
422    if bytes.is_empty() || is_utf8_text(bytes) {
423        return true;
424    }
425    if let Some(decoded) = decode_utf16_bom_text(bytes) {
426        return decoded
427            .chars()
428            .any(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'));
429    }
430
431    let printable_count = bytes
432        .iter()
433        .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
434        .count();
435    printable_count * 2 >= bytes.len()
436}
437
/// Report whether a media type denotes textual content.
///
/// Textual types are anything under `text/`, `application/json`,
/// `application/xml`, and any type with a `+json` or `+xml` suffix.
fn is_textual_media_type(media_type: &str) -> bool {
    // `text/xml` needs no separate entry: the `text/` prefix already covers it.
    if media_type.starts_with("text/") {
        return true;
    }
    if media_type == "application/json" || media_type == "application/xml" {
        return true;
    }
    ["+json", "+xml"]
        .iter()
        .any(|suffix| media_type.ends_with(*suffix))
}
447
448fn is_textual_format(detected_format: FileFormat) -> bool {
449    matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
450        || is_textual_media_type(detected_format.media_type())
451}
452
453fn is_known_binary_format(detected_format: FileFormat) -> bool {
454    !matches!(detected_format, FileFormat::ArbitraryBinaryData)
455        && !is_textual_format(detected_format)
456}
457
458pub fn detect_mime_type(
459    path: &Path,
460    bytes: &[u8],
461    detected_format: FileFormat,
462    programming_language: Option<&str>,
463) -> String {
464    if bytes.is_empty() {
465        return "inode/x-empty".to_string();
466    }
467
468    if lower_extension(path).as_deref() == Some("json") {
469        if let Some(is_binary) = json_binary_override(bytes) {
470            if is_binary {
471                return "application/octet-stream".to_string();
472            }
473            if has_valid_json_text(bytes) {
474                return "application/json".to_string();
475            }
476            return "text/plain".to_string();
477        }
478        if has_valid_json_text(bytes) {
479            return "application/json".to_string();
480        }
481        if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) {
482            return "text/plain".to_string();
483        }
484        return "application/octet-stream".to_string();
485    }
486
487    if is_zip_archive(bytes) {
488        return detect_zip_like_mime(path);
489    }
490
491    if looks_like_deb(bytes, path) {
492        return "application/vnd.debian.binary-package".to_string();
493    }
494
495    if looks_like_rpm(bytes, path) {
496        return "application/x-rpm".to_string();
497    }
498
499    let guessed_mime = from_path(path)
500        .first_or_octet_stream()
501        .essence_str()
502        .to_string();
503
504    let mime_type = match detected_format {
505        FileFormat::Empty => "inode/x-empty".to_string(),
506        FileFormat::PlainText => {
507            if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
508                "text/plain".to_string()
509            } else {
510                guessed_mime.clone()
511            }
512        }
513        _ => {
514            let detected_mime = detected_format.media_type();
515            if detected_mime == "application/octet-stream"
516                && guessed_mime != "application/octet-stream"
517            {
518                guessed_mime.clone()
519            } else {
520                detected_mime.to_string()
521            }
522        }
523    };
524
525    normalize_mime_type(path, bytes, programming_language, &mime_type)
526}
527
528fn normalize_mime_type(
529    path: &Path,
530    bytes: &[u8],
531    programming_language: Option<&str>,
532    mime_type: &str,
533) -> String {
534    if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
535        return "text/plain".to_string();
536    }
537
538    mime_type.to_string()
539}
540
541fn should_prefer_text_mime(
542    path: &Path,
543    bytes: &[u8],
544    programming_language: Option<&str>,
545    mime_type: &str,
546) -> bool {
547    has_decodable_text(bytes)
548        && looks_like_textual_bytes(bytes)
549        && is_textual_source_candidate(path, programming_language)
550        && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
551}
552
553fn has_valid_json_text(bytes: &[u8]) -> bool {
554    if bytes.len() > JSON_VALIDATION_MAX_BYTES {
555        return false;
556    }
557
558    serde_json::from_slice::<serde_json::Value>(bytes).is_ok()
559        || decode_utf16_bom_text(bytes)
560            .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok())
561            .is_some()
562}
563
/// Report whether `bytes` has the shape of a one-element JSON string array.
///
/// The content must be at least 8 bytes long, start with `["`, end with `"]`,
/// and contain neither 0x00 nor 0xFF bytes.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    if bytes.len() < 8 {
        return false;
    }

    let is_wrapped = bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]");
    let has_forbidden_byte = bytes.iter().any(|&byte| byte == 0x00 || byte == 0xFF);
    is_wrapped && !has_forbidden_byte
}
571
572fn json_binary_override(bytes: &[u8]) -> Option<bool> {
573    if has_valid_json_text(bytes) || decode_utf16_bom_text(bytes).is_some() {
574        return Some(false);
575    }
576
577    if bytes.contains(&0) {
578        return Some(true);
579    }
580
581    if bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024) {
582        return Some(true);
583    }
584
585    if is_wrapped_invalid_json_string_text(bytes) {
586        return Some(false);
587    }
588
589    None
590}
591
592fn detect_is_binary(
593    path: &Path,
594    bytes: &[u8],
595    detected_format: FileFormat,
596    programming_language: Option<&str>,
597) -> bool {
598    if lower_extension(path).as_deref() == Some("json")
599        && let Some(is_binary) = json_binary_override(bytes)
600    {
601        return is_binary;
602    }
603
604    if is_textual_format(detected_format) {
605        return false;
606    }
607
608    if lower_extension(path)
609        .as_deref()
610        .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
611    {
612        return true;
613    }
614
615    if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
616        return false;
617    }
618
619    has_binary_control_chars(bytes)
620        || is_known_binary_format(detected_format)
621        || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
622            && !looks_like_textual_bytes(bytes))
623}
624
625fn should_treat_binary_bytes_as_text(
626    path: &Path,
627    bytes: &[u8],
628    programming_language: Option<&str>,
629) -> bool {
630    has_decodable_text(bytes)
631        && looks_like_textual_bytes(bytes)
632        && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
633}
634
635fn detect_is_archive(
636    path: &Path,
637    bytes: &[u8],
638    mime_type: &str,
639    is_text: bool,
640    detected_format: FileFormat,
641) -> bool {
642    if is_text {
643        return false;
644    }
645
646    lower_extension(path)
647        .as_deref()
648        .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
649        || matches!(
650            detected_format.kind(),
651            FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
652        )
653        || is_zip_archive(bytes)
654        || looks_like_gzip(bytes)
655        || looks_like_bzip2(bytes)
656        || looks_like_xz(bytes)
657        || looks_like_deb(bytes, path)
658        || looks_like_rpm(bytes, path)
659        || looks_like_squashfs(bytes, path)
660        || mime_type.contains("zip")
661        || mime_type.contains("compressed")
662        || mime_type.contains("tar")
663        || mime_type.contains("x-rpm")
664        || mime_type.contains("debian")
665}
666
667fn detect_is_media(
668    path: &Path,
669    bytes: &[u8],
670    mime_type: &str,
671    detected_format: FileFormat,
672) -> bool {
673    media_mime_from_content(bytes).is_some()
674        || matches!(
675            detected_format.kind(),
676            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
677        )
678        || mime_type.starts_with("image/")
679        || mime_type.starts_with("audio/")
680        || mime_type.starts_with("video/")
681        || (mime_type == "application/octet-stream"
682            && lower_extension(path).as_deref() == Some("tga")
683            && !has_binary_control_chars(bytes))
684}
685
686fn detect_is_script(
687    path: &Path,
688    bytes: &[u8],
689    programming_language: Option<&str>,
690    is_text: bool,
691) -> bool {
692    if !is_text || is_makefile(path) {
693        return false;
694    }
695
696    bytes.starts_with(b"#!")
697        || lower_extension(path).as_deref().is_some_and(|ext| {
698            matches!(
699                ext,
700                "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
701            )
702        })
703        || matches!(
704            programming_language,
705            Some(
706                "Shell"
707                    | "Bash"
708                    | "Zsh"
709                    | "Fish"
710                    | "Ksh"
711                    | "Python"
712                    | "Ruby"
713                    | "Perl"
714                    | "PHP"
715                    | "PowerShell"
716                    | "Awk"
717            )
718        )
719}
720
721fn detect_is_source(
722    path: &Path,
723    programming_language: Option<&str>,
724    is_text: bool,
725    is_script: bool,
726) -> bool {
727    if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
728        return false;
729    }
730
731    if is_c_like_source(path) || is_java_like_source(path) {
732        return true;
733    }
734
735    programming_language.is_some() || is_script
736}
737
738#[allow(clippy::too_many_arguments)]
739fn detect_file_type(
740    path: &Path,
741    bytes: &[u8],
742    detected_format: FileFormat,
743    mime_type: &str,
744    programming_language: Option<&str>,
745    is_binary: bool,
746    is_text: bool,
747    is_archive: bool,
748    is_media: bool,
749    is_script: bool,
750) -> String {
751    if bytes.is_empty() {
752        return "empty".to_string();
753    }
754
755    if looks_like_pdf(bytes) {
756        return "PDF document".to_string();
757    }
758
759    if let Some(file_type) = media_file_type_from_content(bytes) {
760        return file_type.to_string();
761    }
762
763    if is_archive {
764        return archive_file_type(path, bytes, detected_format);
765    }
766
767    if is_script {
768        return script_file_type(programming_language, bytes);
769    }
770
771    if is_text {
772        if lower_extension(path).as_deref() == Some("json") {
773            if has_valid_json_text(bytes) {
774                return "JSON text data".to_string();
775            }
776            return text_file_type(bytes);
777        }
778        if lower_extension(path).as_deref() == Some("xml") {
779            return "XML text data".to_string();
780        }
781        if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
782            return "YAML text data".to_string();
783        }
784        if lower_extension(path).as_deref() == Some("toml") {
785            return "TOML text data".to_string();
786        }
787        if matches!(
788            lower_extension(path).as_deref(),
789            Some("ini" | "cfg" | "conf")
790        ) {
791            return "INI text data".to_string();
792        }
793        if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
794            return "Git configuration text".to_string();
795        }
796        if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
797            return text_file_type(bytes);
798        }
799        if programming_language.is_some() && !is_media {
800            return source_file_type(programming_language, bytes);
801        }
802        return text_file_type(bytes);
803    }
804
805    if let Some(file_type) = format_based_file_type(detected_format) {
806        return file_type;
807    }
808
809    if is_binary && mime_type == "application/octet-stream" {
810        return "data".to_string();
811    }
812
813    mime_type.to_string()
814}
815
816fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
817    if matches!(programming_language, Some(language) if is_source_like_language(language)) {
818        return true;
819    }
820
821    if matches!(
822        lower_file_name(path).as_str(),
823        "dockerfile"
824            | "containerfile"
825            | "containerfile.core"
826            | "apkbuild"
827            | "podfile"
828            | "jamfile"
829            | "jamroot"
830            | "meson.build"
831            | "build"
832            | "workspace"
833            | "buck"
834            | "default.nix"
835            | "flake.nix"
836            | "shell.nix"
837    ) {
838        return true;
839    }
840
841    path.extension()
842        .and_then(|ext| ext.to_str())
843        .is_some_and(|ext| {
844            matches!(
845                ext.to_ascii_lowercase().as_str(),
846                "rs" | "py"
847                    | "js"
848                    | "mjs"
849                    | "cjs"
850                    | "jsx"
851                    | "ts"
852                    | "mts"
853                    | "cts"
854                    | "tsx"
855                    | "c"
856                    | "cpp"
857                    | "cc"
858                    | "cxx"
859                    | "h"
860                    | "hpp"
861                    | "m"
862                    | "mm"
863                    | "s"
864                    | "asm"
865                    | "java"
866                    | "go"
867                    | "rb"
868                    | "php"
869                    | "pl"
870                    | "swift"
871                    | "sh"
872                    | "bash"
873                    | "zsh"
874                    | "fish"
875                    | "ksh"
876                    | "ps1"
877                    | "psm1"
878                    | "psd1"
879                    | "awk"
880                    | "kt"
881                    | "kts"
882                    | "dart"
883                    | "scala"
884                    | "groovy"
885                    | "gradle"
886                    | "gvy"
887                    | "gy"
888                    | "gsh"
889                    | "cs"
890                    | "fs"
891                    | "fsx"
892                    | "r"
893                    | "lua"
894                    | "jl"
895                    | "ex"
896                    | "exs"
897                    | "clj"
898                    | "cljs"
899                    | "cljc"
900                    | "hs"
901                    | "erl"
902                    | "nix"
903                    | "zig"
904                    | "bzl"
905                    | "bazel"
906                    | "star"
907                    | "sky"
908                    | "ml"
909                    | "mli"
910                    | "tex"
911            )
912        })
913}
914
/// Report whether `language` is one of the language names treated as textual
/// source. The comparison is exact and case-sensitive.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
961
/// Return the path's extension as `&str`, or `None` when there is no
/// extension or it is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    path.extension()?.to_str()
}
965
966fn lower_extension(path: &Path) -> Option<String> {
967    extension(path).map(|ext| ext.to_ascii_lowercase())
968}
969
/// Return the final path component in ASCII lowercase, or an empty string
/// when the path has no final component or it is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
976
977fn is_plain_text(path: &Path) -> bool {
978    lower_extension(path)
979        .as_deref()
980        .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
981}
982
983fn is_makefile(path: &Path) -> bool {
984    matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
985}
986
/// Report whether the path ends in `.js.map` or `.css.map`, ignoring ASCII
/// case. The whole (lossily decoded) path is examined.
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
991
992fn is_c_like_source(path: &Path) -> bool {
993    lower_extension(path).as_deref().is_some_and(|ext| {
994        matches!(
995            ext,
996            "c" | "cc"
997                | "cp"
998                | "cpp"
999                | "cxx"
1000                | "c++"
1001                | "h"
1002                | "hh"
1003                | "hpp"
1004                | "hxx"
1005                | "h++"
1006                | "i"
1007                | "ii"
1008                | "m"
1009                | "s"
1010                | "asm"
1011        )
1012    })
1013}
1014
1015fn is_java_like_source(path: &Path) -> bool {
1016    lower_extension(path)
1017        .as_deref()
1018        .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
1019}
1020
1021fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
1022    match detected_format {
1023        FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
1024        format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
1025        format => Some(match format.kind() {
1026            FileFormatKind::Image => short_name_or_name(&format, "image data"),
1027            FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
1028            FileFormatKind::Video => short_name_or_name(&format, "video data"),
1029            _ => format.name().to_string(),
1030        }),
1031    }
1032}
1033
1034fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
1035    format
1036        .short_name()
1037        .map(|short_name| format!("{short_name} {suffix}"))
1038        .unwrap_or_else(|| format!("{} {suffix}", format.name()))
1039}
1040
1041fn detect_zip_like_mime(path: &Path) -> String {
1042    match extension(path).map(|ext| ext.to_ascii_lowercase()) {
1043        Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
1044        Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
1045            "application/java-archive".to_string()
1046        }
1047        _ => "application/zip".to_string(),
1048    }
1049}
1050
/// Sniffs the leading bytes for a PNG, JPEG, TIFF or WebP signature and
/// returns the matching MIME type, or `None` when no signature matches.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PNG_MAGIC: &[u8] = b"\x89PNG\r\n\x1a\n";
    const JPEG_MAGIC: &[u8] = &[0xff, 0xd8, 0xff];
    const TIFF_LITTLE_ENDIAN_MAGIC: &[u8] = b"II\x2a\x00";
    const TIFF_BIG_ENDIAN_MAGIC: &[u8] = b"MM\x00\x2a";

    if bytes.starts_with(PNG_MAGIC) {
        return Some("image/png");
    }
    if bytes.starts_with(JPEG_MAGIC) {
        return Some("image/jpeg");
    }
    if bytes.starts_with(TIFF_LITTLE_ENDIAN_MAGIC) || bytes.starts_with(TIFF_BIG_ENDIAN_MAGIC) {
        return Some("image/tiff");
    }
    // WebP is a RIFF container whose form type sits at bytes 8..12.
    if bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]) {
        return Some("image/webp");
    }
    None
}
1064
/// Sniffs the leading bytes for a PNG, JPEG, TIFF or WebP signature and
/// returns a descriptive file-type label, or `None` when no signature
/// matches.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    let is_png = bytes.starts_with(b"\x89PNG\r\n\x1a\n");
    let is_jpeg = bytes.starts_with(&[0xff, 0xd8, 0xff]);
    let is_tiff = bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a");
    // WebP is a RIFF container whose form type sits at bytes 8..12.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);

    match (is_png, is_jpeg, is_tiff, is_webp) {
        (true, ..) => Some("PNG image data"),
        (_, true, ..) => Some("JPEG image data"),
        (_, _, true, _) => Some("TIFF image data"),
        (_, _, _, true) => Some("WebP image data"),
        _ => None,
    }
}
1078
/// Reports whether `bytes` begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    const PDF_MAGIC: &[u8] = b"%PDF-";
    bytes.get(..PDF_MAGIC.len()) == Some(PDF_MAGIC)
}
1082
/// Reports whether the content should be treated as RTF: either `ext` is
/// exactly `"rtf"` or `bytes` begins with `{\rtf`.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if matches!(ext, Some("rtf")) {
        return true;
    }
    bytes.starts_with(b"{\\rtf")
}
1086
/// Extracts a plain-text approximation of an RTF document.
///
/// The input is decoded lossily as UTF-8 and scanned once:
///
/// * Group braces are dropped; escaped `\\`, `\{` and `\}` are emitted
///   literally.
/// * `\'hh` hex escapes are emitted as the character with that code point.
/// * A handful of control words are translated (`\par`/`\line` to a newline,
///   `\tab`, dashes, bullet and quotes); all other control words are dropped
///   together with their numeric parameter and one delimiting space.
/// * `\uN` emits the Unicode scalar `N` (negative values are offset by
///   65536) and then skips the fallback representation that follows it. The
///   number of fallback characters is taken from the most recent `\ucN` in
///   the enclosing group (default 1, inherited by nested groups), and a
///   `\'hh` escape counts as one fallback character. Other control words and
///   group delimiters are never skipped.
///
/// Finally `\r` and form feeds become newlines and trailing whitespace is
/// trimmed from every line.
///
/// Text inside destination groups (font tables and the like) is not
/// filtered out; this is a best-effort extractor, not a full RTF reader.
fn extract_rtf_text(bytes: &[u8]) -> String {
    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;
    // `\ucN` is a group-scoped property: one entry per open group, innermost
    // last. The RTF default is a single fallback character.
    let mut fallback_counts: Vec<usize> = vec![1];

    while index < chars.len() {
        match chars[index] {
            '{' => {
                let inherited = fallback_counts.last().copied().unwrap_or(1);
                fallback_counts.push(inherited);
                index += 1;
            }
            '}' => {
                // Keep the document-level entry even if braces are unbalanced.
                if fallback_counts.len() > 1 {
                    fallback_counts.pop();
                }
                index += 1;
            }
            '\\' => {
                index += 1;
                if index >= chars.len() {
                    break;
                }

                match chars[index] {
                    '\\' | '{' | '}' => {
                        output.push(chars[index]);
                        index += 1;
                    }
                    '\'' => {
                        if index + 2 < chars.len() {
                            let hex: String = chars[index + 1..=index + 2].iter().collect();
                            if let Ok(value) = u8::from_str_radix(&hex, 16) {
                                output.push(value as char);
                                index += 3;
                                continue;
                            }
                        }
                        // Not a valid hex escape: drop only the apostrophe.
                        index += 1;
                    }
                    control if control.is_ascii_alphabetic() => {
                        let start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[start..index].iter().collect();

                        let number_start = index;
                        if index < chars.len()
                            && (chars[index] == '-' || chars[index].is_ascii_digit())
                        {
                            index += 1;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space after a control word is its delimiter.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "uc" => {
                                if let Some(current) = fallback_counts.last_mut() {
                                    *current = parameter.parse::<usize>().unwrap_or(1);
                                }
                            }
                            "u" => {
                                if let Ok(codepoint) = parameter.parse::<i32>() {
                                    // RTF stores values above 32767 as negative 16-bit numbers.
                                    let scalar = if codepoint < 0 {
                                        codepoint + 65_536
                                    } else {
                                        codepoint
                                    };
                                    if let Some(ch) =
                                        u32::try_from(scalar).ok().and_then(char::from_u32)
                                    {
                                        output.push(ch);
                                    }
                                }

                                let pending = fallback_counts.last().copied().unwrap_or(1);
                                index = skip_rtf_unicode_fallback(&chars, index, pending);
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}

/// Advances past up to `count` fallback characters that follow a `\uN`
/// control word and returns the new index.
///
/// A `\'hh` escape counts as one fallback character. Skipping stops early at
/// any other backslash, a group delimiter, a line break, or end of input.
fn skip_rtf_unicode_fallback(chars: &[char], mut index: usize, count: usize) -> usize {
    for _ in 0..count {
        match chars.get(index).copied() {
            Some('\\') => {
                let is_hex_escape = chars.get(index + 1).copied() == Some('\'')
                    && chars.get(index + 2).is_some_and(|ch| ch.is_ascii_hexdigit())
                    && chars.get(index + 3).is_some_and(|ch| ch.is_ascii_hexdigit());
                if !is_hex_escape {
                    break;
                }
                index += 4;
            }
            Some('{') | Some('}') | Some('\n') | Some('\r') | None => break,
            Some(_) => index += 1,
        }
    }
    index
}
1193
/// Reports whether `bytes` begins with the two-byte gzip magic `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1197
/// Reports whether `bytes` begins with the bzip2 magic `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1201
/// Reports whether `bytes` begins with the six-byte XZ stream magic
/// (`fd 37 7a 58 5a 00`).
fn looks_like_xz(bytes: &[u8]) -> bool {
    const XZ_MAGIC: &[u8] = b"\xfd7zXZ\x00";
    bytes.get(..XZ_MAGIC.len()) == Some(XZ_MAGIC)
}
1205
1206fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1207    lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1208}
1209
1210fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1211    lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1212}
1213
1214fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1215    lower_extension(path)
1216        .as_deref()
1217        .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1218        && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1219            || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1220}
1221
1222fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1223    if looks_like_deb(bytes, path) {
1224        "debian binary package (format 2.0)".to_string()
1225    } else if looks_like_rpm(bytes, path) {
1226        "RPM package".to_string()
1227    } else if looks_like_squashfs(bytes, path) {
1228        "Squashfs filesystem".to_string()
1229    } else if looks_like_gzip(bytes) {
1230        "gzip compressed data".to_string()
1231    } else if looks_like_bzip2(bytes) {
1232        "bzip2 compressed data".to_string()
1233    } else if looks_like_xz(bytes) {
1234        "XZ compressed data".to_string()
1235    } else if is_zip_archive(bytes) {
1236        "Zip archive data".to_string()
1237    } else if lower_extension(path).as_deref() == Some("gem") {
1238        "POSIX tar archive".to_string()
1239    } else if let Some(file_type) = format_based_file_type(detected_format) {
1240        file_type
1241    } else {
1242        "archive data".to_string()
1243    }
1244}
1245
1246fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1247    let suffix = text_executable_label(bytes);
1248
1249    match programming_language {
1250        Some("Python") => format!("python script, {suffix}"),
1251        Some("Ruby") => format!("ruby script, {suffix}"),
1252        Some("Perl") => format!("perl script, {suffix}"),
1253        Some("PHP") => format!("php script, {suffix}"),
1254        Some("Shell") => format!("shell script, {suffix}"),
1255        Some("Bash") => format!("bash script, {suffix}"),
1256        Some("Zsh") => format!("zsh script, {suffix}"),
1257        Some("Fish") => format!("fish script, {suffix}"),
1258        Some("Ksh") => format!("ksh script, {suffix}"),
1259        Some("JavaScript") => format!("javascript script, {suffix}"),
1260        Some("TypeScript") => format!("typescript script, {suffix}"),
1261        Some("PowerShell") => format!("powershell script, {suffix}"),
1262        Some("Awk") => format!("awk script, {suffix}"),
1263        _ => format!("script, {suffix}"),
1264    }
1265}
1266
1267fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1268    let suffix = text_label(bytes);
1269    match programming_language {
1270        Some("C") => format!("C source, {suffix}"),
1271        Some("C++") => format!("C++ source, {suffix}"),
1272        Some("Java") => format!("Java source, {suffix}"),
1273        Some("C#") => format!("C# source, {suffix}"),
1274        Some("F#") => format!("F# source, {suffix}"),
1275        Some("Go") => format!("Go source, {suffix}"),
1276        Some("Rust") => format!("Rust source, {suffix}"),
1277        Some("Starlark") => format!("Starlark source, {suffix}"),
1278        Some("CMake") => format!("CMake source, {suffix}"),
1279        Some("Meson") => format!("Meson source, {suffix}"),
1280        Some("Nix") => format!("Nix source, {suffix}"),
1281        Some("Groovy") => format!("Groovy source, {suffix}"),
1282        Some("Makefile") => format!("Makefile source, {suffix}"),
1283        Some("Dockerfile") => format!("Dockerfile source, {suffix}"),
1284        Some("Jamfile") => format!("Jamfile source, {suffix}"),
1285        Some("Batchfile") => format!("Batchfile source, {suffix}"),
1286        Some(language) => format!("{language} source, {suffix}"),
1287        None => text_file_type(bytes),
1288    }
1289}
1290
1291fn text_file_type(bytes: &[u8]) -> String {
1292    text_label(bytes).to_string()
1293}
1294
/// Describes text content by two properties: whether the bytes are valid
/// UTF-8 and whether they contain at least one `\n`.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1308
/// Describes executable text content by two properties: whether the bytes
/// are valid UTF-8 and whether they contain at least one `\n`.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text executable",
        (true, false) => "UTF-8 Unicode text executable, with no line terminators",
        (false, true) => "text executable",
        (false, false) => "text executable, with no line terminators",
    }
}
1322
1323fn supported_image_metadata_format(
1324    ext: Option<&str>,
1325    detected_format: FileFormat,
1326) -> Option<ImageFormat> {
1327    match ext {
1328        Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1329        Some("png") => Some(ImageFormat::Png),
1330        Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1331        Some("webp") => Some(ImageFormat::WebP),
1332        _ => match detected_format.media_type() {
1333            "image/jpeg" => Some(ImageFormat::Jpeg),
1334            "image/png" => Some(ImageFormat::Png),
1335            "image/tiff" => Some(ImageFormat::Tiff),
1336            "image/webp" => Some(ImageFormat::WebP),
1337            _ => None,
1338        },
1339    }
1340}
1341
1342fn should_skip_binary_string_extraction(
1343    path: &Path,
1344    bytes: &[u8],
1345    detected_format: FileFormat,
1346) -> bool {
1347    matches!(lower_extension(path).as_deref(), Some("pdf"))
1348        || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1349            .is_some()
1350        || (matches!(
1351            detected_format.kind(),
1352            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1353        ) && !is_textual_format(detected_format))
1354        || media_mime_from_content(bytes).is_some()
1355        || is_zip_archive(bytes)
1356        || looks_like_gzip(bytes)
1357        || looks_like_bzip2(bytes)
1358        || looks_like_xz(bytes)
1359        || looks_like_deb(bytes, path)
1360        || looks_like_rpm(bytes, path)
1361        || looks_like_squashfs(bytes, path)
1362}
1363
1364fn should_skip_large_opaque_binary_text_extraction(
1365    _path: &Path,
1366    bytes: &[u8],
1367    detected_format: FileFormat,
1368) -> bool {
1369    is_large_opaque_binary_candidate(bytes, detected_format)
1370        && !sample_has_promising_printable_strings(bytes)
1371}
1372
1373fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1374    bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1375        && !is_textual_format(detected_format)
1376        && !matches!(
1377            detected_format.kind(),
1378            FileFormatKind::Archive
1379                | FileFormatKind::Compressed
1380                | FileFormatKind::Package
1381                | FileFormatKind::Audio
1382                | FileFormatKind::Image
1383                | FileFormatKind::Video
1384        )
1385}
1386
/// Computes the `(start, end)` byte ranges to sample from a buffer of `len`
/// bytes: a head window, a middle window (only when `len` exceeds two
/// windows) and a tail window (only when `len` exceeds one window).
///
/// Each window is at most 64 KiB. Empty ranges and exact duplicates are
/// omitted; windows may still overlap for mid-sized inputs.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = (0, len.min(SAMPLE_WINDOW_BYTES));
    let middle = (len > SAMPLE_WINDOW_BYTES * 2).then(|| {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        (start, (start + SAMPLE_WINDOW_BYTES).min(len))
    });
    let tail = (len > SAMPLE_WINDOW_BYTES).then(|| (len - SAMPLE_WINDOW_BYTES, len));

    let mut ranges: Vec<(usize, usize)> = Vec::new();
    for candidate in std::iter::once(head).chain(middle).chain(tail) {
        let (start, end) = candidate;
        if start < end && !ranges.contains(&candidate) {
            ranges.push(candidate);
        }
    }
    ranges
}
1409
1410fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1411    let mut structured_signal_seen = false;
1412    let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1413        .into_iter()
1414        .filter(|&(start, end)| {
1415            let window = &bytes[start..end];
1416            if has_strong_structured_text_signal(window) {
1417                structured_signal_seen = true;
1418            }
1419            has_license_or_notice_signal(window)
1420        })
1421        .count();
1422
1423    structured_signal_seen || promising_license_windows >= 2
1424}
1425
1426fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1427    let mut combined_lines = BTreeSet::new();
1428
1429    for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1430        let window_text = extract_printable_strings(&bytes[start..end]);
1431        for line in window_text
1432            .lines()
1433            .map(str::trim)
1434            .filter(|line| !line.is_empty())
1435        {
1436            combined_lines.insert(line.to_string());
1437        }
1438    }
1439
1440    combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1441}
1442
1443fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1444    let strings = extract_printable_strings(bytes);
1445    if strings.is_empty() {
1446        return false;
1447    }
1448
1449    let lower = strings.to_ascii_lowercase();
1450    [
1451        "copyright",
1452        "license",
1453        "licensed under",
1454        "all rights reserved",
1455        "permission is hereby granted",
1456        "redistribution and use",
1457        "spdx-license-identifier",
1458    ]
1459    .iter()
1460    .any(|marker| lower.contains(marker))
1461}
1462
1463fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1464    let strings = extract_printable_strings(bytes);
1465    if strings.is_empty() {
1466        return false;
1467    }
1468
1469    let email_markers = strings.matches('@').count();
1470    let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1471
1472    email_markers + url_markers >= 3
1473}
1474
1475fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1476    match format {
1477        ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1478        ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1479        ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1480        ImageFormat::WebP => {
1481            bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1482        }
1483        _ => false,
1484    }
1485}
1486
1487fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1488    let mut values = Vec::new();
1489    values.extend(extract_exif_metadata_values(bytes));
1490    values.extend(extract_xmp_metadata_values(bytes, format));
1491    values_to_text(values)
1492}
1493
1494fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1495    let mut cursor = BufReader::new(Cursor::new(bytes));
1496    let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1497        Ok(exif) => exif,
1498        Err(_) => return Vec::new(),
1499    };
1500
1501    let mut values = Vec::new();
1502    for field in exif.fields() {
1503        let rendered = match field.tag {
1504            exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
1505                Some(field.display_value().with_unit(&exif).to_string())
1506            }
1507            exif::Tag::Artist => Some(format!(
1508                "Author: {}",
1509                field.display_value().with_unit(&exif)
1510            )),
1511            _ => None,
1512        };
1513
1514        if let Some(rendered) = rendered {
1515            values.push(rendered);
1516        }
1517    }
1518
1519    values
1520}
1521
1522fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1523    let xmp = match extract_raw_xmp_packet(bytes, format) {
1524        Some(xmp) => xmp,
1525        None => return Vec::new(),
1526    };
1527
1528    parse_xmp_values(&xmp)
1529}
1530
1531fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1532    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1533    if let Ok(mut decoder) = reader.into_decoder()
1534        && let Ok(Some(xmp)) = decoder.xmp_metadata()
1535    {
1536        return Some(xmp);
1537    }
1538
1539    match format {
1540        ImageFormat::Png => extract_png_xmp_packet(bytes),
1541        _ => None,
1542    }
1543}
1544
1545fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1546    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1547
1548    if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1549        return None;
1550    }
1551
1552    let mut offset = PNG_SIGNATURE.len();
1553    while offset + 12 <= bytes.len() {
1554        let length = u32::from_be_bytes([
1555            bytes[offset],
1556            bytes[offset + 1],
1557            bytes[offset + 2],
1558            bytes[offset + 3],
1559        ]) as usize;
1560        let chunk_start = offset + 8;
1561        let chunk_end = chunk_start + length;
1562        if chunk_end + 4 > bytes.len() {
1563            return None;
1564        }
1565
1566        let chunk_type = &bytes[offset + 4..offset + 8];
1567        if chunk_type == b"iTXt" {
1568            let data = &bytes[chunk_start..chunk_end];
1569            if let Some(xmp) = parse_png_itxt_xmp(data) {
1570                return Some(xmp);
1571            }
1572        }
1573
1574        offset = chunk_end + 4;
1575    }
1576
1577    None
1578}
1579
1580fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1581    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1582
1583    let keyword_end = data.iter().position(|&b| b == 0)?;
1584    if &data[..keyword_end] != XMP_KEYWORD {
1585        return None;
1586    }
1587
1588    let mut cursor = keyword_end + 1;
1589    let compression_flag = *data.get(cursor)?;
1590    cursor += 1;
1591    let compression_method = *data.get(cursor)?;
1592    cursor += 1;
1593    if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1594        return None;
1595    }
1596
1597    let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1598    cursor = language_end + 1;
1599
1600    let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1601    cursor = translated_end + 1;
1602
1603    let text_bytes = &data[cursor..];
1604    if compression_flag == 1 {
1605        let mut decoder = ZlibDecoder::new(text_bytes);
1606        let mut decoded = Vec::new();
1607        decoder.read_to_end(&mut decoded).ok()?;
1608        Some(decoded)
1609    } else {
1610        Some(text_bytes.to_vec())
1611    }
1612}
1613
1614fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1615    let mut reader = XmlReader::from_reader(xmp);
1616    reader.config_mut().trim_text(true);
1617
1618    let mut buf = Vec::new();
1619    let mut stack: Vec<String> = Vec::new();
1620    let mut values = Vec::new();
1621
1622    loop {
1623        match reader.read_event_into(&mut buf) {
1624            Ok(Event::Start(e)) => {
1625                stack.push(local_xml_name(e.name().as_ref()));
1626            }
1627            Ok(Event::End(_)) => {
1628                stack.pop();
1629            }
1630            Ok(Event::Empty(_)) => {}
1631            Ok(Event::Text(text)) => {
1632                if let Some(field) = stack
1633                    .iter()
1634                    .rev()
1635                    .find_map(|name| allowed_xmp_field(name.as_str()))
1636                    && let Ok(decoded) = text.decode()
1637                {
1638                    let decoded = decoded.into_owned();
1639                    if !decoded.trim().is_empty() {
1640                        values.push(format_xmp_value(field, &decoded));
1641                    }
1642                }
1643            }
1644            Ok(Event::CData(text)) => {
1645                if let Some(field) = stack
1646                    .iter()
1647                    .rev()
1648                    .find_map(|name| allowed_xmp_field(name.as_str()))
1649                    && let Ok(decoded) = text.decode()
1650                {
1651                    let decoded = decoded.into_owned();
1652                    if !decoded.trim().is_empty() {
1653                        values.push(format_xmp_value(field, &decoded));
1654                    }
1655                }
1656            }
1657            Ok(Event::Eof) | Err(_) => break,
1658            _ => {}
1659        }
1660        buf.clear();
1661    }
1662
1663    values
1664}
1665
/// Returns the part of a qualified XML name after its last `:`, or the whole
/// name when it has no prefix. Names that are not valid UTF-8 yield an empty
/// string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    match qualified.rfind(':') {
        Some(colon) => qualified[colon + 1..].to_string(),
        None => qualified.to_string(),
    }
}
1670
/// Maps an XMP element's local name to the internal field key, or `None`
/// when the element is not on the allow-list. Matching is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    const ALLOWED_FIELDS: &[(&str, &str)] = &[
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    ALLOWED_FIELDS
        .iter()
        .find(|(element, _)| *element == name)
        .map(|&(_, field)| field)
}
1683
/// Renders an XMP value for output: `creator` values are prefixed with
/// `"Author: "`, every other field is returned unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
1690
1691fn values_to_text(values: Vec<String>) -> String {
1692    let mut seen = BTreeSet::new();
1693    let mut lines = Vec::new();
1694    let mut total_bytes = 0usize;
1695
1696    for value in values {
1697        if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1698            break;
1699        }
1700
1701        let normalized = normalize_metadata_value(&value);
1702        if normalized.is_empty() || !seen.insert(normalized.clone()) {
1703            continue;
1704        }
1705
1706        let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1707        if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1708            break;
1709        }
1710
1711        total_bytes += added_bytes;
1712        lines.push(normalized);
1713    }
1714
1715    lines.join("\n")
1716}
1717
/// Normalises a metadata value: NUL characters are removed, then every run
/// of whitespace collapses to a single space and leading/trailing whitespace
/// disappears.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls: String = value.chars().filter(|ch| *ch != '\0').collect();

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
1729
1730fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1731    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1732        return (String::new(), None);
1733    }
1734
1735    let mut failures = Vec::new();
1736    let mut saw_success = false;
1737
1738    let extracted = catch_unwind(AssertUnwindSafe(
1739        || -> Result<String, Box<dyn std::error::Error>> {
1740            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1741            extract_first_pdf_page_text(&mut document)
1742        },
1743    ));
1744    match extracted {
1745        Ok(Ok(text)) => {
1746            saw_success = true;
1747            if let Some(normalized) = normalize_pdf_text(text) {
1748                return (normalized, None);
1749            }
1750        }
1751        Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1752        Err(payload) => failures.push(format!(
1753            "from-bytes first-page panic: {}",
1754            panic_payload_to_string(payload.as_ref())
1755        )),
1756    }
1757
1758    let extracted = catch_unwind(AssertUnwindSafe(
1759        || -> Result<String, Box<dyn std::error::Error>> {
1760            let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1761            extract_pdf_text_from_document(&mut document)
1762        },
1763    ));
1764    match extracted {
1765        Ok(Ok(text)) => {
1766            saw_success = true;
1767            if let Some(normalized) = normalize_pdf_text(text) {
1768                return (normalized, None);
1769            }
1770        }
1771        Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
1772        Err(payload) => failures.push(format!(
1773            "open full-document panic: {}",
1774            panic_payload_to_string(payload.as_ref())
1775        )),
1776    }
1777
1778    let extracted = catch_unwind(AssertUnwindSafe(
1779        || -> Result<String, Box<dyn std::error::Error>> {
1780            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1781            extract_pdf_text_from_document(&mut document)
1782        },
1783    ));
1784    match extracted {
1785        Ok(Ok(text)) => {
1786            saw_success = true;
1787            if let Some(normalized) = normalize_pdf_text(text) {
1788                return (normalized, None);
1789            }
1790        }
1791        Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
1792        Err(payload) => failures.push(format!(
1793            "from-bytes full-document panic: {}",
1794            panic_payload_to_string(payload.as_ref())
1795        )),
1796    }
1797
1798    if saw_success || is_non_actionable_pdf_failure(&failures) {
1799        (String::new(), None)
1800    } else {
1801        (
1802            String::new(),
1803            Some(format!(
1804                "PDF text extraction failed after {} attempts: {}",
1805                failures.len(),
1806                failures.join("; ")
1807            )),
1808        )
1809    }
1810}
1811
/// Reports whether every recorded failure message contains one of the known
/// non-actionable phrases (password/encryption problems or an invalid
/// cross-reference table). An empty list is never non-actionable.
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(*marker))
    })
}
1822
/// Renders a panic payload as text: `&str` and `String` payloads are
/// returned as-is, anything else becomes `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
1832
1833fn extract_first_pdf_page_text(
1834    document: &mut pdf_oxide::document::PdfDocument,
1835) -> Result<String, Box<dyn std::error::Error>> {
1836    if document.page_count()? == 0 {
1837        return Ok(String::new());
1838    }
1839
1840    let extracted_text = document.extract_text(0)?;
1841    let markdown_text =
1842        document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1843    if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1844        return Ok(extracted_text);
1845    }
1846
1847    let pipeline_text =
1848        document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1849
1850    Ok(merge_pdf_first_page_text(
1851        &extracted_text,
1852        &markdown_text,
1853        &pipeline_text,
1854    ))
1855}
1856
1857fn extract_pdf_text_from_document(
1858    document: &mut pdf_oxide::document::PdfDocument,
1859) -> Result<String, Box<dyn std::error::Error>> {
1860    Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1861}
1862
/// Replaces carriage returns and form feeds in `text` with newlines.
///
/// Returns `None` when the result is empty or whitespace-only, otherwise
/// `Some` with the converted text (not trimmed).
fn normalize_pdf_text(text: String) -> Option<String> {
    let converted: String = text
        .chars()
        .map(|ch| if matches!(ch, '\r' | '\u{0c}') { '\n' } else { ch })
        .collect();

    if converted.trim().is_empty() {
        None
    } else {
        Some(converted)
    }
}
1867
1868fn merge_pdf_first_page_text(
1869    _extracted_text: &str,
1870    markdown_text: &str,
1871    pipeline_text: &str,
1872) -> String {
1873    let pipeline = pipeline_text.trim();
1874    if pipeline.is_empty() {
1875        return String::new();
1876    }
1877
1878    let prefix = pdf_first_page_heading_prefix(markdown_text);
1879    let Some(prefix) = prefix else {
1880        return pipeline_text.to_string();
1881    };
1882
1883    if pdf_text_contains_heading_prefix(pipeline, &prefix) {
1884        pipeline_text.to_string()
1885    } else {
1886        format!("{prefix}\n\n{pipeline}")
1887    }
1888}
1889
1890fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
1891    normalize_pdf_heading_comparison_text(text)
1892        .contains(&normalize_pdf_heading_comparison_text(prefix))
1893}
1894
/// Produces a comparison key for heading text.
///
/// Splits `text` on whitespace, ASCII-lowercases each word and joins the words
/// with single spaces, so leading/trailing whitespace and line breaks vanish.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut key = String::with_capacity(text.len());

    for word in text.split_whitespace() {
        // `split_whitespace` never yields an empty word, so a non-empty key
        // means at least one word has already been written.
        if !key.is_empty() {
            key.push(' ');
        }
        key.push_str(&word.to_ascii_lowercase());
    }

    key
}
1901
1902fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1903    let mut lines = Vec::new();
1904
1905    for line in pdf_markdown_heading_lines(markdown_text) {
1906        push_unique_line(&mut lines, line);
1907    }
1908
1909    (!lines.is_empty()).then(|| lines.join("\n"))
1910}
1911
1912fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1913    text.lines()
1914        .map(str::trim)
1915        .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1916        .map(|line| line.trim_matches('#').trim())
1917        .filter(|line| !line.is_empty())
1918        .filter(|line| !looks_like_numbered_section_heading(line))
1919        .take(4)
1920        .map(ToOwned::to_owned)
1921        .collect()
1922}
1923
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
1929
/// Reports whether `line` starts like a numbered section heading.
///
/// A line qualifies when it begins with one or more ASCII digits immediately
/// followed by a `.`, e.g. `"1. Scope"`, `"10. Termination"` or `"12.3 Use"`.
/// Lines that are empty, do not start with a digit, or whose leading digits are
/// not followed by `.` (such as `"2026 Report"`) do not qualify.
///
/// Section numbers of any length are accepted, so headings numbered 10 and
/// above are recognised as well as single-digit ones.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // A shorter remainder means at least one leading digit was consumed.
    let has_leading_digits = after_digits.len() < line.len();
    has_leading_digits && after_digits.starts_with('.')
}
1942
/// Reports whether `bytes` begins with one of three four-byte signatures:
/// `PK\x03\x04`, `PK\x05\x06` or `PK\x07\x08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
1948
/// Extracts runs of printable ASCII characters from arbitrary bytes.
///
/// Three scans are performed and their results are appended in order, one run
/// per line (separated by `\n`):
///
/// 1. plain single-byte runs of printable ASCII (`0x20..=0x7E`);
/// 2. two-byte pairs starting at offset 0 where the first byte is printable
///    ASCII and the second is zero (little-endian wide characters);
/// 3. the same pair scan starting at offset 1, so that wide strings beginning
///    at an odd offset are found too.
///
/// Runs shorter than four characters are discarded.
///
/// The output size is bounded by `bytes.len()` clamped to the range
/// 2,000,000..=16,000,000. The bound is checked after each run boundary, so
/// the result can exceed it by at most the run that crossed the limit.
///
/// Both wide scans use the same (character, zero) byte order. Scanning the odd
/// offset with the order swapped would re-read even-aligned strings one
/// character late (emitting a truncated duplicate) and would miss odd-aligned
/// strings entirely.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    /// Appends `run` to `out` as a new line when it is long enough, then resets it.
    fn flush_run(out: &mut String, run: &mut Vec<u8>) {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            out.push_str(&String::from_utf8_lossy(run));
        }
        run.clear();
    }

    let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    // Pass 1: single-byte printable runs.
    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= max_output_bytes {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= max_output_bytes {
        return out;
    }

    // Passes 2 and 3: (printable, 0x00) byte pairs at even and odd alignment.
    for offset in 0..=1 {
        run.clear();
        // `get` avoids a panic when `bytes` is shorter than `offset`.
        let aligned = bytes.get(offset..).unwrap_or_default();
        for pair in aligned.chunks_exact(2) {
            let (ch, high) = (pair[0], pair[1]);
            if is_printable_ascii(ch) && high == 0 {
                run.push(ch);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= max_output_bytes {
                    return out;
                }
            }
        }
        flush_run(&mut out, &mut run);
        if out.len() >= max_output_bytes {
            return out;
        }
    }

    out
}
2013
2014#[cfg(test)]
2015mod tests {
2016    use std::path::Path;
2017
2018    use super::{
2019        ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, classify_file_info,
2020        extract_printable_strings, extract_text_for_detection,
2021        extract_text_for_detection_with_diagnostics, is_non_actionable_pdf_failure,
2022        normalize_mime_type, normalize_pdf_heading_comparison_text,
2023        windows_metadata_or_empty_result,
2024    };
2025
2026    #[test]
2027    fn test_extract_text_for_detection_skips_jar_archives() {
2028        let path = Path::new(
2029            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
2030        );
2031        let bytes = std::fs::read(path).expect("failed to read jar fixture");
2032
2033        let (text, kind) = extract_text_for_detection(path, &bytes);
2034
2035        assert!(text.is_empty());
2036        assert_eq!(kind, ExtractedTextKind::None);
2037    }
2038
2039    #[test]
2040    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
2041        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2042        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2043
2044        let (text, kind) = extract_text_for_detection(path, &bytes);
2045
2046        assert_eq!(kind, ExtractedTextKind::Pdf);
2047        assert!(text.contains("Redistribution and use in source and binary forms"));
2048    }
2049
2050    #[test]
2051    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
2052        let path =
2053            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2054        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2055
2056        let (text, kind) = extract_text_for_detection(path, &bytes);
2057
2058        assert_eq!(kind, ExtractedTextKind::Pdf);
2059        assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
2060        assert!(!text.contains("DISCLAIMER OF WARRANTY"));
2061    }
2062
2063    #[test]
2064    fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
2065        let path =
2066            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2067        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2068
2069        let (text, kind) = extract_text_for_detection(path, &bytes);
2070
2071        assert_eq!(kind, ExtractedTextKind::Pdf);
2072
2073        let normalized = normalize_pdf_heading_comparison_text(&text);
2074        let heading =
2075            normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
2076        assert_eq!(normalized.matches(&heading).count(), 1);
2077    }
2078
2079    #[test]
2080    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
2081        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2082        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2083
2084        let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
2085
2086        assert_eq!(kind, ExtractedTextKind::Pdf);
2087        assert!(text.contains("Redistribution and use in source and binary forms"));
2088    }
2089
2090    #[test]
2091    fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
2092        let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";
2093
2094        let (text, kind, scan_error) =
2095            extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);
2096
2097        assert!(text.is_empty());
2098        assert_eq!(kind, ExtractedTextKind::None);
2099        let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
2100        assert!(scan_error.contains("PDF text extraction failed after"));
2101    }
2102
2103    #[test]
2104    fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
2105        let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2106
2107        let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
2108
2109        assert!(text.is_empty());
2110        assert_eq!(kind, ExtractedTextKind::None);
2111    }
2112
2113    #[test]
2114    fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
2115        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2116        let text = b"Copyright 2026 Example Project!!!";
2117        bytes[..text.len()].copy_from_slice(text);
2118        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2119        bytes[second_offset..second_offset + text.len()].copy_from_slice(text);
2120
2121        let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);
2122
2123        assert_ne!(kind, ExtractedTextKind::None);
2124        assert!(text.contains("Copyright 2026 Example Project"));
2125    }
2126
2127    #[test]
2128    fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
2129        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2130        let noise = b"(c) $1234567890ABCDEF[]{}--==++";
2131        bytes[..noise.len()].copy_from_slice(noise);
2132        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2133        bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);
2134
2135        let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);
2136
2137        assert!(text.is_empty());
2138        assert_eq!(kind, ExtractedTextKind::None);
2139    }
2140
2141    #[test]
2142    fn test_extract_text_for_detection_uses_windows_executable_metadata() {
2143        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2144        let bytes = std::fs::read(path).expect("read PE fixture");
2145
2146        let (text, kind) = extract_text_for_detection(path, &bytes);
2147
2148        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2149        assert!(text.contains("License: This program is free software"));
2150        assert!(text.contains("LegalCopyright:"));
2151    }
2152
2153    #[test]
2154    fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
2155    {
2156        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2157        let mut bytes = std::fs::read(path).expect("read PE fixture");
2158        bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);
2159
2160        let (text, kind) = extract_text_for_detection(path, &bytes);
2161
2162        assert_ne!(kind, ExtractedTextKind::None);
2163        assert!(!text.trim().is_empty());
2164    }
2165
2166    #[test]
2167    fn test_windows_metadata_or_empty_result_preserves_metadata() {
2168        let (text, kind, scan_error) =
2169            windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));
2170
2171        assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
2172        assert_eq!(text, "LegalCopyright: Example Corp");
2173        assert!(scan_error.is_none());
2174    }
2175
2176    #[test]
2177    fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
2178        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2179        let text = b"Copyright 2026 Example Project!!!";
2180        bytes[..text.len()].copy_from_slice(text);
2181
2182        let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);
2183
2184        assert!(text.is_empty());
2185        assert_eq!(kind, ExtractedTextKind::None);
2186    }
2187
2188    #[test]
2189    fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
2190        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2191        let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
2192        bytes[..text.len()].copy_from_slice(text);
2193
2194        let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);
2195
2196        assert_ne!(kind, ExtractedTextKind::None);
2197        assert!(text.contains("asn@redhat.com"));
2198        assert!(text.contains("https://publicsuffix.org/"));
2199    }
2200
2201    #[test]
2202    fn test_non_actionable_pdf_failures_are_suppressed() {
2203        assert!(is_non_actionable_pdf_failure(&[
2204            "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
2205            "open full-document: PDF is encrypted and requires a password".to_string(),
2206        ]));
2207        assert!(is_non_actionable_pdf_failure(&[
2208            "from-bytes first-page: Invalid cross-reference table".to_string(),
2209            "open full-document: Invalid cross-reference table".to_string(),
2210        ]));
2211        assert!(is_non_actionable_pdf_failure(&[
2212            "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O".to_string(),
2213            "open full-document: Invalid PDF: security handler cannot be found".to_string(),
2214        ]));
2215        assert!(!is_non_actionable_pdf_failure(&[
2216            "from-bytes first-page: some other parser failure".to_string(),
2217        ]));
2218    }
2219
2220    #[test]
2221    fn test_extract_text_for_detection_skips_zip_like_archives() {
2222        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";
2223
2224        let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
2225        let (crate_text, crate_kind) =
2226            extract_text_for_detection(Path::new("demo.crate"), zip_bytes);
2227
2228        assert!(whl_text.is_empty());
2229        assert_eq!(whl_kind, ExtractedTextKind::None);
2230        assert!(crate_text.is_empty());
2231        assert_eq!(crate_kind, ExtractedTextKind::None);
2232    }
2233
2234    #[test]
2235    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
2236        let path =
2237            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
2238        let bytes = std::fs::read(path).expect("failed to read lib fixture");
2239
2240        let (text, kind) = extract_text_for_detection(path, &bytes);
2241
2242        assert_ne!(kind, ExtractedTextKind::None);
2243        assert!(text.contains("Copyright nexB and others (c) 2012"));
2244    }
2245
2246    #[test]
2247    fn test_extract_text_for_detection_reads_font_metadata() {
2248        let path = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
2249        let bytes = std::fs::read(path).expect("failed to read font fixture");
2250
2251        let (text, kind) = extract_text_for_detection(path, &bytes);
2252
2253        assert_eq!(kind, ExtractedTextKind::FontMetadata);
2254        assert!(text.contains("License Description:"), "{text}");
2255        assert!(
2256            text.contains("Open Font License") || text.contains("OFL"),
2257            "{text}"
2258        );
2259        assert!(text.contains("Lato"), "{text}");
2260    }
2261
2262    #[test]
2263    fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
2264        let bytes = b"abcd\0".repeat(525_000);
2265
2266        let text = extract_printable_strings(&bytes);
2267
2268        assert!(
2269            text.len() > 2_000_000,
2270            "unexpected truncation at {}",
2271            text.len()
2272        );
2273        assert!(text.ends_with("abcd"));
2274    }
2275
2276    #[test]
2277    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
2278        let path = Path::new(
2279            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
2280        );
2281        let bytes = std::fs::read(path).expect("failed to read svg fixture");
2282
2283        let (text, kind) = extract_text_for_detection(path, &bytes);
2284
2285        assert_eq!(kind, ExtractedTextKind::Decoded);
2286        assert!(text.contains("creativecommons.org/licenses/publicdomain"));
2287    }
2288
2289    #[test]
2290    fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
2291        let path = Path::new(
2292            "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
2293        );
2294        let bytes = std::fs::read(path).expect("failed to read rtf fixture");
2295
2296        let (text, kind) = extract_text_for_detection(path, &bytes);
2297
2298        assert_eq!(kind, ExtractedTextKind::Decoded);
2299        assert!(text.contains("GNU Lesser General Public"));
2300        assert!(text.contains("version"));
2301        assert!(text.contains("2.1 of the License"));
2302    }
2303
2304    #[test]
2305    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
2306        assert_eq!(
2307            normalize_mime_type(
2308                Path::new("main.ts"),
2309                b"export const answer = 42;\n",
2310                Some("TypeScript"),
2311                "video/mp2t",
2312            ),
2313            "text/plain"
2314        );
2315    }
2316
2317    #[test]
2318    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
2319        assert_eq!(
2320            normalize_mime_type(
2321                Path::new("main.js"),
2322                b"console.log('hello');\n",
2323                Some("JavaScript"),
2324                "application/octet-stream",
2325            ),
2326            "text/plain"
2327        );
2328    }
2329
2330    #[test]
2331    fn test_normalize_mime_type_preserves_binary_video_guess() {
2332        assert_eq!(
2333            normalize_mime_type(
2334                Path::new("main.ts"),
2335                &[0, 159, 146, 150, 0, 1, 2, 3],
2336                Some("TypeScript"),
2337                "video/mp2t",
2338            ),
2339            "video/mp2t"
2340        );
2341    }
2342
2343    #[test]
2344    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
2345        assert_eq!(
2346            normalize_mime_type(
2347                Path::new("main.ts"),
2348                &[0, 159, 146, 150],
2349                Some("TypeScript"),
2350                "application/octet-stream",
2351            ),
2352            "application/octet-stream"
2353        );
2354    }
2355
2356    #[test]
2357    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
2358        let classification = classify_file_info(Path::new("test.txt"), b"");
2359
2360        assert_eq!(classification.mime_type, "inode/x-empty");
2361        assert_eq!(classification.file_type, "empty");
2362        assert!(!classification.is_binary);
2363        assert!(classification.is_text);
2364        assert!(!classification.is_source);
2365        assert_eq!(classification.programming_language, None);
2366    }
2367
2368    #[test]
2369    fn test_classify_file_info_keeps_json_out_of_programming_language() {
2370        let classification = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);
2371
2372        assert_eq!(classification.mime_type, "application/json");
2373        assert_eq!(classification.file_type, "JSON text data");
2374        assert!(classification.is_text);
2375        assert!(!classification.is_source);
2376        assert_eq!(classification.programming_language, None);
2377    }
2378
2379    #[test]
2380    fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
2381        let classification =
2382            classify_file_info(Path::new("broken.json"), b"{ definitely not json\n");
2383
2384        assert_eq!(classification.mime_type, "text/plain");
2385        assert_eq!(classification.file_type, "UTF-8 Unicode text");
2386        assert!(classification.is_text);
2387        assert!(!classification.is_binary);
2388    }
2389
2390    #[test]
2391    fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
2392        let classification =
2393            classify_file_info(Path::new("broken.json"), &[0xff, 0x00, 0x01, 0x02]);
2394
2395        assert_eq!(classification.mime_type, "application/octet-stream");
2396        assert_eq!(classification.file_type, "data");
2397        assert!(classification.is_binary);
2398        assert!(!classification.is_text);
2399    }
2400
2401    #[test]
2402    fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
2403        let classification = classify_file_info(
2404            Path::new("utf16.json"),
2405            &[
2406                0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
2407            ],
2408        );
2409
2410        assert!(!classification.is_binary);
2411        assert!(classification.is_text);
2412        assert_eq!(classification.mime_type, "application/json");
2413        assert_eq!(classification.file_type, "JSON text data");
2414    }
2415
2416    #[test]
2417    fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
2418        let classification = classify_file_info(Path::new("true.json"), b"true");
2419
2420        assert!(!classification.is_binary);
2421        assert!(classification.is_text);
2422        assert_eq!(classification.mime_type, "application/json");
2423        assert_eq!(classification.file_type, "JSON text data");
2424    }
2425
2426    #[test]
2427    fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
2428        let classification = classify_file_info(
2429            Path::new("wrapped.json"),
2430            &[0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D],
2431        );
2432
2433        assert!(!classification.is_binary);
2434        assert!(classification.is_text);
2435        assert_eq!(classification.mime_type, "text/plain");
2436        assert_eq!(classification.file_type, "text, with no line terminators");
2437    }
2438
2439    #[test]
2440    fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
2441        let classification =
2442            classify_file_info(Path::new("lone-ff.json"), &[0x5B, 0x22, 0xFF, 0x22, 0x5D]);
2443
2444        assert!(classification.is_binary);
2445        assert!(!classification.is_text);
2446        assert_eq!(classification.mime_type, "application/octet-stream");
2447        assert_eq!(classification.file_type, "data");
2448    }
2449
2450    #[test]
2451    fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
2452        let classification = classify_file_info(
2453            Path::new("crash.json"),
2454            &[
2455                0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
2456            ],
2457        );
2458
2459        assert!(classification.is_binary);
2460        assert!(!classification.is_text);
2461        assert_eq!(classification.mime_type, "application/octet-stream");
2462    }
2463
2464    #[test]
2465    fn test_classify_file_info_treats_dockerfile_as_source() {
2466        let classification = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");
2467
2468        assert_eq!(
2469            classification.programming_language.as_deref(),
2470            Some("Dockerfile")
2471        );
2472        assert!(classification.is_source);
2473        assert!(!classification.is_script);
2474        assert_eq!(
2475            classification.file_type,
2476            "Dockerfile source, UTF-8 Unicode text"
2477        );
2478    }
2479
2480    #[test]
2481    fn test_classify_file_info_treats_makefile_as_text_not_source() {
2482        let classification = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");
2483
2484        assert_eq!(classification.programming_language, None);
2485        assert!(classification.is_text);
2486        assert!(!classification.is_source);
2487        assert!(!classification.is_script);
2488        assert_eq!(classification.file_type, "UTF-8 Unicode text");
2489    }
2490
2491    #[test]
2492    fn test_classify_file_info_marks_supported_package_archives() {
2493        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";
2494
2495        let egg = classify_file_info(Path::new("demo.egg"), zip_bytes);
2496        let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_bytes);
2497
2498        assert!(egg.is_archive);
2499        assert_eq!(egg.mime_type, "application/zip");
2500        assert_eq!(egg.file_type, "Zip archive data");
2501        assert!(nupkg.is_archive);
2502        assert_eq!(nupkg.mime_type, "application/zip");
2503        assert_eq!(nupkg.file_type, "Zip archive data");
2504    }
2505
2506    #[test]
2507    fn test_classify_file_info_marks_png_as_binary_media() {
2508        let png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";
2509
2510        let classification = classify_file_info(Path::new("logo.png"), png_bytes);
2511
2512        assert_eq!(classification.mime_type, "image/png");
2513        assert_eq!(classification.file_type, "PNG image data");
2514        assert!(classification.is_binary);
2515        assert!(!classification.is_text);
2516        assert!(classification.is_media);
2517        assert!(!classification.is_archive);
2518        assert!(!classification.is_source);
2519    }
2520
2521    #[test]
2522    fn test_classify_file_info_marks_pdf_as_binary_document() {
2523        let pdf_bytes = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";
2524
2525        let classification = classify_file_info(Path::new("report.pdf"), pdf_bytes);
2526
2527        assert_eq!(classification.mime_type, "application/pdf");
2528        assert_eq!(classification.file_type, "PDF document");
2529        assert!(classification.is_binary);
2530        assert!(!classification.is_text);
2531        assert!(!classification.is_archive);
2532        assert!(!classification.is_media);
2533    }
2534
2535    #[test]
2536    fn test_classify_file_info_marks_binary_blobs_as_binary() {
2537        let classification =
2538            classify_file_info(Path::new("blob.bin"), &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5]);
2539
2540        assert!(classification.is_binary);
2541        assert!(!classification.is_text);
2542        assert!(!classification.is_source);
2543        assert_eq!(classification.programming_language, None);
2544    }
2545
2546    #[test]
2547    fn test_classify_file_info_treats_yaml_as_text_not_source() {
2548        let classification = classify_file_info(Path::new("config.yaml"), b"key: value\n");
2549
2550        assert_eq!(classification.programming_language, None);
2551        assert!(classification.is_text);
2552        assert!(!classification.is_source);
2553        assert_eq!(classification.file_type, "YAML text data");
2554    }
2555
2556    #[test]
2557    fn test_classify_file_info_classifies_common_build_manifests() {
2558        let gradle = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
2559        let flake = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
2560        let cmake = classify_file_info(
2561            Path::new("toolchain.cmake"),
2562            b"set(CMAKE_CXX_STANDARD 20)\n",
2563        );
2564        let gitmodules = classify_file_info(
2565            Path::new(".gitmodules"),
2566            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
2567        );
2568
2569        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
2570        assert!(gradle.is_source);
2571        assert_eq!(gradle.mime_type, "text/plain");
2572        assert_eq!(gradle.file_type, "Groovy source, UTF-8 Unicode text");
2573
2574        assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
2575        assert!(flake.is_source);
2576        assert_eq!(flake.mime_type, "text/plain");
2577        assert_eq!(flake.file_type, "Nix source, UTF-8 Unicode text");
2578
2579        assert_eq!(cmake.programming_language.as_deref(), Some("CMake"));
2580        assert!(cmake.is_source);
2581        assert_eq!(cmake.file_type, "CMake source, UTF-8 Unicode text");
2582
2583        assert_eq!(gitmodules.programming_language, None);
2584        assert!(gitmodules.is_text);
2585        assert!(!gitmodules.is_source);
2586        assert_eq!(gitmodules.file_type, "Git configuration text");
2587    }
2588
2589    #[test]
2590    fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
2591        let header = classify_file_info(
2592            Path::new("include/demo.hpp"),
2593            b"#pragma once\nclass Demo {};\n",
2594        );
2595        let ipp = classify_file_info(
2596            Path::new("include/detail/demo.ipp"),
2597            b"template <class T> void parse() {}\n",
2598        );
2599
2600        assert_eq!(header.programming_language.as_deref(), Some("C++"));
2601        assert!(header.is_source);
2602        assert!(!header.is_script);
2603        assert_eq!(header.file_type, "C++ source, UTF-8 Unicode text");
2604
2605        assert_eq!(ipp.programming_language, None);
2606        assert!(!ipp.is_source);
2607        assert!(!ipp.is_script);
2608        assert_eq!(ipp.file_type, "UTF-8 Unicode text");
2609    }
2610
2611    #[test]
2612    fn test_classify_file_info_preserves_specific_shell_family_labels() {
2613        let bash = classify_file_info(Path::new("bin/run"), b"#!/usr/bin/env bash\necho hi\n");
2614
2615        assert_eq!(bash.programming_language.as_deref(), Some("Bash"));
2616        assert!(bash.is_script);
2617        assert_eq!(bash.file_type, "bash script, UTF-8 Unicode text executable");
2618    }
2619
2620    #[test]
2621    fn test_classify_file_info_marks_jamfile_as_source() {
2622        let jamfile = classify_file_info(Path::new("Jamfile"), b"lib boost_json ;\n");
2623
2624        assert_eq!(jamfile.programming_language.as_deref(), Some("Jamfile"));
2625        assert!(jamfile.is_source);
2626        assert!(!jamfile.is_script);
2627        assert_eq!(jamfile.file_type, "Jamfile source, UTF-8 Unicode text");
2628    }
2629
2630    #[test]
2631    fn test_classify_file_info_labels_javascript_shebang_scripts() {
2632        let classification = classify_file_info(
2633            Path::new("bin/run"),
2634            b"#!/usr/bin/env node\nconsole.log('hello');\n",
2635        );
2636
2637        assert_eq!(
2638            classification.programming_language.as_deref(),
2639            Some("JavaScript")
2640        );
2641        assert!(classification.is_script);
2642        assert_eq!(
2643            classification.file_type,
2644            "javascript script, UTF-8 Unicode text executable"
2645        );
2646    }
2647
2648    #[test]
2649    fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
2650        let classification = classify_file_info(
2651            Path::new("script.py"),
2652            b"# coding: latin-1\nprint(\"caf\xe9\")\n",
2653        );
2654
2655        assert_eq!(
2656            classification.programming_language.as_deref(),
2657            Some("Python")
2658        );
2659        assert!(classification.is_script);
2660        assert_eq!(classification.file_type, "python script, text executable");
2661    }
2662
2663    #[test]
2664    fn test_classify_file_info_treats_textual_tga_as_media() {
2665        let classification = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");
2666
2667        assert!(classification.is_media);
2668        assert!(classification.is_text);
2669        assert!(!classification.is_binary);
2670    }
2671
2672    #[test]
2673    fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
2674        let classification =
2675            classify_file_info(Path::new("main.ts"), &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85]);
2676
2677        assert!(classification.is_binary);
2678        assert!(!classification.is_text);
2679        assert!(!classification.is_source);
2680        assert_eq!(classification.programming_language, None);
2681    }
2682
2683    #[test]
2684    fn test_extract_text_for_detection_skips_unsupported_image_formats() {
2685        let gif_bytes = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";
2686
2687        let (text, kind) = extract_text_for_detection(Path::new("tiny.gif"), gif_bytes);
2688
2689        assert!(text.is_empty());
2690        assert_eq!(kind, ExtractedTextKind::None);
2691    }
2692
2693    #[test]
2694    fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
2695        let cases = [
2696            (
2697                Path::new("bin/run"),
2698                b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
2699                Some("JavaScript"),
2700                true,
2701                true,
2702            ),
2703            (
2704                Path::new("Dockerfile"),
2705                b"FROM scratch\n".as_slice(),
2706                Some("Dockerfile"),
2707                true,
2708                false,
2709            ),
2710            (
2711                Path::new("package.json"),
2712                br#"{"name":"demo"}"#.as_slice(),
2713                None,
2714                false,
2715                false,
2716            ),
2717            (
2718                Path::new("config.yaml"),
2719                b"key: value\n".as_slice(),
2720                None,
2721                false,
2722                false,
2723            ),
2724            (
2725                Path::new("Makefile"),
2726                b"all:\n\techo hi\n".as_slice(),
2727                None,
2728                false,
2729                false,
2730            ),
2731        ];
2732
2733        for (path, bytes, expected_language, expected_is_source, expected_is_script) in cases {
2734            let classification = classify_file_info(path, bytes);
2735
2736            assert_eq!(
2737                classification.programming_language.as_deref(),
2738                expected_language,
2739                "unexpected language for {}",
2740                path.display()
2741            );
2742            assert_eq!(
2743                classification.is_source,
2744                expected_is_source,
2745                "unexpected is_source for {}",
2746                path.display()
2747            );
2748            assert_eq!(
2749                classification.is_script,
2750                expected_is_script,
2751                "unexpected is_script for {}",
2752                path.display()
2753            );
2754        }
2755    }
2756}