Skip to main content

provenant/utils/
file.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4use std::borrow::Cow;
5use std::collections::BTreeSet;
6use std::fs;
7use std::io::{BufReader, Cursor, Read};
8use std::panic::{AssertUnwindSafe, catch_unwind};
9use std::path::Path;
10
11use chrono::{TimeZone, Utc};
12use file_format::{FileFormat, Kind as FileFormatKind};
13use flate2::read::ZlibDecoder;
14use glob::Pattern;
15use image::{ImageDecoder, ImageFormat, ImageReader};
16use mime_guess::from_path;
17use quick_xml::events::Event;
18use quick_xml::reader::Reader as XmlReader;
19
20use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
21use crate::utils::font::extract_font_metadata_text;
22use crate::utils::language::detect_language;
23
/// Identifies which extraction path produced the text handed to detection.
///
/// The value travels next to the extracted text so callers can tell how the
/// text was obtained.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// Nothing usable was extracted.
    None,
    /// Text obtained by decoding the file content (RTF text is reported this way too).
    Decoded,
    /// Text taken from font metadata.
    FontMetadata,
    /// Text extracted from a PDF document.
    Pdf,
    /// Printable strings collected from binary content.
    BinaryStrings,
    /// Text taken from image metadata.
    ImageMetadata,
    /// Text taken from Windows executable metadata only.
    WindowsExecutableMetadata,
}
34
/// Classification facts computed for a single file by `classify_file_info`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// Detected MIME type, e.g. `text/plain`.
    pub mime_type: String,
    /// Human-readable description of the file type.
    pub file_type: String,
    /// Detected programming language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content is treated as binary.
    pub is_binary: bool,
    /// Whether the content is treated as text (the negation of `is_binary`).
    pub is_text: bool,
    /// Whether the file is an archive, compressed file or package.
    pub is_archive: bool,
    /// Whether the file is image, audio or video content.
    pub is_media: bool,
    /// Whether the file is source code.
    pub is_source: bool,
    /// Whether the file is a script.
    pub is_script: bool,
}
47
/// Cap on the number of image metadata values.
/// NOTE(review): the consumer is outside this excerpt — confirm exact semantics.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
/// Cap, in bytes, on image metadata text (32 KiB).
/// NOTE(review): the consumer is outside this excerpt — confirm exact semantics.
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
/// Content counts as binary when its control bytes exceed `len / this divisor`.
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
/// Size threshold (512 KiB) related to skipping large opaque binaries.
/// NOTE(review): the consumer is outside this excerpt — confirm exact semantics.
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
/// Inputs larger than this (4 MiB) are never parsed to validate JSON.
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;
/// Lowercase extensions of files treated as plain text rather than source code.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
/// Lowercase extensions that force a binary classification for non-textual formats.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
/// Lowercase extensions that mark a non-text file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
63
64/// Get the last modified date of a file as a `YYYY-MM-DD` string.
65pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
66    metadata.modified().ok().map(|time: std::time::SystemTime| {
67        let seconds_since_epoch = time
68            .duration_since(std::time::UNIX_EPOCH)
69            .unwrap()
70            .as_secs() as i64;
71
72        Utc.timestamp_opt(seconds_since_epoch, 0)
73            .single()
74            .unwrap_or_else(Utc::now)
75            .format("%Y-%m-%d")
76            .to_string()
77    })
78}
79
80/// Check if a path should be excluded based on a list of glob patterns.
81pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
82    let path_str = path.to_string_lossy();
83    let file_name = path
84        .file_name()
85        .map(|name| name.to_string_lossy())
86        .unwrap_or_default();
87
88    for pattern in exclude_patterns {
89        // Match against full path
90        if pattern.matches(&path_str) {
91            return true;
92        }
93
94        // Match against just the file/directory name
95        if pattern.matches(&file_name) {
96            return true;
97        }
98    }
99
100    false
101}
102
103/// Decode a byte buffer to a String, trying UTF-8 first, then Latin-1.
104///
105/// Latin-1 (ISO-8859-1) maps bytes 0x00-0xFF directly to Unicode U+0000-U+00FF,
106/// so it can decode any byte sequence. This matches Python ScanCode's use of
107/// `UnicodeDammit` which auto-detects encoding with Latin-1 as fallback.
108pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
109    match String::from_utf8(bytes.to_vec()) {
110        Ok(s) => s,
111        Err(e) => {
112            let bytes = e.into_bytes();
113            if has_binary_control_chars(&bytes) {
114                return String::new();
115            }
116            bytes.iter().map(|&b| b as char).collect()
117        }
118    }
119}
120
121pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
122    let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
123    (text, kind)
124}
125
126pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
127    let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
128        return Cow::Borrowed(text);
129    };
130    if !matches!(
131        extension.to_ascii_lowercase().as_str(),
132        "md" | "markdown" | "html" | "htm"
133    ) {
134        return Cow::Borrowed(text);
135    }
136
137    let mut hints = Vec::new();
138    if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
139        hints.push("Creative Commons Attribution 4.0 International License".to_string());
140    }
141    if text.contains("Apache License (Version 2.0)") || text.contains("Apache License, Version 2.0")
142    {
143        hints.push(
144            "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
145                .to_string(),
146        );
147    }
148
149    hints.extend(extract_shields_license_badge_hints(text));
150
151    if hints.is_empty() {
152        Cow::Borrowed(text)
153    } else {
154        let mut augmented =
155            String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
156        augmented.push_str(text);
157        augmented.push_str("\n\n");
158        for (index, hint) in hints.into_iter().enumerate() {
159            if index > 0 {
160                augmented.push('\n');
161            }
162            augmented.push_str(&hint);
163        }
164        Cow::Owned(augmented)
165    }
166}
167
168fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
169    let mut hints = Vec::new();
170    let mut rest = text;
171    let needle = "img.shields.io/badge/license-";
172
173    while let Some(index) = rest.find(needle) {
174        let start = index + needle.len();
175        let suffix = &rest[start..];
176        let end = suffix
177            .find([')', ']', '"', '\'', ' ', '\n'])
178            .unwrap_or(suffix.len());
179        let badge = &suffix[..end];
180        let Some(badge) = badge.strip_suffix(".svg") else {
181            rest = &suffix[end..];
182            continue;
183        };
184
185        let mut segments: Vec<_> = badge
186            .split('-')
187            .filter(|segment| !segment.is_empty())
188            .collect();
189        if segments.len() < 2 {
190            rest = &suffix[end..];
191            continue;
192        }
193        segments.pop();
194        let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
195        if !candidate.is_empty() {
196            hints.push(canonical_shields_license_hint(&candidate));
197        }
198
199        rest = &suffix[end..];
200    }
201
202    hints.sort();
203    hints.dedup();
204    hints
205}
206
/// Map a badge label onto a license phrase.
///
/// The label is trimmed first. `MIT` and the two Apache 2.0 spellings get a
/// fixed phrase; any other label gets `" License"` appended.
fn canonical_shields_license_hint(candidate: &str) -> String {
    let label = candidate.trim();
    if label == "MIT" {
        "The MIT License".to_string()
    } else if label == "Apache-2.0" || label == "Apache 2.0" {
        "Apache License 2.0".to_string()
    } else {
        format!("{label} License")
    }
}
214
215pub(crate) fn extract_text_for_detection_with_diagnostics(
216    path: &Path,
217    bytes: &[u8],
218) -> (String, ExtractedTextKind, Option<String>) {
219    let ext = path
220        .extension()
221        .and_then(|e| e.to_str())
222        .map(|s| s.to_ascii_lowercase());
223    let detected_format = detect_file_format(bytes);
224
225    if looks_like_rtf(bytes, ext.as_deref()) {
226        let text = extract_rtf_text(bytes);
227        return if text.trim().is_empty() {
228            (String::new(), ExtractedTextKind::None, None)
229        } else {
230            (text, ExtractedTextKind::Decoded, None)
231        };
232    }
233
234    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
235        let (text, scan_error) = extract_pdf_text(path, bytes);
236        return if text.is_empty() {
237            (String::new(), ExtractedTextKind::None, scan_error)
238        } else {
239            (text, ExtractedTextKind::Pdf, None)
240        };
241    }
242
243    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
244        let text = extract_image_metadata_text(bytes, format);
245        return if text.is_empty() {
246            if is_supported_image_container(bytes, format) {
247                (String::new(), ExtractedTextKind::None, None)
248            } else {
249                let decoded = decode_bytes_to_string(bytes);
250                if decoded.is_empty() {
251                    (String::new(), ExtractedTextKind::None, None)
252                } else {
253                    (decoded, ExtractedTextKind::Decoded, None)
254                }
255            }
256        } else {
257            (text, ExtractedTextKind::ImageMetadata, None)
258        };
259    }
260
261    if let Some(text) = extract_font_metadata_text(path, bytes) {
262        return (text, ExtractedTextKind::FontMetadata, None);
263    }
264
265    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
266    let large_opaque_binary = windows_executable_metadata_text.is_none()
267        && is_large_opaque_binary_candidate(bytes, detected_format);
268
269    if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
270        return windows_metadata_or_empty_result(windows_executable_metadata_text);
271    }
272
273    if should_skip_binary_string_extraction(path, bytes, detected_format) {
274        return (String::new(), ExtractedTextKind::None, None);
275    }
276
277    if !large_opaque_binary {
278        let decoded = decode_bytes_to_string(bytes);
279        if !decoded.is_empty() {
280            let combined =
281                combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
282            return (combined, ExtractedTextKind::Decoded, None);
283        }
284    }
285
286    let text = if large_opaque_binary {
287        extract_sampled_printable_strings(bytes)
288    } else {
289        extract_printable_strings(bytes)
290    };
291    if text.is_empty() {
292        windows_metadata_or_empty_result(windows_executable_metadata_text)
293    } else {
294        (
295            combine_extracted_text_fragments(windows_executable_metadata_text, text),
296            ExtractedTextKind::BinaryStrings,
297            None,
298        )
299    }
300}
301
/// Join an optional leading fragment and a trailing fragment with a newline.
///
/// A missing or empty `prefix` yields `suffix` unchanged; an empty `suffix`
/// yields the prefix alone, so no stray newline is ever produced.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(lead) = prefix.filter(|fragment| !fragment.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        lead
    } else {
        format!("{lead}\n{suffix}")
    }
}
309
310fn windows_metadata_or_empty_result(
311    windows_executable_metadata_text: Option<String>,
312) -> (String, ExtractedTextKind, Option<String>) {
313    if let Some(metadata_text) = windows_executable_metadata_text {
314        (
315            metadata_text,
316            ExtractedTextKind::WindowsExecutableMetadata,
317            None,
318        )
319    } else {
320        (String::new(), ExtractedTextKind::None, None)
321    }
322}
323
324pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
325    let detected_format = detect_file_format(bytes);
326    let detected_language = detect_language(path, bytes);
327    let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
328    let is_text = !is_binary;
329    let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
330    let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
331    let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
332    let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
333    let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
334    let programming_language = is_source.then(|| detected_language.clone()).flatten();
335    let file_type = detect_file_type(
336        path,
337        bytes,
338        detected_format,
339        &mime_type,
340        programming_language.as_deref(),
341        is_binary,
342        is_text,
343        is_archive,
344        is_media,
345        is_script,
346    );
347
348    FileInfoClassification {
349        mime_type,
350        file_type,
351        programming_language,
352        is_binary,
353        is_text,
354        is_archive,
355        is_media,
356        is_source,
357        is_script,
358    }
359}
360
361fn detect_file_format(bytes: &[u8]) -> FileFormat {
362    FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
363}
364
/// Report whether `bytes` form valid UTF-8 (the empty slice is valid).
fn is_utf8_text(bytes: &[u8]) -> bool {
    let decoded = std::str::from_utf8(bytes);
    decoded.is_ok()
}
368
/// Decode UTF-16 text that starts with a byte-order mark.
///
/// Returns `None` when there is no UTF-16 BOM, when nothing follows the BOM,
/// when the payload has an odd number of bytes, or when the code units are not
/// valid UTF-16 (for example an unpaired surrogate).
fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
    // The BOM selects the byte order used for every following code unit.
    let (decode_unit, payload): (fn([u8; 2]) -> u16, &[u8]) = match bytes {
        [0xFF, 0xFE, rest @ ..] => (u16::from_le_bytes, rest),
        [0xFE, 0xFF, rest @ ..] => (u16::from_be_bytes, rest),
        _ => return None,
    };

    let pairs = payload.chunks_exact(2);
    if payload.is_empty() || !pairs.remainder().is_empty() {
        return None;
    }

    let code_units = pairs.map(|pair| decode_unit([pair[0], pair[1]]));
    char::decode_utf16(code_units)
        .collect::<Result<String, _>>()
        .ok()
}
399
400fn has_binary_control_chars(bytes: &[u8]) -> bool {
401    let control_count = bytes
402        .iter()
403        .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
404        .count();
405    control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
406}
407
408fn has_decodable_text(bytes: &[u8]) -> bool {
409    bytes.is_empty()
410        || is_utf8_text(bytes)
411        || decode_utf16_bom_text(bytes).is_some()
412        || !has_binary_control_chars(bytes)
413}
414
415fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
416    if bytes.is_empty() || is_utf8_text(bytes) {
417        return true;
418    }
419    if let Some(decoded) = decode_utf16_bom_text(bytes) {
420        return decoded
421            .chars()
422            .any(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'));
423    }
424
425    let printable_count = bytes
426        .iter()
427        .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
428        .count();
429    printable_count * 2 >= bytes.len()
430}
431
/// Report whether a media type denotes textual content: any `text/*` type,
/// JSON or XML, or a structured-syntax type ending in `+json` / `+xml`.
fn is_textual_media_type(media_type: &str) -> bool {
    if media_type.starts_with("text/") {
        return true;
    }
    if media_type == "application/json" || media_type == "application/xml" {
        return true;
    }
    ["+json", "+xml"]
        .iter()
        .any(|suffix| media_type.ends_with(*suffix))
}
441
442fn is_textual_format(detected_format: FileFormat) -> bool {
443    matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
444        || is_textual_media_type(detected_format.media_type())
445}
446
447fn is_known_binary_format(detected_format: FileFormat) -> bool {
448    !matches!(detected_format, FileFormat::ArbitraryBinaryData)
449        && !is_textual_format(detected_format)
450}
451
452pub fn detect_mime_type(
453    path: &Path,
454    bytes: &[u8],
455    detected_format: FileFormat,
456    programming_language: Option<&str>,
457) -> String {
458    if bytes.is_empty() {
459        return "inode/x-empty".to_string();
460    }
461
462    if lower_extension(path).as_deref() == Some("json") {
463        if let Some(is_binary) = json_binary_override(bytes) {
464            if is_binary {
465                return "application/octet-stream".to_string();
466            }
467            if has_valid_json_text(bytes) {
468                return "application/json".to_string();
469            }
470            return "text/plain".to_string();
471        }
472        if has_valid_json_text(bytes) {
473            return "application/json".to_string();
474        }
475        if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) {
476            return "text/plain".to_string();
477        }
478        return "application/octet-stream".to_string();
479    }
480
481    if is_zip_archive(bytes) {
482        return detect_zip_like_mime(path);
483    }
484
485    if looks_like_deb(bytes, path) {
486        return "application/vnd.debian.binary-package".to_string();
487    }
488
489    if looks_like_rpm(bytes, path) {
490        return "application/x-rpm".to_string();
491    }
492
493    let guessed_mime = from_path(path)
494        .first_or_octet_stream()
495        .essence_str()
496        .to_string();
497
498    let mime_type = match detected_format {
499        FileFormat::Empty => "inode/x-empty".to_string(),
500        FileFormat::PlainText => {
501            if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
502                "text/plain".to_string()
503            } else {
504                guessed_mime.clone()
505            }
506        }
507        _ => {
508            let detected_mime = detected_format.media_type();
509            if detected_mime == "application/octet-stream"
510                && guessed_mime != "application/octet-stream"
511            {
512                guessed_mime.clone()
513            } else {
514                detected_mime.to_string()
515            }
516        }
517    };
518
519    normalize_mime_type(path, bytes, programming_language, &mime_type)
520}
521
522fn normalize_mime_type(
523    path: &Path,
524    bytes: &[u8],
525    programming_language: Option<&str>,
526    mime_type: &str,
527) -> String {
528    if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
529        return "text/plain".to_string();
530    }
531
532    mime_type.to_string()
533}
534
535fn should_prefer_text_mime(
536    path: &Path,
537    bytes: &[u8],
538    programming_language: Option<&str>,
539    mime_type: &str,
540) -> bool {
541    has_decodable_text(bytes)
542        && looks_like_textual_bytes(bytes)
543        && is_textual_source_candidate(path, programming_language)
544        && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
545}
546
547fn has_valid_json_text(bytes: &[u8]) -> bool {
548    if bytes.len() > JSON_VALIDATION_MAX_BYTES {
549        return false;
550    }
551
552    serde_json::from_slice::<serde_json::Value>(bytes).is_ok()
553        || decode_utf16_bom_text(bytes)
554            .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok())
555            .is_some()
556}
557
/// Report whether `bytes` look like a one-element JSON string array
/// (`["…"]`) of at least eight bytes that contains no 0x00 or 0xFF byte.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    let long_enough = bytes.len() >= 8;
    let wrapped = bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]");
    let has_forbidden_byte = bytes.iter().any(|&byte| byte == 0x00 || byte == 0xFF);

    long_enough && wrapped && !has_forbidden_byte
}
565
566fn json_binary_override(bytes: &[u8]) -> Option<bool> {
567    if has_valid_json_text(bytes) || decode_utf16_bom_text(bytes).is_some() {
568        return Some(false);
569    }
570
571    if bytes.contains(&0) {
572        return Some(true);
573    }
574
575    if bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024) {
576        return Some(true);
577    }
578
579    if is_wrapped_invalid_json_string_text(bytes) {
580        return Some(false);
581    }
582
583    None
584}
585
586fn detect_is_binary(
587    path: &Path,
588    bytes: &[u8],
589    detected_format: FileFormat,
590    programming_language: Option<&str>,
591) -> bool {
592    if lower_extension(path).as_deref() == Some("json")
593        && let Some(is_binary) = json_binary_override(bytes)
594    {
595        return is_binary;
596    }
597
598    if is_textual_format(detected_format) {
599        return false;
600    }
601
602    if lower_extension(path)
603        .as_deref()
604        .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
605    {
606        return true;
607    }
608
609    if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
610        return false;
611    }
612
613    has_binary_control_chars(bytes)
614        || is_known_binary_format(detected_format)
615        || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
616            && !looks_like_textual_bytes(bytes))
617}
618
619fn should_treat_binary_bytes_as_text(
620    path: &Path,
621    bytes: &[u8],
622    programming_language: Option<&str>,
623) -> bool {
624    has_decodable_text(bytes)
625        && looks_like_textual_bytes(bytes)
626        && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
627}
628
629fn detect_is_archive(
630    path: &Path,
631    bytes: &[u8],
632    mime_type: &str,
633    is_text: bool,
634    detected_format: FileFormat,
635) -> bool {
636    if is_text {
637        return false;
638    }
639
640    lower_extension(path)
641        .as_deref()
642        .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
643        || matches!(
644            detected_format.kind(),
645            FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
646        )
647        || is_zip_archive(bytes)
648        || looks_like_gzip(bytes)
649        || looks_like_bzip2(bytes)
650        || looks_like_xz(bytes)
651        || looks_like_deb(bytes, path)
652        || looks_like_rpm(bytes, path)
653        || looks_like_squashfs(bytes, path)
654        || mime_type.contains("zip")
655        || mime_type.contains("compressed")
656        || mime_type.contains("tar")
657        || mime_type.contains("x-rpm")
658        || mime_type.contains("debian")
659}
660
661fn detect_is_media(
662    path: &Path,
663    bytes: &[u8],
664    mime_type: &str,
665    detected_format: FileFormat,
666) -> bool {
667    media_mime_from_content(bytes).is_some()
668        || matches!(
669            detected_format.kind(),
670            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
671        )
672        || mime_type.starts_with("image/")
673        || mime_type.starts_with("audio/")
674        || mime_type.starts_with("video/")
675        || (mime_type == "application/octet-stream"
676            && lower_extension(path).as_deref() == Some("tga")
677            && !has_binary_control_chars(bytes))
678}
679
680fn detect_is_script(
681    path: &Path,
682    bytes: &[u8],
683    programming_language: Option<&str>,
684    is_text: bool,
685) -> bool {
686    if !is_text || is_makefile(path) {
687        return false;
688    }
689
690    bytes.starts_with(b"#!")
691        || lower_extension(path).as_deref().is_some_and(|ext| {
692            matches!(
693                ext,
694                "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
695            )
696        })
697        || matches!(
698            programming_language,
699            Some(
700                "Shell"
701                    | "Bash"
702                    | "Zsh"
703                    | "Fish"
704                    | "Ksh"
705                    | "Python"
706                    | "Ruby"
707                    | "Perl"
708                    | "PHP"
709                    | "PowerShell"
710                    | "Awk"
711            )
712        )
713}
714
715fn detect_is_source(
716    path: &Path,
717    programming_language: Option<&str>,
718    is_text: bool,
719    is_script: bool,
720) -> bool {
721    if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
722        return false;
723    }
724
725    if is_c_like_source(path) || is_java_like_source(path) {
726        return true;
727    }
728
729    programming_language.is_some() || is_script
730}
731
732#[allow(clippy::too_many_arguments)]
733fn detect_file_type(
734    path: &Path,
735    bytes: &[u8],
736    detected_format: FileFormat,
737    mime_type: &str,
738    programming_language: Option<&str>,
739    is_binary: bool,
740    is_text: bool,
741    is_archive: bool,
742    is_media: bool,
743    is_script: bool,
744) -> String {
745    if bytes.is_empty() {
746        return "empty".to_string();
747    }
748
749    if looks_like_pdf(bytes) {
750        return "PDF document".to_string();
751    }
752
753    if let Some(file_type) = media_file_type_from_content(bytes) {
754        return file_type.to_string();
755    }
756
757    if is_archive {
758        return archive_file_type(path, bytes, detected_format);
759    }
760
761    if is_script {
762        return script_file_type(programming_language, bytes);
763    }
764
765    if is_text {
766        if lower_extension(path).as_deref() == Some("json") {
767            if has_valid_json_text(bytes) {
768                return "JSON text data".to_string();
769            }
770            return text_file_type(bytes);
771        }
772        if lower_extension(path).as_deref() == Some("xml") {
773            return "XML text data".to_string();
774        }
775        if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
776            return "YAML text data".to_string();
777        }
778        if lower_extension(path).as_deref() == Some("toml") {
779            return "TOML text data".to_string();
780        }
781        if matches!(
782            lower_extension(path).as_deref(),
783            Some("ini" | "cfg" | "conf")
784        ) {
785            return "INI text data".to_string();
786        }
787        if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
788            return "Git configuration text".to_string();
789        }
790        if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
791            return text_file_type(bytes);
792        }
793        if programming_language.is_some() && !is_media {
794            return source_file_type(programming_language, bytes);
795        }
796        return text_file_type(bytes);
797    }
798
799    if let Some(file_type) = format_based_file_type(detected_format) {
800        return file_type;
801    }
802
803    if is_binary && mime_type == "application/octet-stream" {
804        return "data".to_string();
805    }
806
807    mime_type.to_string()
808}
809
810fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
811    if matches!(programming_language, Some(language) if is_source_like_language(language)) {
812        return true;
813    }
814
815    if matches!(
816        lower_file_name(path).as_str(),
817        "dockerfile"
818            | "containerfile"
819            | "containerfile.core"
820            | "apkbuild"
821            | "podfile"
822            | "jamfile"
823            | "jamroot"
824            | "meson.build"
825            | "build"
826            | "workspace"
827            | "buck"
828            | "default.nix"
829            | "flake.nix"
830            | "shell.nix"
831    ) {
832        return true;
833    }
834
835    path.extension()
836        .and_then(|ext| ext.to_str())
837        .is_some_and(|ext| {
838            matches!(
839                ext.to_ascii_lowercase().as_str(),
840                "rs" | "py"
841                    | "js"
842                    | "mjs"
843                    | "cjs"
844                    | "jsx"
845                    | "ts"
846                    | "mts"
847                    | "cts"
848                    | "tsx"
849                    | "c"
850                    | "cpp"
851                    | "cc"
852                    | "cxx"
853                    | "h"
854                    | "hpp"
855                    | "m"
856                    | "mm"
857                    | "s"
858                    | "asm"
859                    | "java"
860                    | "go"
861                    | "rb"
862                    | "php"
863                    | "pl"
864                    | "swift"
865                    | "sh"
866                    | "bash"
867                    | "zsh"
868                    | "fish"
869                    | "ksh"
870                    | "ps1"
871                    | "psm1"
872                    | "psd1"
873                    | "awk"
874                    | "kt"
875                    | "kts"
876                    | "dart"
877                    | "scala"
878                    | "groovy"
879                    | "gradle"
880                    | "gvy"
881                    | "gy"
882                    | "gsh"
883                    | "cs"
884                    | "fs"
885                    | "fsx"
886                    | "r"
887                    | "lua"
888                    | "jl"
889                    | "ex"
890                    | "exs"
891                    | "clj"
892                    | "cljs"
893                    | "cljc"
894                    | "hs"
895                    | "erl"
896                    | "nix"
897                    | "zig"
898                    | "bzl"
899                    | "bazel"
900                    | "star"
901                    | "sky"
902                    | "ml"
903                    | "mli"
904                    | "tex"
905            )
906        })
907}
908
/// Report whether a detected language name (exact, case-sensitive match)
/// denotes source code.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
955
/// Return the path's extension as UTF-8, or `None` when there is no extension
/// or it is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    let raw = path.extension()?;
    raw.to_str()
}
959
960fn lower_extension(path: &Path) -> Option<String> {
961    extension(path).map(|ext| ext.to_ascii_lowercase())
962}
963
/// Return the final path component lowercased (ASCII only), or an empty string
/// when the path has no final component or it is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
970
971fn is_plain_text(path: &Path) -> bool {
972    lower_extension(path)
973        .as_deref()
974        .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
975}
976
977fn is_makefile(path: &Path) -> bool {
978    matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
979}
980
/// Report whether the path ends in `.js.map` or `.css.map` (any ASCII case).
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
985
986fn is_c_like_source(path: &Path) -> bool {
987    lower_extension(path).as_deref().is_some_and(|ext| {
988        matches!(
989            ext,
990            "c" | "cc"
991                | "cp"
992                | "cpp"
993                | "cxx"
994                | "c++"
995                | "h"
996                | "hh"
997                | "hpp"
998                | "hxx"
999                | "h++"
1000                | "i"
1001                | "ii"
1002                | "m"
1003                | "s"
1004                | "asm"
1005        )
1006    })
1007}
1008
1009fn is_java_like_source(path: &Path) -> bool {
1010    lower_extension(path)
1011        .as_deref()
1012        .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
1013}
1014
1015fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
1016    match detected_format {
1017        FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
1018        format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
1019        format => Some(match format.kind() {
1020            FileFormatKind::Image => short_name_or_name(&format, "image data"),
1021            FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
1022            FileFormatKind::Video => short_name_or_name(&format, "video data"),
1023            _ => format.name().to_string(),
1024        }),
1025    }
1026}
1027
1028fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
1029    format
1030        .short_name()
1031        .map(|short_name| format!("{short_name} {suffix}"))
1032        .unwrap_or_else(|| format!("{} {suffix}", format.name()))
1033}
1034
1035fn detect_zip_like_mime(path: &Path) -> String {
1036    match extension(path).map(|ext| ext.to_ascii_lowercase()) {
1037        Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
1038        Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
1039            "application/java-archive".to_string()
1040        }
1041        _ => "application/zip".to_string(),
1042    }
1043}
1044
/// Sniffs the leading magic bytes and returns the MIME type for PNG, JPEG,
/// TIFF (either byte order) or WebP content; `None` for anything else.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("image/png"),
        [0xff, 0xd8, 0xff, ..] => Some("image/jpeg"),
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("image/tiff"),
        // RIFF container: the form type sits after the 4-byte chunk size.
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => Some("image/webp"),
        _ => None,
    }
}
1058
/// Sniffs the leading magic bytes and returns a descriptive file type label
/// for PNG, JPEG, TIFF (either byte order) or WebP content; `None` otherwise.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    if bytes.starts_with(b"\x89PNG\r\n\x1a\n") {
        return Some("PNG image data");
    }
    if bytes.starts_with(&[0xff, 0xd8, 0xff]) {
        return Some("JPEG image data");
    }
    if bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a") {
        return Some("TIFF image data");
    }

    // RIFF container: the form type sits after the 4-byte chunk size.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    if is_webp {
        Some("WebP image data")
    } else {
        None
    }
}
1072
/// Reports whether `bytes` begins with the `%PDF-` header marker.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    bytes.get(..5) == Some(&b"%PDF-"[..])
}
1076
/// Reports whether the content should be treated as RTF: either the supplied
/// extension is exactly `rtf`, or the bytes begin with `{\rtf`.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if let Some("rtf") = ext {
        return true;
    }
    bytes.starts_with(b"{\\rtf")
}
1080
/// Converts RTF markup into plain text on a best-effort basis.
///
/// The input is decoded lossily as UTF-8 and scanned once:
/// - group braces (`{`, `}`) are dropped;
/// - `\\`, `\{` and `\}` emit the escaped character;
/// - `\'hh` emits the character whose code point equals the hex byte `hh`;
///   both characters must be hexadecimal digits, otherwise only the apostrophe
///   is skipped and the following characters are kept as text;
/// - `\par`/`\line`, `\tab`, dashes, bullet and quote control words are mapped
///   to their text equivalents;
/// - `\uN` emits code point `N` (negative values are offset by 65536) and then
///   skips one following fallback character unless it is `\`, `{`, `}` or a
///   line break;
/// - every other control word or control symbol is discarded.
///
/// Finally CR and form-feed characters become newlines and trailing
/// whitespace is trimmed from each line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;

    while index < chars.len() {
        match chars[index] {
            // Group delimiters carry no text of their own.
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                // A trailing lone backslash ends the scan.
                let Some(&escaped) = chars.get(index) else {
                    break;
                };

                match escaped {
                    '\\' | '{' | '}' => {
                        output.push(escaped);
                        index += 1;
                    }
                    '\'' => {
                        // `to_digit(16)` accepts only hex digits, unlike
                        // `from_str_radix`, which would also accept a sign
                        // such as "+1".
                        let decoded = match (chars.get(index + 1), chars.get(index + 2)) {
                            (Some(high), Some(low)) => high
                                .to_digit(16)
                                .zip(low.to_digit(16))
                                .and_then(|(high, low)| char::from_u32(high * 16 + low)),
                            _ => None,
                        };
                        match decoded {
                            Some(ch) => {
                                output.push(ch);
                                index += 3;
                            }
                            None => index += 1,
                        }
                    }
                    control if control.is_ascii_alphabetic() => {
                        let start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[start..index].iter().collect();

                        // Optional numeric parameter: a leading '-' or digit
                        // followed by further digits.
                        let number_start = index;
                        if index < chars.len()
                            && (chars[index] == '-' || chars[index].is_ascii_digit())
                        {
                            index += 1;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space after a control word is its delimiter.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                if let Ok(codepoint) = parameter.parse::<i32>() {
                                    let normalized = if codepoint < 0 {
                                        codepoint + 65_536
                                    } else {
                                        codepoint
                                    };
                                    if let Some(ch) =
                                        u32::try_from(normalized).ok().and_then(char::from_u32)
                                    {
                                        output.push(ch);
                                    }
                                }

                                // Skip the single fallback character that
                                // follows the Unicode escape.
                                if index < chars.len()
                                    && !matches!(chars[index], '\\' | '{' | '}' | '\n' | '\r')
                                {
                                    index += 1;
                                }
                            }
                            _ => {}
                        }
                    }
                    // Any other control symbol is dropped.
                    _ => {
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}
1187
/// Reports whether `bytes` begins with the two-byte gzip magic `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1191
/// Reports whether `bytes` begins with the bzip2 magic `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}
1195
/// Reports whether `bytes` begins with the six-byte XZ stream magic.
fn looks_like_xz(bytes: &[u8]) -> bool {
    const XZ_MAGIC: [u8; 6] = [0xfd, b'7', b'z', b'X', b'Z', 0x00];

    bytes.get(..XZ_MAGIC.len()) == Some(&XZ_MAGIC[..])
}
1199
1200fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1201    lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1202}
1203
1204fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1205    lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1206}
1207
1208fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1209    lower_extension(path)
1210        .as_deref()
1211        .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1212        && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1213            || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1214}
1215
1216fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1217    if looks_like_deb(bytes, path) {
1218        "debian binary package (format 2.0)".to_string()
1219    } else if looks_like_rpm(bytes, path) {
1220        "RPM package".to_string()
1221    } else if looks_like_squashfs(bytes, path) {
1222        "Squashfs filesystem".to_string()
1223    } else if looks_like_gzip(bytes) {
1224        "gzip compressed data".to_string()
1225    } else if looks_like_bzip2(bytes) {
1226        "bzip2 compressed data".to_string()
1227    } else if looks_like_xz(bytes) {
1228        "XZ compressed data".to_string()
1229    } else if is_zip_archive(bytes) {
1230        "Zip archive data".to_string()
1231    } else if lower_extension(path).as_deref() == Some("gem") {
1232        "POSIX tar archive".to_string()
1233    } else if let Some(file_type) = format_based_file_type(detected_format) {
1234        file_type
1235    } else {
1236        "archive data".to_string()
1237    }
1238}
1239
1240fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1241    let suffix = text_executable_label(bytes);
1242
1243    match programming_language {
1244        Some("Python") => format!("python script, {suffix}"),
1245        Some("Ruby") => format!("ruby script, {suffix}"),
1246        Some("Perl") => format!("perl script, {suffix}"),
1247        Some("PHP") => format!("php script, {suffix}"),
1248        Some("Shell") => format!("shell script, {suffix}"),
1249        Some("Bash") => format!("bash script, {suffix}"),
1250        Some("Zsh") => format!("zsh script, {suffix}"),
1251        Some("Fish") => format!("fish script, {suffix}"),
1252        Some("Ksh") => format!("ksh script, {suffix}"),
1253        Some("JavaScript") => format!("javascript script, {suffix}"),
1254        Some("TypeScript") => format!("typescript script, {suffix}"),
1255        Some("PowerShell") => format!("powershell script, {suffix}"),
1256        Some("Awk") => format!("awk script, {suffix}"),
1257        _ => format!("script, {suffix}"),
1258    }
1259}
1260
1261fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1262    let suffix = text_label(bytes);
1263    match programming_language {
1264        Some("C") => format!("C source, {suffix}"),
1265        Some("C++") => format!("C++ source, {suffix}"),
1266        Some("Java") => format!("Java source, {suffix}"),
1267        Some("C#") => format!("C# source, {suffix}"),
1268        Some("F#") => format!("F# source, {suffix}"),
1269        Some("Go") => format!("Go source, {suffix}"),
1270        Some("Rust") => format!("Rust source, {suffix}"),
1271        Some("Starlark") => format!("Starlark source, {suffix}"),
1272        Some("CMake") => format!("CMake source, {suffix}"),
1273        Some("Meson") => format!("Meson source, {suffix}"),
1274        Some("Nix") => format!("Nix source, {suffix}"),
1275        Some("Groovy") => format!("Groovy source, {suffix}"),
1276        Some("Makefile") => format!("Makefile source, {suffix}"),
1277        Some("Dockerfile") => format!("Dockerfile source, {suffix}"),
1278        Some("Jamfile") => format!("Jamfile source, {suffix}"),
1279        Some("Batchfile") => format!("Batchfile source, {suffix}"),
1280        Some(language) => format!("{language} source, {suffix}"),
1281        None => text_file_type(bytes),
1282    }
1283}
1284
1285fn text_file_type(bytes: &[u8]) -> String {
1286    text_label(bytes).to_string()
1287}
1288
/// Describes text content: whether it is valid UTF-8 and whether it contains
/// at least one `\n` byte.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1302
/// Describes executable text content: whether it is valid UTF-8 and whether
/// it contains at least one `\n` byte.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let has_line_terminator = bytes.iter().any(|&byte| byte == b'\n');

    if std::str::from_utf8(bytes).is_err() {
        return if has_line_terminator {
            "text executable"
        } else {
            "text executable, with no line terminators"
        };
    }

    if has_line_terminator {
        "UTF-8 Unicode text executable"
    } else {
        "UTF-8 Unicode text executable, with no line terminators"
    }
}
1316
1317fn supported_image_metadata_format(
1318    ext: Option<&str>,
1319    detected_format: FileFormat,
1320) -> Option<ImageFormat> {
1321    match ext {
1322        Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1323        Some("png") => Some(ImageFormat::Png),
1324        Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1325        Some("webp") => Some(ImageFormat::WebP),
1326        _ => match detected_format.media_type() {
1327            "image/jpeg" => Some(ImageFormat::Jpeg),
1328            "image/png" => Some(ImageFormat::Png),
1329            "image/tiff" => Some(ImageFormat::Tiff),
1330            "image/webp" => Some(ImageFormat::WebP),
1331            _ => None,
1332        },
1333    }
1334}
1335
1336fn should_skip_binary_string_extraction(
1337    path: &Path,
1338    bytes: &[u8],
1339    detected_format: FileFormat,
1340) -> bool {
1341    matches!(lower_extension(path).as_deref(), Some("pdf"))
1342        || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1343            .is_some()
1344        || (matches!(
1345            detected_format.kind(),
1346            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1347        ) && !is_textual_format(detected_format))
1348        || media_mime_from_content(bytes).is_some()
1349        || is_zip_archive(bytes)
1350        || looks_like_gzip(bytes)
1351        || looks_like_bzip2(bytes)
1352        || looks_like_xz(bytes)
1353        || looks_like_deb(bytes, path)
1354        || looks_like_rpm(bytes, path)
1355        || looks_like_squashfs(bytes, path)
1356}
1357
1358fn should_skip_large_opaque_binary_text_extraction(
1359    _path: &Path,
1360    bytes: &[u8],
1361    detected_format: FileFormat,
1362) -> bool {
1363    is_large_opaque_binary_candidate(bytes, detected_format)
1364        && !sample_has_promising_printable_strings(bytes)
1365}
1366
1367fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1368    bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1369        && !is_textual_format(detected_format)
1370        && !matches!(
1371            detected_format.kind(),
1372            FileFormatKind::Archive
1373                | FileFormatKind::Compressed
1374                | FileFormatKind::Package
1375                | FileFormatKind::Audio
1376                | FileFormatKind::Image
1377                | FileFormatKind::Video
1378        )
1379}
1380
/// Computes up to three `(start, end)` byte ranges to sample from a buffer of
/// `len` bytes: a head window, a centred window (only when the buffer is more
/// than two windows long) and a tail window (only when the buffer is longer
/// than one window). Each window spans at most 64 KiB; empty and duplicate
/// ranges are omitted.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = Some((0, len.min(SAMPLE_WINDOW_BYTES)));
    let middle = (len > SAMPLE_WINDOW_BYTES * 2).then(|| {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        (start, (start + SAMPLE_WINDOW_BYTES).min(len))
    });
    let tail = (len > SAMPLE_WINDOW_BYTES).then(|| (len - SAMPLE_WINDOW_BYTES, len));

    let mut ranges: Vec<(usize, usize)> = Vec::with_capacity(3);
    for (start, end) in [head, middle, tail].iter().copied().flatten() {
        if start < end && !ranges.contains(&(start, end)) {
            ranges.push((start, end));
        }
    }
    ranges
}
1403
1404fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1405    let mut structured_signal_seen = false;
1406    let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1407        .into_iter()
1408        .filter(|&(start, end)| {
1409            let window = &bytes[start..end];
1410            if has_strong_structured_text_signal(window) {
1411                structured_signal_seen = true;
1412            }
1413            has_license_or_notice_signal(window)
1414        })
1415        .count();
1416
1417    structured_signal_seen || promising_license_windows >= 2
1418}
1419
1420fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1421    let mut combined_lines = BTreeSet::new();
1422
1423    for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1424        let window_text = extract_printable_strings(&bytes[start..end]);
1425        for line in window_text
1426            .lines()
1427            .map(str::trim)
1428            .filter(|line| !line.is_empty())
1429        {
1430            combined_lines.insert(line.to_string());
1431        }
1432    }
1433
1434    combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1435}
1436
1437fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1438    let strings = extract_printable_strings(bytes);
1439    if strings.is_empty() {
1440        return false;
1441    }
1442
1443    let lower = strings.to_ascii_lowercase();
1444    [
1445        "copyright",
1446        "license",
1447        "licensed under",
1448        "all rights reserved",
1449        "permission is hereby granted",
1450        "redistribution and use",
1451        "spdx-license-identifier",
1452    ]
1453    .iter()
1454    .any(|marker| lower.contains(marker))
1455}
1456
1457fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1458    let strings = extract_printable_strings(bytes);
1459    if strings.is_empty() {
1460        return false;
1461    }
1462
1463    let email_markers = strings.matches('@').count();
1464    let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1465
1466    email_markers + url_markers >= 3
1467}
1468
1469fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1470    match format {
1471        ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1472        ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1473        ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1474        ImageFormat::WebP => {
1475            bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1476        }
1477        _ => false,
1478    }
1479}
1480
1481fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1482    let mut values = Vec::new();
1483    values.extend(extract_exif_metadata_values(bytes));
1484    values.extend(extract_xmp_metadata_values(bytes, format));
1485    values_to_text(values)
1486}
1487
1488fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1489    let mut cursor = BufReader::new(Cursor::new(bytes));
1490    let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1491        Ok(exif) => exif,
1492        Err(_) => return Vec::new(),
1493    };
1494
1495    let mut values = Vec::new();
1496    for field in exif.fields() {
1497        let rendered = match field.tag {
1498            exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
1499                Some(field.display_value().with_unit(&exif).to_string())
1500            }
1501            exif::Tag::Artist => Some(format!(
1502                "Author: {}",
1503                field.display_value().with_unit(&exif)
1504            )),
1505            _ => None,
1506        };
1507
1508        if let Some(rendered) = rendered {
1509            values.push(rendered);
1510        }
1511    }
1512
1513    values
1514}
1515
1516fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1517    let xmp = match extract_raw_xmp_packet(bytes, format) {
1518        Some(xmp) => xmp,
1519        None => return Vec::new(),
1520    };
1521
1522    parse_xmp_values(&xmp)
1523}
1524
1525fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1526    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1527    if let Ok(mut decoder) = reader.into_decoder()
1528        && let Ok(Some(xmp)) = decoder.xmp_metadata()
1529    {
1530        return Some(xmp);
1531    }
1532
1533    match format {
1534        ImageFormat::Png => extract_png_xmp_packet(bytes),
1535        _ => None,
1536    }
1537}
1538
1539fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1540    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1541
1542    if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1543        return None;
1544    }
1545
1546    let mut offset = PNG_SIGNATURE.len();
1547    while offset + 12 <= bytes.len() {
1548        let length = u32::from_be_bytes([
1549            bytes[offset],
1550            bytes[offset + 1],
1551            bytes[offset + 2],
1552            bytes[offset + 3],
1553        ]) as usize;
1554        let chunk_start = offset + 8;
1555        let chunk_end = chunk_start + length;
1556        if chunk_end + 4 > bytes.len() {
1557            return None;
1558        }
1559
1560        let chunk_type = &bytes[offset + 4..offset + 8];
1561        if chunk_type == b"iTXt" {
1562            let data = &bytes[chunk_start..chunk_end];
1563            if let Some(xmp) = parse_png_itxt_xmp(data) {
1564                return Some(xmp);
1565            }
1566        }
1567
1568        offset = chunk_end + 4;
1569    }
1570
1571    None
1572}
1573
1574fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1575    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1576
1577    let keyword_end = data.iter().position(|&b| b == 0)?;
1578    if &data[..keyword_end] != XMP_KEYWORD {
1579        return None;
1580    }
1581
1582    let mut cursor = keyword_end + 1;
1583    let compression_flag = *data.get(cursor)?;
1584    cursor += 1;
1585    let compression_method = *data.get(cursor)?;
1586    cursor += 1;
1587    if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1588        return None;
1589    }
1590
1591    let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1592    cursor = language_end + 1;
1593
1594    let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1595    cursor = translated_end + 1;
1596
1597    let text_bytes = &data[cursor..];
1598    if compression_flag == 1 {
1599        let mut decoder = ZlibDecoder::new(text_bytes);
1600        let mut decoded = Vec::new();
1601        decoder.read_to_end(&mut decoded).ok()?;
1602        Some(decoded)
1603    } else {
1604        Some(text_bytes.to_vec())
1605    }
1606}
1607
1608fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1609    let mut reader = XmlReader::from_reader(xmp);
1610    reader.config_mut().trim_text(true);
1611
1612    let mut buf = Vec::new();
1613    let mut stack: Vec<String> = Vec::new();
1614    let mut values = Vec::new();
1615
1616    loop {
1617        match reader.read_event_into(&mut buf) {
1618            Ok(Event::Start(e)) => {
1619                stack.push(local_xml_name(e.name().as_ref()));
1620            }
1621            Ok(Event::End(_)) => {
1622                stack.pop();
1623            }
1624            Ok(Event::Empty(_)) => {}
1625            Ok(Event::Text(text)) => {
1626                if let Some(field) = stack
1627                    .iter()
1628                    .rev()
1629                    .find_map(|name| allowed_xmp_field(name.as_str()))
1630                    && let Ok(decoded) = text.decode()
1631                {
1632                    let decoded = decoded.into_owned();
1633                    if !decoded.trim().is_empty() {
1634                        values.push(format_xmp_value(field, &decoded));
1635                    }
1636                }
1637            }
1638            Ok(Event::CData(text)) => {
1639                if let Some(field) = stack
1640                    .iter()
1641                    .rev()
1642                    .find_map(|name| allowed_xmp_field(name.as_str()))
1643                    && let Ok(decoded) = text.decode()
1644                {
1645                    let decoded = decoded.into_owned();
1646                    if !decoded.trim().is_empty() {
1647                        values.push(format_xmp_value(field, &decoded));
1648                    }
1649                }
1650            }
1651            Ok(Event::Eof) | Err(_) => break,
1652            _ => {}
1653        }
1654        buf.clear();
1655    }
1656
1657    values
1658}
1659
/// Returns the part of a qualified XML name after its last `:`; names that
/// are not valid UTF-8 yield an empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    match qualified.rsplit_once(':') {
        Some((_prefix, local)) => local.to_string(),
        None => qualified.to_string(),
    }
}
1664
/// Maps the local name of an XMP element to the internal field identifier
/// when the element is allow-listed; matching is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    const FIELD_IDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELD_IDS
        .iter()
        .find(|(xml_name, _)| *xml_name == name)
        .map(|&(_, field)| field)
}
1677
/// Renders an XMP value for output: `creator` values are prefixed with
/// `Author: `; every other field is returned unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_string()
    }
}
1684
1685fn values_to_text(values: Vec<String>) -> String {
1686    let mut seen = BTreeSet::new();
1687    let mut lines = Vec::new();
1688    let mut total_bytes = 0usize;
1689
1690    for value in values {
1691        if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1692            break;
1693        }
1694
1695        let normalized = normalize_metadata_value(&value);
1696        if normalized.is_empty() || !seen.insert(normalized.clone()) {
1697            continue;
1698        }
1699
1700        let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1701        if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1702            break;
1703        }
1704
1705        total_bytes += added_bytes;
1706        lines.push(normalized);
1707    }
1708
1709    lines.join("\n")
1710}
1711
/// Removes NUL characters from `value`, then collapses every run of
/// whitespace into a single space and drops leading/trailing whitespace.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");
    let words: Vec<&str> = without_nuls.split_whitespace().collect();
    words.join(" ")
}
1723
1724fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1725    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1726        return (String::new(), None);
1727    }
1728
1729    let mut failures = Vec::new();
1730    let mut saw_success = false;
1731
1732    let extracted = catch_unwind(AssertUnwindSafe(
1733        || -> Result<String, Box<dyn std::error::Error>> {
1734            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1735            extract_first_pdf_page_text(&mut document)
1736        },
1737    ));
1738    match extracted {
1739        Ok(Ok(text)) => {
1740            saw_success = true;
1741            if let Some(normalized) = normalize_pdf_text(text) {
1742                return (normalized, None);
1743            }
1744        }
1745        Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1746        Err(payload) => failures.push(format!(
1747            "from-bytes first-page panic: {}",
1748            panic_payload_to_string(payload.as_ref())
1749        )),
1750    }
1751
1752    let extracted = catch_unwind(AssertUnwindSafe(
1753        || -> Result<String, Box<dyn std::error::Error>> {
1754            let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1755            extract_pdf_text_from_document(&mut document)
1756        },
1757    ));
1758    match extracted {
1759        Ok(Ok(text)) => {
1760            saw_success = true;
1761            if let Some(normalized) = normalize_pdf_text(text) {
1762                return (normalized, None);
1763            }
1764        }
1765        Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
1766        Err(payload) => failures.push(format!(
1767            "open full-document panic: {}",
1768            panic_payload_to_string(payload.as_ref())
1769        )),
1770    }
1771
1772    let extracted = catch_unwind(AssertUnwindSafe(
1773        || -> Result<String, Box<dyn std::error::Error>> {
1774            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1775            extract_pdf_text_from_document(&mut document)
1776        },
1777    ));
1778    match extracted {
1779        Ok(Ok(text)) => {
1780            saw_success = true;
1781            if let Some(normalized) = normalize_pdf_text(text) {
1782                return (normalized, None);
1783            }
1784        }
1785        Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
1786        Err(payload) => failures.push(format!(
1787            "from-bytes full-document panic: {}",
1788            panic_payload_to_string(payload.as_ref())
1789        )),
1790    }
1791
1792    if saw_success || is_non_actionable_pdf_failure(&failures) {
1793        (String::new(), None)
1794    } else {
1795        (
1796            String::new(),
1797            Some(format!(
1798                "PDF text extraction failed after {} attempts: {}",
1799                failures.len(),
1800                failures.join("; ")
1801            )),
1802        )
1803    }
1804}
1805
/// Reports whether there is at least one failure and every failure message
/// contains one of the marker phrases below (password/encryption problems or
/// an invalid cross-reference table).
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(*marker))
    })
}
1816
/// Renders a panic payload as text: `&str` and `String` payloads are returned
/// as-is, anything else becomes `"unknown panic payload"`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
1826
1827fn extract_first_pdf_page_text(
1828    document: &mut pdf_oxide::document::PdfDocument,
1829) -> Result<String, Box<dyn std::error::Error>> {
1830    if document.page_count()? == 0 {
1831        return Ok(String::new());
1832    }
1833
1834    let extracted_text = document.extract_text(0)?;
1835    let markdown_text =
1836        document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1837    if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1838        return Ok(extracted_text);
1839    }
1840
1841    let pipeline_text =
1842        document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1843
1844    Ok(merge_pdf_first_page_text(
1845        &extracted_text,
1846        &markdown_text,
1847        &pipeline_text,
1848    ))
1849}
1850
1851fn extract_pdf_text_from_document(
1852    document: &mut pdf_oxide::document::PdfDocument,
1853) -> Result<String, Box<dyn std::error::Error>> {
1854    Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1855}
1856
/// Replaces every carriage return and form feed in `text` with a newline.
///
/// Returns `None` when the result contains nothing but whitespace; otherwise
/// returns the converted text (surrounding whitespace is kept).
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized: String = text
        .chars()
        .map(|ch| match ch {
            '\r' | '\u{0c}' => '\n',
            other => other,
        })
        .collect();

    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}
1861
1862fn merge_pdf_first_page_text(
1863    _extracted_text: &str,
1864    markdown_text: &str,
1865    pipeline_text: &str,
1866) -> String {
1867    let pipeline = pipeline_text.trim();
1868    if pipeline.is_empty() {
1869        return String::new();
1870    }
1871
1872    let prefix = pdf_first_page_heading_prefix(markdown_text);
1873    let Some(prefix) = prefix else {
1874        return pipeline_text.to_string();
1875    };
1876
1877    if pdf_text_contains_heading_prefix(pipeline, &prefix) {
1878        pipeline_text.to_string()
1879    } else {
1880        format!("{prefix}\n\n{pipeline}")
1881    }
1882}
1883
1884fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
1885    normalize_pdf_heading_comparison_text(text)
1886        .contains(&normalize_pdf_heading_comparison_text(prefix))
1887}
1888
/// Produces a comparison key for heading text: whitespace-separated words are
/// ASCII-lowercased and joined with single spaces, so leading/trailing
/// whitespace and runs of whitespace (including newlines) do not matter.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    for word in text.split_whitespace() {
        // Words are never empty, so a non-empty buffer means a word precedes this one.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.extend(word.chars().map(|ch| ch.to_ascii_lowercase()));
    }

    normalized
}
1895
1896fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1897    let mut lines = Vec::new();
1898
1899    for line in pdf_markdown_heading_lines(markdown_text) {
1900        push_unique_line(&mut lines, line);
1901    }
1902
1903    (!lines.is_empty()).then(|| lines.join("\n"))
1904}
1905
1906fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1907    text.lines()
1908        .map(str::trim)
1909        .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1910        .map(|line| line.trim_matches('#').trim())
1911        .filter(|line| !line.is_empty())
1912        .filter(|line| !looks_like_numbered_section_heading(line))
1913        .take(4)
1914        .map(ToOwned::to_owned)
1915        .collect()
1916}
1917
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    let already_present = lines.contains(&line);
    if already_present {
        return;
    }
    lines.push(line);
}
1923
/// Reports whether `line` starts like a numbered section heading: one or more
/// ASCII digits immediately followed by a `.` (for example `1. Definitions`,
/// `1.2 Scope`, or `10. Termination`).
///
/// Returns `false` for empty input, for lines that do not start with a digit,
/// and for digit runs that are not directly followed by `.`.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    // Skip the whole leading digit run so multi-digit section numbers
    // ("10.", "12.3") are recognised, not just single-digit ones.
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    let has_leading_digits = after_digits.len() < line.len();
    has_leading_digits && after_digits.starts_with('.')
}
1936
/// Reports whether `bytes` begins with one of the four-byte `PK` signatures
/// `PK\x03\x04`, `PK\x05\x06`, or `PK\x07\x08`.
///
/// Per the ZIP APPNOTE these are the local-file-header, end-of-central-directory
/// and data-descriptor/spanning signatures respectively.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..] | [b'P', b'K', 0x05, 0x06, ..] | [b'P', b'K', 0x07, 0x08, ..]
    )
}
1942
/// Extracts runs of printable ASCII from arbitrary bytes, one run per line.
///
/// Three passes are made over `bytes`, and their results are appended to the
/// same newline-separated output:
///
/// 1. Contiguous runs of printable ASCII bytes (`0x20..=0x7E`).
/// 2. Two-byte pairs starting at offset 0 where the first byte is printable
///    and the second is NUL (ASCII stored as 16-bit little-endian units).
/// 3. Two-byte pairs starting at offset 1 where the first byte is NUL and the
///    second is printable.
///
/// Runs shorter than four characters are dropped. Extraction stops early once
/// the output length reaches a limit equal to the input length clamped to
/// 2,000,000..=16,000,000 bytes. The limit is checked only after a run has
/// been appended, so the returned string may exceed it by part of one run.
///
/// NOTE(review): for wide text aligned at an even offset, pass 3 re-emits the
/// run found by pass 2 minus its first character (e.g. `a\0b\0c\0d\0e\0`
/// yields `abcde` and then `bcde`). Confirm whether that duplication is
/// intended before changing it; existing expected outputs may rely on it.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    fn is_printable_ascii(byte: u8) -> bool {
        (0x20..=0x7E).contains(&byte)
    }

    // Appends `candidate` as a new line of `output` when it is long enough.
    fn emit(output: &mut String, candidate: &[u8]) {
        if candidate.len() < MIN_LEN {
            return;
        }
        if !output.is_empty() {
            output.push('\n');
        }
        output.push_str(&String::from_utf8_lossy(candidate));
    }

    let limit = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);
    let mut output = String::new();

    // Pass 1: every maximal stretch between non-printable bytes is a candidate run.
    for segment in bytes.split(|&byte| !is_printable_ascii(byte)) {
        emit(&mut output, segment);
        if output.len() >= limit {
            return output;
        }
    }

    // Passes 2 and 3: the characters of a wide run are not contiguous in the
    // input, so they are gathered into a scratch buffer first.
    let mut wide_run: Vec<u8> = Vec::new();
    for offset in 0..=1_usize {
        wide_run.clear();

        let window = bytes.get(offset..).unwrap_or(&[]);
        for pair in window.chunks_exact(2) {
            let (candidate, padding) = if offset == 0 {
                (pair[0], pair[1])
            } else {
                (pair[1], pair[0])
            };

            if padding == 0 && is_printable_ascii(candidate) {
                wide_run.push(candidate);
                continue;
            }

            emit(&mut output, &wide_run);
            wide_run.clear();
            if output.len() >= limit {
                return output;
            }
        }

        emit(&mut output, &wide_run);
        wide_run.clear();
        if output.len() >= limit {
            return output;
        }
    }

    output
}
2007
2008#[cfg(test)]
2009mod tests {
2010    use std::path::Path;
2011
2012    use super::{
2013        ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, classify_file_info,
2014        extract_printable_strings, extract_text_for_detection,
2015        extract_text_for_detection_with_diagnostics, is_non_actionable_pdf_failure,
2016        normalize_mime_type, normalize_pdf_heading_comparison_text,
2017        windows_metadata_or_empty_result,
2018    };
2019
2020    #[test]
2021    fn test_extract_text_for_detection_skips_jar_archives() {
2022        let path = Path::new(
2023            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
2024        );
2025        let bytes = std::fs::read(path).expect("failed to read jar fixture");
2026
2027        let (text, kind) = extract_text_for_detection(path, &bytes);
2028
2029        assert!(text.is_empty());
2030        assert_eq!(kind, ExtractedTextKind::None);
2031    }
2032
2033    #[test]
2034    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
2035        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2036        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2037
2038        let (text, kind) = extract_text_for_detection(path, &bytes);
2039
2040        assert_eq!(kind, ExtractedTextKind::Pdf);
2041        assert!(text.contains("Redistribution and use in source and binary forms"));
2042    }
2043
2044    #[test]
2045    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
2046        let path =
2047            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2048        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2049
2050        let (text, kind) = extract_text_for_detection(path, &bytes);
2051
2052        assert_eq!(kind, ExtractedTextKind::Pdf);
2053        assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
2054        assert!(!text.contains("DISCLAIMER OF WARRANTY"));
2055    }
2056
2057    #[test]
2058    fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
2059        let path =
2060            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2061        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2062
2063        let (text, kind) = extract_text_for_detection(path, &bytes);
2064
2065        assert_eq!(kind, ExtractedTextKind::Pdf);
2066
2067        let normalized = normalize_pdf_heading_comparison_text(&text);
2068        let heading =
2069            normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
2070        assert_eq!(normalized.matches(&heading).count(), 1);
2071    }
2072
2073    #[test]
2074    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
2075        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2076        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2077
2078        let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
2079
2080        assert_eq!(kind, ExtractedTextKind::Pdf);
2081        assert!(text.contains("Redistribution and use in source and binary forms"));
2082    }
2083
2084    #[test]
2085    fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
2086        let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";
2087
2088        let (text, kind, scan_error) =
2089            extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);
2090
2091        assert!(text.is_empty());
2092        assert_eq!(kind, ExtractedTextKind::None);
2093        let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
2094        assert!(scan_error.contains("PDF text extraction failed after"));
2095    }
2096
2097    #[test]
2098    fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
2099        let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2100
2101        let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
2102
2103        assert!(text.is_empty());
2104        assert_eq!(kind, ExtractedTextKind::None);
2105    }
2106
2107    #[test]
2108    fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
2109        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2110        let text = b"Copyright 2026 Example Project!!!";
2111        bytes[..text.len()].copy_from_slice(text);
2112        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2113        bytes[second_offset..second_offset + text.len()].copy_from_slice(text);
2114
2115        let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);
2116
2117        assert_ne!(kind, ExtractedTextKind::None);
2118        assert!(text.contains("Copyright 2026 Example Project"));
2119    }
2120
2121    #[test]
2122    fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
2123        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2124        let noise = b"(c) $1234567890ABCDEF[]{}--==++";
2125        bytes[..noise.len()].copy_from_slice(noise);
2126        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2127        bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);
2128
2129        let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);
2130
2131        assert!(text.is_empty());
2132        assert_eq!(kind, ExtractedTextKind::None);
2133    }
2134
2135    #[test]
2136    fn test_extract_text_for_detection_uses_windows_executable_metadata() {
2137        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2138        let bytes = std::fs::read(path).expect("read PE fixture");
2139
2140        let (text, kind) = extract_text_for_detection(path, &bytes);
2141
2142        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2143        assert!(text.contains("License: This program is free software"));
2144        assert!(text.contains("LegalCopyright:"));
2145    }
2146
2147    #[test]
2148    fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
2149    {
2150        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2151        let mut bytes = std::fs::read(path).expect("read PE fixture");
2152        bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);
2153
2154        let (text, kind) = extract_text_for_detection(path, &bytes);
2155
2156        assert_ne!(kind, ExtractedTextKind::None);
2157        assert!(!text.trim().is_empty());
2158    }
2159
2160    #[test]
2161    fn test_windows_metadata_or_empty_result_preserves_metadata() {
2162        let (text, kind, scan_error) =
2163            windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));
2164
2165        assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
2166        assert_eq!(text, "LegalCopyright: Example Corp");
2167        assert!(scan_error.is_none());
2168    }
2169
2170    #[test]
2171    fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
2172        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2173        let text = b"Copyright 2026 Example Project!!!";
2174        bytes[..text.len()].copy_from_slice(text);
2175
2176        let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);
2177
2178        assert!(text.is_empty());
2179        assert_eq!(kind, ExtractedTextKind::None);
2180    }
2181
2182    #[test]
2183    fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
2184        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2185        let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
2186        bytes[..text.len()].copy_from_slice(text);
2187
2188        let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);
2189
2190        assert_ne!(kind, ExtractedTextKind::None);
2191        assert!(text.contains("asn@redhat.com"));
2192        assert!(text.contains("https://publicsuffix.org/"));
2193    }
2194
2195    #[test]
2196    fn test_non_actionable_pdf_failures_are_suppressed() {
2197        assert!(is_non_actionable_pdf_failure(&[
2198            "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
2199            "open full-document: PDF is encrypted and requires a password".to_string(),
2200        ]));
2201        assert!(is_non_actionable_pdf_failure(&[
2202            "from-bytes first-page: Invalid cross-reference table".to_string(),
2203            "open full-document: Invalid cross-reference table".to_string(),
2204        ]));
2205        assert!(is_non_actionable_pdf_failure(&[
2206            "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O".to_string(),
2207            "open full-document: Invalid PDF: security handler cannot be found".to_string(),
2208        ]));
2209        assert!(!is_non_actionable_pdf_failure(&[
2210            "from-bytes first-page: some other parser failure".to_string(),
2211        ]));
2212    }
2213
2214    #[test]
2215    fn test_extract_text_for_detection_skips_zip_like_archives() {
2216        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";
2217
2218        let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
2219        let (crate_text, crate_kind) =
2220            extract_text_for_detection(Path::new("demo.crate"), zip_bytes);
2221
2222        assert!(whl_text.is_empty());
2223        assert_eq!(whl_kind, ExtractedTextKind::None);
2224        assert!(crate_text.is_empty());
2225        assert_eq!(crate_kind, ExtractedTextKind::None);
2226    }
2227
2228    #[test]
2229    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
2230        let path =
2231            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
2232        let bytes = std::fs::read(path).expect("failed to read lib fixture");
2233
2234        let (text, kind) = extract_text_for_detection(path, &bytes);
2235
2236        assert_ne!(kind, ExtractedTextKind::None);
2237        assert!(text.contains("Copyright nexB and others (c) 2012"));
2238    }
2239
2240    #[test]
2241    fn test_extract_text_for_detection_reads_font_metadata() {
2242        let path = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
2243        let bytes = std::fs::read(path).expect("failed to read font fixture");
2244
2245        let (text, kind) = extract_text_for_detection(path, &bytes);
2246
2247        assert_eq!(kind, ExtractedTextKind::FontMetadata);
2248        assert!(text.contains("License Description:"), "{text}");
2249        assert!(
2250            text.contains("Open Font License") || text.contains("OFL"),
2251            "{text}"
2252        );
2253    }
2254
2255    #[test]
2256    fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
2257        let bytes = b"abcd\0".repeat(525_000);
2258
2259        let text = extract_printable_strings(&bytes);
2260
2261        assert!(
2262            text.len() > 2_000_000,
2263            "unexpected truncation at {}",
2264            text.len()
2265        );
2266        assert!(text.ends_with("abcd"));
2267    }
2268
2269    #[test]
2270    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
2271        let path = Path::new(
2272            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
2273        );
2274        let bytes = std::fs::read(path).expect("failed to read svg fixture");
2275
2276        let (text, kind) = extract_text_for_detection(path, &bytes);
2277
2278        assert_eq!(kind, ExtractedTextKind::Decoded);
2279        assert!(text.contains("creativecommons.org/licenses/publicdomain"));
2280    }
2281
2282    #[test]
2283    fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
2284        let path = Path::new(
2285            "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
2286        );
2287        let bytes = std::fs::read(path).expect("failed to read rtf fixture");
2288
2289        let (text, kind) = extract_text_for_detection(path, &bytes);
2290
2291        assert_eq!(kind, ExtractedTextKind::Decoded);
2292        assert!(text.contains("GNU Lesser General Public"));
2293        assert!(text.contains("version"));
2294        assert!(text.contains("2.1 of the License"));
2295    }
2296
2297    #[test]
2298    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
2299        assert_eq!(
2300            normalize_mime_type(
2301                Path::new("main.ts"),
2302                b"export const answer = 42;\n",
2303                Some("TypeScript"),
2304                "video/mp2t",
2305            ),
2306            "text/plain"
2307        );
2308    }
2309
2310    #[test]
2311    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
2312        assert_eq!(
2313            normalize_mime_type(
2314                Path::new("main.js"),
2315                b"console.log('hello');\n",
2316                Some("JavaScript"),
2317                "application/octet-stream",
2318            ),
2319            "text/plain"
2320        );
2321    }
2322
2323    #[test]
2324    fn test_normalize_mime_type_preserves_binary_video_guess() {
2325        assert_eq!(
2326            normalize_mime_type(
2327                Path::new("main.ts"),
2328                &[0, 159, 146, 150, 0, 1, 2, 3],
2329                Some("TypeScript"),
2330                "video/mp2t",
2331            ),
2332            "video/mp2t"
2333        );
2334    }
2335
2336    #[test]
2337    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
2338        assert_eq!(
2339            normalize_mime_type(
2340                Path::new("main.ts"),
2341                &[0, 159, 146, 150],
2342                Some("TypeScript"),
2343                "application/octet-stream",
2344            ),
2345            "application/octet-stream"
2346        );
2347    }
2348
2349    #[test]
2350    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
2351        let classification = classify_file_info(Path::new("test.txt"), b"");
2352
2353        assert_eq!(classification.mime_type, "inode/x-empty");
2354        assert_eq!(classification.file_type, "empty");
2355        assert!(!classification.is_binary);
2356        assert!(classification.is_text);
2357        assert!(!classification.is_source);
2358        assert_eq!(classification.programming_language, None);
2359    }
2360
2361    #[test]
2362    fn test_classify_file_info_keeps_json_out_of_programming_language() {
2363        let classification = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);
2364
2365        assert_eq!(classification.mime_type, "application/json");
2366        assert_eq!(classification.file_type, "JSON text data");
2367        assert!(classification.is_text);
2368        assert!(!classification.is_source);
2369        assert_eq!(classification.programming_language, None);
2370    }
2371
2372    #[test]
2373    fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
2374        let classification =
2375            classify_file_info(Path::new("broken.json"), b"{ definitely not json\n");
2376
2377        assert_eq!(classification.mime_type, "text/plain");
2378        assert_eq!(classification.file_type, "UTF-8 Unicode text");
2379        assert!(classification.is_text);
2380        assert!(!classification.is_binary);
2381    }
2382
2383    #[test]
2384    fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
2385        let classification =
2386            classify_file_info(Path::new("broken.json"), &[0xff, 0x00, 0x01, 0x02]);
2387
2388        assert_eq!(classification.mime_type, "application/octet-stream");
2389        assert_eq!(classification.file_type, "data");
2390        assert!(classification.is_binary);
2391        assert!(!classification.is_text);
2392    }
2393
2394    #[test]
2395    fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
2396        let classification = classify_file_info(
2397            Path::new("utf16.json"),
2398            &[
2399                0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
2400            ],
2401        );
2402
2403        assert!(!classification.is_binary);
2404        assert!(classification.is_text);
2405        assert_eq!(classification.mime_type, "application/json");
2406        assert_eq!(classification.file_type, "JSON text data");
2407    }
2408
2409    #[test]
2410    fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
2411        let classification = classify_file_info(Path::new("true.json"), b"true");
2412
2413        assert!(!classification.is_binary);
2414        assert!(classification.is_text);
2415        assert_eq!(classification.mime_type, "application/json");
2416        assert_eq!(classification.file_type, "JSON text data");
2417    }
2418
2419    #[test]
2420    fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
2421        let classification = classify_file_info(
2422            Path::new("wrapped.json"),
2423            &[0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D],
2424        );
2425
2426        assert!(!classification.is_binary);
2427        assert!(classification.is_text);
2428        assert_eq!(classification.mime_type, "text/plain");
2429        assert_eq!(classification.file_type, "text, with no line terminators");
2430    }
2431
2432    #[test]
2433    fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
2434        let classification =
2435            classify_file_info(Path::new("lone-ff.json"), &[0x5B, 0x22, 0xFF, 0x22, 0x5D]);
2436
2437        assert!(classification.is_binary);
2438        assert!(!classification.is_text);
2439        assert_eq!(classification.mime_type, "application/octet-stream");
2440        assert_eq!(classification.file_type, "data");
2441    }
2442
2443    #[test]
2444    fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
2445        let classification = classify_file_info(
2446            Path::new("crash.json"),
2447            &[
2448                0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
2449            ],
2450        );
2451
2452        assert!(classification.is_binary);
2453        assert!(!classification.is_text);
2454        assert_eq!(classification.mime_type, "application/octet-stream");
2455    }
2456
2457    #[test]
2458    fn test_classify_file_info_treats_dockerfile_as_source() {
2459        let classification = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");
2460
2461        assert_eq!(
2462            classification.programming_language.as_deref(),
2463            Some("Dockerfile")
2464        );
2465        assert!(classification.is_source);
2466        assert!(!classification.is_script);
2467        assert_eq!(
2468            classification.file_type,
2469            "Dockerfile source, UTF-8 Unicode text"
2470        );
2471    }
2472
2473    #[test]
2474    fn test_classify_file_info_treats_makefile_as_text_not_source() {
2475        let classification = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");
2476
2477        assert_eq!(classification.programming_language, None);
2478        assert!(classification.is_text);
2479        assert!(!classification.is_source);
2480        assert!(!classification.is_script);
2481        assert_eq!(classification.file_type, "UTF-8 Unicode text");
2482    }
2483
2484    #[test]
2485    fn test_classify_file_info_marks_supported_package_archives() {
2486        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";
2487
2488        let egg = classify_file_info(Path::new("demo.egg"), zip_bytes);
2489        let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_bytes);
2490
2491        assert!(egg.is_archive);
2492        assert_eq!(egg.mime_type, "application/zip");
2493        assert_eq!(egg.file_type, "Zip archive data");
2494        assert!(nupkg.is_archive);
2495        assert_eq!(nupkg.mime_type, "application/zip");
2496        assert_eq!(nupkg.file_type, "Zip archive data");
2497    }
2498
2499    #[test]
2500    fn test_classify_file_info_marks_png_as_binary_media() {
2501        let png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";
2502
2503        let classification = classify_file_info(Path::new("logo.png"), png_bytes);
2504
2505        assert_eq!(classification.mime_type, "image/png");
2506        assert_eq!(classification.file_type, "PNG image data");
2507        assert!(classification.is_binary);
2508        assert!(!classification.is_text);
2509        assert!(classification.is_media);
2510        assert!(!classification.is_archive);
2511        assert!(!classification.is_source);
2512    }
2513
2514    #[test]
2515    fn test_classify_file_info_marks_pdf_as_binary_document() {
2516        let pdf_bytes = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";
2517
2518        let classification = classify_file_info(Path::new("report.pdf"), pdf_bytes);
2519
2520        assert_eq!(classification.mime_type, "application/pdf");
2521        assert_eq!(classification.file_type, "PDF document");
2522        assert!(classification.is_binary);
2523        assert!(!classification.is_text);
2524        assert!(!classification.is_archive);
2525        assert!(!classification.is_media);
2526    }
2527
2528    #[test]
2529    fn test_classify_file_info_marks_binary_blobs_as_binary() {
2530        let classification =
2531            classify_file_info(Path::new("blob.bin"), &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5]);
2532
2533        assert!(classification.is_binary);
2534        assert!(!classification.is_text);
2535        assert!(!classification.is_source);
2536        assert_eq!(classification.programming_language, None);
2537    }
2538
2539    #[test]
2540    fn test_classify_file_info_treats_yaml_as_text_not_source() {
2541        let classification = classify_file_info(Path::new("config.yaml"), b"key: value\n");
2542
2543        assert_eq!(classification.programming_language, None);
2544        assert!(classification.is_text);
2545        assert!(!classification.is_source);
2546        assert_eq!(classification.file_type, "YAML text data");
2547    }
2548
2549    #[test]
2550    fn test_classify_file_info_classifies_common_build_manifests() {
2551        let gradle = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
2552        let flake = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
2553        let cmake = classify_file_info(
2554            Path::new("toolchain.cmake"),
2555            b"set(CMAKE_CXX_STANDARD 20)\n",
2556        );
2557        let gitmodules = classify_file_info(
2558            Path::new(".gitmodules"),
2559            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
2560        );
2561
2562        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
2563        assert!(gradle.is_source);
2564        assert_eq!(gradle.mime_type, "text/plain");
2565        assert_eq!(gradle.file_type, "Groovy source, UTF-8 Unicode text");
2566
2567        assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
2568        assert!(flake.is_source);
2569        assert_eq!(flake.mime_type, "text/plain");
2570        assert_eq!(flake.file_type, "Nix source, UTF-8 Unicode text");
2571
2572        assert_eq!(cmake.programming_language.as_deref(), Some("CMake"));
2573        assert!(cmake.is_source);
2574        assert_eq!(cmake.file_type, "CMake source, UTF-8 Unicode text");
2575
2576        assert_eq!(gitmodules.programming_language, None);
2577        assert!(gitmodules.is_text);
2578        assert!(!gitmodules.is_source);
2579        assert_eq!(gitmodules.file_type, "Git configuration text");
2580    }
2581
2582    #[test]
2583    fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
2584        let header = classify_file_info(
2585            Path::new("include/demo.hpp"),
2586            b"#pragma once\nclass Demo {};\n",
2587        );
2588        let ipp = classify_file_info(
2589            Path::new("include/detail/demo.ipp"),
2590            b"template <class T> void parse() {}\n",
2591        );
2592
2593        assert_eq!(header.programming_language.as_deref(), Some("C++"));
2594        assert!(header.is_source);
2595        assert!(!header.is_script);
2596        assert_eq!(header.file_type, "C++ source, UTF-8 Unicode text");
2597
2598        assert_eq!(ipp.programming_language, None);
2599        assert!(!ipp.is_source);
2600        assert!(!ipp.is_script);
2601        assert_eq!(ipp.file_type, "UTF-8 Unicode text");
2602    }
2603
2604    #[test]
2605    fn test_classify_file_info_preserves_specific_shell_family_labels() {
2606        let bash = classify_file_info(Path::new("bin/run"), b"#!/usr/bin/env bash\necho hi\n");
2607
2608        assert_eq!(bash.programming_language.as_deref(), Some("Bash"));
2609        assert!(bash.is_script);
2610        assert_eq!(bash.file_type, "bash script, UTF-8 Unicode text executable");
2611    }
2612
2613    #[test]
2614    fn test_classify_file_info_marks_jamfile_as_source() {
2615        let jamfile = classify_file_info(Path::new("Jamfile"), b"lib boost_json ;\n");
2616
2617        assert_eq!(jamfile.programming_language.as_deref(), Some("Jamfile"));
2618        assert!(jamfile.is_source);
2619        assert!(!jamfile.is_script);
2620        assert_eq!(jamfile.file_type, "Jamfile source, UTF-8 Unicode text");
2621    }
2622
2623    #[test]
2624    fn test_classify_file_info_labels_javascript_shebang_scripts() {
2625        let classification = classify_file_info(
2626            Path::new("bin/run"),
2627            b"#!/usr/bin/env node\nconsole.log('hello');\n",
2628        );
2629
2630        assert_eq!(
2631            classification.programming_language.as_deref(),
2632            Some("JavaScript")
2633        );
2634        assert!(classification.is_script);
2635        assert_eq!(
2636            classification.file_type,
2637            "javascript script, UTF-8 Unicode text executable"
2638        );
2639    }
2640
2641    #[test]
2642    fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
2643        let classification = classify_file_info(
2644            Path::new("script.py"),
2645            b"# coding: latin-1\nprint(\"caf\xe9\")\n",
2646        );
2647
2648        assert_eq!(
2649            classification.programming_language.as_deref(),
2650            Some("Python")
2651        );
2652        assert!(classification.is_script);
2653        assert_eq!(classification.file_type, "python script, text executable");
2654    }
2655
2656    #[test]
2657    fn test_classify_file_info_treats_textual_tga_as_media() {
2658        let classification = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");
2659
2660        assert!(classification.is_media);
2661        assert!(classification.is_text);
2662        assert!(!classification.is_binary);
2663    }
2664
2665    #[test]
2666    fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
2667        let classification =
2668            classify_file_info(Path::new("main.ts"), &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85]);
2669
2670        assert!(classification.is_binary);
2671        assert!(!classification.is_text);
2672        assert!(!classification.is_source);
2673        assert_eq!(classification.programming_language, None);
2674    }
2675
2676    #[test]
2677    fn test_extract_text_for_detection_skips_unsupported_image_formats() {
2678        let gif_bytes = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";
2679
2680        let (text, kind) = extract_text_for_detection(Path::new("tiny.gif"), gif_bytes);
2681
2682        assert!(text.is_empty());
2683        assert_eq!(kind, ExtractedTextKind::None);
2684    }
2685
2686    #[test]
2687    fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
2688        let cases = [
2689            (
2690                Path::new("bin/run"),
2691                b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
2692                Some("JavaScript"),
2693                true,
2694                true,
2695            ),
2696            (
2697                Path::new("Dockerfile"),
2698                b"FROM scratch\n".as_slice(),
2699                Some("Dockerfile"),
2700                true,
2701                false,
2702            ),
2703            (
2704                Path::new("package.json"),
2705                br#"{"name":"demo"}"#.as_slice(),
2706                None,
2707                false,
2708                false,
2709            ),
2710            (
2711                Path::new("config.yaml"),
2712                b"key: value\n".as_slice(),
2713                None,
2714                false,
2715                false,
2716            ),
2717            (
2718                Path::new("Makefile"),
2719                b"all:\n\techo hi\n".as_slice(),
2720                None,
2721                false,
2722                false,
2723            ),
2724        ];
2725
2726        for (path, bytes, expected_language, expected_is_source, expected_is_script) in cases {
2727            let classification = classify_file_info(path, bytes);
2728
2729            assert_eq!(
2730                classification.programming_language.as_deref(),
2731                expected_language,
2732                "unexpected language for {}",
2733                path.display()
2734            );
2735            assert_eq!(
2736                classification.is_source,
2737                expected_is_source,
2738                "unexpected is_source for {}",
2739                path.display()
2740            );
2741            assert_eq!(
2742                classification.is_script,
2743                expected_is_script,
2744                "unexpected is_script for {}",
2745                path.display()
2746            );
2747        }
2748    }
2749}