Skip to main content

provenant/utils/
file.rs

1use std::borrow::Cow;
2use std::collections::BTreeSet;
3use std::fs;
4use std::io::{BufReader, Cursor, Read};
5use std::panic::{AssertUnwindSafe, catch_unwind};
6use std::path::Path;
7
8use chrono::{TimeZone, Utc};
9use file_format::{FileFormat, Kind as FileFormatKind};
10use flate2::read::ZlibDecoder;
11use glob::Pattern;
12use image::{ImageDecoder, ImageFormat, ImageReader};
13use mime_guess::from_path;
14use quick_xml::events::Event;
15use quick_xml::reader::Reader as XmlReader;
16
17use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
18use crate::utils::font::extract_font_metadata_text;
19use crate::utils::language::detect_language;
20
/// Identifies which extraction strategy produced the text returned by
/// `extract_text_for_detection` and its diagnostics-returning sibling.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No usable text was extracted.
    None,
    /// Text obtained by decoding the file bytes (also used for RTF content).
    Decoded,
    /// Text taken from font metadata.
    FontMetadata,
    /// Text extracted from a PDF document.
    Pdf,
    /// Printable strings pulled out of content that could not be decoded as text.
    BinaryStrings,
    /// Text taken from image metadata.
    ImageMetadata,
    /// Windows executable metadata returned on its own, without any other text.
    WindowsExecutableMetadata,
}
31
/// Result of `classify_file_info`: the detected types and category flags for one file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type as computed by `detect_mime_type`.
    pub mime_type: String,
    /// Human-readable file type description as computed by `detect_file_type`.
    pub file_type: String,
    /// Detected language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content was classified as binary.
    pub is_binary: bool,
    /// Always the negation of `is_binary`.
    pub is_text: bool,
    /// Whether the file was classified as an archive (never true for text files).
    pub is_archive: bool,
    /// Whether the file was classified as image, audio, or video content.
    pub is_media: bool,
    /// Whether the file was classified as source code.
    pub is_source: bool,
    /// Whether the file was classified as a script.
    pub is_script: bool,
}
44
// NOTE(review): the two image-metadata limits and the large-binary threshold are
// consumed outside the portion of the file reviewed here; their names suggest caps on
// extracted metadata values/bytes and a size cut-off for opaque binaries — confirm.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
// Content counts as binary when more than `len / DIVISOR` bytes are control characters
// (see `has_binary_control_chars`).
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
// Inputs larger than this are never parsed as JSON (see `has_valid_json_text`).
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;
// Lowercase extensions that `is_plain_text` treats as plain text (never source code).
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
// Lowercase extensions that `detect_is_binary` treats as binary unless the detected
// format is textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
// Lowercase extensions that `detect_is_archive` accepts as archives for non-text files.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
60
61/// Get the last modified date of a file as a `YYYY-MM-DD` string.
62pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
63    metadata.modified().ok().map(|time: std::time::SystemTime| {
64        let seconds_since_epoch = time
65            .duration_since(std::time::UNIX_EPOCH)
66            .unwrap()
67            .as_secs() as i64;
68
69        Utc.timestamp_opt(seconds_since_epoch, 0)
70            .single()
71            .unwrap_or_else(Utc::now)
72            .format("%Y-%m-%d")
73            .to_string()
74    })
75}
76
77/// Check if a path should be excluded based on a list of glob patterns.
78pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
79    let path_str = path.to_string_lossy();
80    let file_name = path
81        .file_name()
82        .map(|name| name.to_string_lossy())
83        .unwrap_or_default();
84
85    for pattern in exclude_patterns {
86        // Match against full path
87        if pattern.matches(&path_str) {
88            return true;
89        }
90
91        // Match against just the file/directory name
92        if pattern.matches(&file_name) {
93            return true;
94        }
95    }
96
97    false
98}
99
100/// Decode a byte buffer to a String, trying UTF-8 first, then Latin-1.
101///
102/// Latin-1 (ISO-8859-1) maps bytes 0x00-0xFF directly to Unicode U+0000-U+00FF,
103/// so it can decode any byte sequence. This matches Python ScanCode's use of
104/// `UnicodeDammit` which auto-detects encoding with Latin-1 as fallback.
105pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
106    match String::from_utf8(bytes.to_vec()) {
107        Ok(s) => s,
108        Err(e) => {
109            let bytes = e.into_bytes();
110            if has_binary_control_chars(&bytes) {
111                return String::new();
112            }
113            bytes.iter().map(|&b| b as char).collect()
114        }
115    }
116}
117
118pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
119    let (text, kind, _) = extract_text_for_detection_with_diagnostics(path, bytes);
120    (text, kind)
121}
122
123pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
124    let Some(extension) = path.extension().and_then(|ext| ext.to_str()) else {
125        return Cow::Borrowed(text);
126    };
127    if !matches!(
128        extension.to_ascii_lowercase().as_str(),
129        "md" | "markdown" | "html" | "htm"
130    ) {
131        return Cow::Borrowed(text);
132    }
133
134    let mut hints = Vec::new();
135    if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
136        hints.push("Creative Commons Attribution 4.0 International License".to_string());
137    }
138    if text.contains("Apache License (Version 2.0)") || text.contains("Apache License, Version 2.0")
139    {
140        hints.push(
141            "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
142                .to_string(),
143        );
144    }
145
146    hints.extend(extract_shields_license_badge_hints(text));
147
148    if hints.is_empty() {
149        Cow::Borrowed(text)
150    } else {
151        let mut augmented =
152            String::with_capacity(text.len() + hints.iter().map(String::len).sum::<usize>() + 8);
153        augmented.push_str(text);
154        augmented.push_str("\n\n");
155        for (index, hint) in hints.into_iter().enumerate() {
156            if index > 0 {
157                augmented.push('\n');
158            }
159            augmented.push_str(&hint);
160        }
161        Cow::Owned(augmented)
162    }
163}
164
165fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
166    let mut hints = Vec::new();
167    let mut rest = text;
168    let needle = "img.shields.io/badge/license-";
169
170    while let Some(index) = rest.find(needle) {
171        let start = index + needle.len();
172        let suffix = &rest[start..];
173        let end = suffix
174            .find([')', ']', '"', '\'', ' ', '\n'])
175            .unwrap_or(suffix.len());
176        let badge = &suffix[..end];
177        let Some(badge) = badge.strip_suffix(".svg") else {
178            rest = &suffix[end..];
179            continue;
180        };
181
182        let mut segments: Vec<_> = badge
183            .split('-')
184            .filter(|segment| !segment.is_empty())
185            .collect();
186        if segments.len() < 2 {
187            rest = &suffix[end..];
188            continue;
189        }
190        segments.pop();
191        let candidate = segments.join("-").replace("%20", " ").replace('_', "-");
192        if !candidate.is_empty() {
193            hints.push(canonical_shields_license_hint(&candidate));
194        }
195
196        rest = &suffix[end..];
197    }
198
199    hints.sort();
200    hints.dedup();
201    hints
202}
203
/// Turn a license name taken from a badge into a phrase suited to license detection.
///
/// Surrounding whitespace is ignored. `MIT` and the two Apache 2.0 spellings get fixed
/// phrases; anything else becomes `"<name> License"`.
fn canonical_shields_license_hint(candidate: &str) -> String {
    let name = candidate.trim();

    if name == "MIT" {
        return "The MIT License".to_owned();
    }
    if name == "Apache-2.0" || name == "Apache 2.0" {
        return "Apache License 2.0".to_owned();
    }

    format!("{name} License")
}
211
/// Extract text suitable for detection from a file, trying format-specific extractors
/// before falling back to generic decoding and printable-string extraction.
///
/// Returns `(text, kind, diagnostic)`. The diagnostic is only ever populated from the
/// PDF extractor's scan error, and only when no PDF text was produced.
///
/// Strategies are attempted in this order, each returning as soon as it applies:
/// RTF, PDF, image metadata, font metadata, Windows executable metadata / binary skip
/// rules, plain decoding, and finally printable strings.
pub(crate) fn extract_text_for_detection_with_diagnostics(
    path: &Path,
    bytes: &[u8],
) -> (String, ExtractedTextKind, Option<String>) {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(|s| s.to_ascii_lowercase());
    let detected_format = detect_file_format(bytes);

    // RTF: whitespace-only output is reported as "no text".
    if looks_like_rtf(bytes, ext.as_deref()) {
        let text = extract_rtf_text(bytes);
        return if text.trim().is_empty() {
            (String::new(), ExtractedTextKind::None, None)
        } else {
            (text, ExtractedTextKind::Decoded, None)
        };
    }

    // PDF: the scan error is surfaced only when extraction yielded no text.
    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
        let (text, scan_error) = extract_pdf_text(path, bytes);
        return if text.is_empty() {
            (String::new(), ExtractedTextKind::None, scan_error)
        } else {
            (text, ExtractedTextKind::Pdf, None)
        };
    }

    // Images: prefer embedded metadata. With no metadata, a file that really is a
    // supported image container yields nothing; otherwise the bytes are decoded as text.
    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
        let text = extract_image_metadata_text(bytes, format);
        return if text.is_empty() {
            if is_supported_image_container(bytes, format) {
                (String::new(), ExtractedTextKind::None, None)
            } else {
                let decoded = decode_bytes_to_string(bytes);
                if decoded.is_empty() {
                    (String::new(), ExtractedTextKind::None, None)
                } else {
                    (decoded, ExtractedTextKind::Decoded, None)
                }
            }
        } else {
            (text, ExtractedTextKind::ImageMetadata, None)
        };
    }

    if let Some(text) = extract_font_metadata_text(path, bytes) {
        return (text, ExtractedTextKind::FontMetadata, None);
    }

    // A file only counts as a "large opaque binary" when no Windows executable
    // metadata could be extracted from it.
    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
    let large_opaque_binary = windows_executable_metadata_text.is_none()
        && is_large_opaque_binary_candidate(bytes, detected_format);

    // Skipping text extraction still returns any Windows metadata that was found.
    if should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format) {
        return windows_metadata_or_empty_result(windows_executable_metadata_text);
    }

    // NOTE(review): this early return drops any Windows metadata found above — confirm
    // that is intended.
    if should_skip_binary_string_extraction(path, bytes, detected_format) {
        return (String::new(), ExtractedTextKind::None, None);
    }

    // Plain decoding is attempted only for files not flagged as large opaque binaries;
    // Windows metadata, when present, is prepended to the decoded text.
    if !large_opaque_binary {
        let decoded = decode_bytes_to_string(bytes);
        if !decoded.is_empty() {
            let combined =
                combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
            return (combined, ExtractedTextKind::Decoded, None);
        }
    }

    // Last resort: printable strings (a sampled variant for large opaque binaries).
    let text = if large_opaque_binary {
        extract_sampled_printable_strings(bytes)
    } else {
        extract_printable_strings(bytes)
    };
    if text.is_empty() {
        windows_metadata_or_empty_result(windows_executable_metadata_text)
    } else {
        (
            combine_extracted_text_fragments(windows_executable_metadata_text, text),
            ExtractedTextKind::BinaryStrings,
            None,
        )
    }
}
298
/// Join an optional leading fragment and a trailing fragment with a newline.
///
/// A missing or empty `prefix` yields `suffix` unchanged; an empty `suffix` yields the
/// prefix alone, so no dangling newline is ever produced.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(head) = prefix.filter(|fragment| !fragment.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        head
    } else {
        format!("{head}\n{suffix}")
    }
}
306
307fn windows_metadata_or_empty_result(
308    windows_executable_metadata_text: Option<String>,
309) -> (String, ExtractedTextKind, Option<String>) {
310    if let Some(metadata_text) = windows_executable_metadata_text {
311        (
312            metadata_text,
313            ExtractedTextKind::WindowsExecutableMetadata,
314            None,
315        )
316    } else {
317        (String::new(), ExtractedTextKind::None, None)
318    }
319}
320
321pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
322    let detected_format = detect_file_format(bytes);
323    let detected_language = detect_language(path, bytes);
324    let is_binary = detect_is_binary(path, bytes, detected_format, detected_language.as_deref());
325    let is_text = !is_binary;
326    let mime_type = detect_mime_type(path, bytes, detected_format, detected_language.as_deref());
327    let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
328    let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
329    let is_script = detect_is_script(path, bytes, detected_language.as_deref(), is_text);
330    let is_source = detect_is_source(path, detected_language.as_deref(), is_text, is_script);
331    let programming_language = is_source.then(|| detected_language.clone()).flatten();
332    let file_type = detect_file_type(
333        path,
334        bytes,
335        detected_format,
336        &mime_type,
337        programming_language.as_deref(),
338        is_binary,
339        is_text,
340        is_archive,
341        is_media,
342        is_script,
343    );
344
345    FileInfoClassification {
346        mime_type,
347        file_type,
348        programming_language,
349        is_binary,
350        is_text,
351        is_archive,
352        is_media,
353        is_source,
354        is_script,
355    }
356}
357
358fn detect_file_format(bytes: &[u8]) -> FileFormat {
359    FileFormat::from_reader(Cursor::new(bytes)).unwrap_or(FileFormat::ArbitraryBinaryData)
360}
361
/// Report whether `bytes` is entirely valid UTF-8 (an empty slice counts as valid).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}
365
/// Decode UTF-16 text that starts with a byte-order mark.
///
/// Returns `None` unless the input has an even length, begins with a little- or
/// big-endian BOM, contains at least one code unit after the BOM, and decodes without
/// unpaired surrogates.
fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
    if bytes.len() < 2 || !bytes.len().is_multiple_of(2) {
        return None;
    }

    let (bom, payload) = bytes.split_at(2);
    let to_code_unit: fn([u8; 2]) -> u16 = match bom {
        [0xFF, 0xFE] => u16::from_le_bytes,
        [0xFE, 0xFF] => u16::from_be_bytes,
        _ => return None,
    };

    // The total length is even, so the payload is too; only emptiness needs checking.
    if payload.is_empty() {
        return None;
    }

    let code_units = payload
        .chunks_exact(2)
        .map(|pair| to_code_unit([pair[0], pair[1]]));

    std::char::decode_utf16(code_units)
        .collect::<Result<String, _>>()
        .ok()
}
396
397fn has_binary_control_chars(bytes: &[u8]) -> bool {
398    let control_count = bytes
399        .iter()
400        .filter(|&&b| b < 0x09 || (b > 0x0D && b < 0x20))
401        .count();
402    control_count > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
403}
404
405fn has_decodable_text(bytes: &[u8]) -> bool {
406    bytes.is_empty()
407        || is_utf8_text(bytes)
408        || decode_utf16_bom_text(bytes).is_some()
409        || !has_binary_control_chars(bytes)
410}
411
412fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
413    if bytes.is_empty() || is_utf8_text(bytes) {
414        return true;
415    }
416    if let Some(decoded) = decode_utf16_bom_text(bytes) {
417        return decoded
418            .chars()
419            .any(|ch| !ch.is_control() || matches!(ch, '\n' | '\r' | '\t'));
420    }
421
422    let printable_count = bytes
423        .iter()
424        .filter(|&&b| matches!(b, b'\n' | b'\r' | b'\t') || (0x20..=0x7e).contains(&b))
425        .count();
426    printable_count * 2 >= bytes.len()
427}
428
/// Report whether a media type denotes textual content: any `text/*` type, JSON or XML
/// application types, or a structured-syntax `+json` / `+xml` suffix.
fn is_textual_media_type(media_type: &str) -> bool {
    // `text/xml` needs no special case; it is already covered by the `text/` prefix.
    if media_type.starts_with("text/") {
        return true;
    }
    if media_type == "application/json" || media_type == "application/xml" {
        return true;
    }
    media_type.ends_with("+json") || media_type.ends_with("+xml")
}
438
439fn is_textual_format(detected_format: FileFormat) -> bool {
440    matches!(detected_format, FileFormat::Empty | FileFormat::PlainText)
441        || is_textual_media_type(detected_format.media_type())
442}
443
444fn is_known_binary_format(detected_format: FileFormat) -> bool {
445    !matches!(detected_format, FileFormat::ArbitraryBinaryData)
446        && !is_textual_format(detected_format)
447}
448
449pub fn detect_mime_type(
450    path: &Path,
451    bytes: &[u8],
452    detected_format: FileFormat,
453    programming_language: Option<&str>,
454) -> String {
455    if bytes.is_empty() {
456        return "inode/x-empty".to_string();
457    }
458
459    if lower_extension(path).as_deref() == Some("json") {
460        if let Some(is_binary) = json_binary_override(bytes) {
461            if is_binary {
462                return "application/octet-stream".to_string();
463            }
464            if has_valid_json_text(bytes) {
465                return "application/json".to_string();
466            }
467            return "text/plain".to_string();
468        }
469        if has_valid_json_text(bytes) {
470            return "application/json".to_string();
471        }
472        if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) {
473            return "text/plain".to_string();
474        }
475        return "application/octet-stream".to_string();
476    }
477
478    if is_zip_archive(bytes) {
479        return detect_zip_like_mime(path);
480    }
481
482    if looks_like_deb(bytes, path) {
483        return "application/vnd.debian.binary-package".to_string();
484    }
485
486    if looks_like_rpm(bytes, path) {
487        return "application/x-rpm".to_string();
488    }
489
490    let guessed_mime = from_path(path)
491        .first_or_octet_stream()
492        .essence_str()
493        .to_string();
494
495    let mime_type = match detected_format {
496        FileFormat::Empty => "inode/x-empty".to_string(),
497        FileFormat::PlainText => {
498            if guessed_mime == "application/octet-stream" || guessed_mime.starts_with("video/") {
499                "text/plain".to_string()
500            } else {
501                guessed_mime.clone()
502            }
503        }
504        _ => {
505            let detected_mime = detected_format.media_type();
506            if detected_mime == "application/octet-stream"
507                && guessed_mime != "application/octet-stream"
508            {
509                guessed_mime.clone()
510            } else {
511                detected_mime.to_string()
512            }
513        }
514    };
515
516    normalize_mime_type(path, bytes, programming_language, &mime_type)
517}
518
519fn normalize_mime_type(
520    path: &Path,
521    bytes: &[u8],
522    programming_language: Option<&str>,
523    mime_type: &str,
524) -> String {
525    if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
526        return "text/plain".to_string();
527    }
528
529    mime_type.to_string()
530}
531
532fn should_prefer_text_mime(
533    path: &Path,
534    bytes: &[u8],
535    programming_language: Option<&str>,
536    mime_type: &str,
537) -> bool {
538    has_decodable_text(bytes)
539        && looks_like_textual_bytes(bytes)
540        && is_textual_source_candidate(path, programming_language)
541        && (mime_type.starts_with("video/") || mime_type == "application/octet-stream")
542}
543
544fn has_valid_json_text(bytes: &[u8]) -> bool {
545    if bytes.len() > JSON_VALIDATION_MAX_BYTES {
546        return false;
547    }
548
549    serde_json::from_slice::<serde_json::Value>(bytes).is_ok()
550        || decode_utf16_bom_text(bytes)
551            .and_then(|text| serde_json::from_str::<serde_json::Value>(&text).ok())
552            .is_some()
553}
554
/// Report whether `bytes` has the shape `["…"]`: at least 8 bytes long, starting with
/// `["`, ending with `"]`, and containing neither NUL nor 0xFF bytes.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    if bytes.len() < 8 {
        return false;
    }

    let is_wrapped = bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]");
    is_wrapped && !bytes.iter().any(|&byte| byte == 0 || byte == 0xFF)
}
562
563fn json_binary_override(bytes: &[u8]) -> Option<bool> {
564    if has_valid_json_text(bytes) || decode_utf16_bom_text(bytes).is_some() {
565        return Some(false);
566    }
567
568    if bytes.contains(&0) {
569        return Some(true);
570    }
571
572    if bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024) {
573        return Some(true);
574    }
575
576    if is_wrapped_invalid_json_string_text(bytes) {
577        return Some(false);
578    }
579
580    None
581}
582
583fn detect_is_binary(
584    path: &Path,
585    bytes: &[u8],
586    detected_format: FileFormat,
587    programming_language: Option<&str>,
588) -> bool {
589    if lower_extension(path).as_deref() == Some("json")
590        && let Some(is_binary) = json_binary_override(bytes)
591    {
592        return is_binary;
593    }
594
595    if is_textual_format(detected_format) {
596        return false;
597    }
598
599    if lower_extension(path)
600        .as_deref()
601        .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext))
602    {
603        return true;
604    }
605
606    if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
607        return false;
608    }
609
610    has_binary_control_chars(bytes)
611        || is_known_binary_format(detected_format)
612        || (matches!(detected_format, FileFormat::ArbitraryBinaryData)
613            && !looks_like_textual_bytes(bytes))
614}
615
616fn should_treat_binary_bytes_as_text(
617    path: &Path,
618    bytes: &[u8],
619    programming_language: Option<&str>,
620) -> bool {
621    has_decodable_text(bytes)
622        && looks_like_textual_bytes(bytes)
623        && (bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language))
624}
625
626fn detect_is_archive(
627    path: &Path,
628    bytes: &[u8],
629    mime_type: &str,
630    is_text: bool,
631    detected_format: FileFormat,
632) -> bool {
633    if is_text {
634        return false;
635    }
636
637    lower_extension(path)
638        .as_deref()
639        .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext))
640        || matches!(
641            detected_format.kind(),
642            FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
643        )
644        || is_zip_archive(bytes)
645        || looks_like_gzip(bytes)
646        || looks_like_bzip2(bytes)
647        || looks_like_xz(bytes)
648        || looks_like_deb(bytes, path)
649        || looks_like_rpm(bytes, path)
650        || looks_like_squashfs(bytes, path)
651        || mime_type.contains("zip")
652        || mime_type.contains("compressed")
653        || mime_type.contains("tar")
654        || mime_type.contains("x-rpm")
655        || mime_type.contains("debian")
656}
657
658fn detect_is_media(
659    path: &Path,
660    bytes: &[u8],
661    mime_type: &str,
662    detected_format: FileFormat,
663) -> bool {
664    media_mime_from_content(bytes).is_some()
665        || matches!(
666            detected_format.kind(),
667            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
668        )
669        || mime_type.starts_with("image/")
670        || mime_type.starts_with("audio/")
671        || mime_type.starts_with("video/")
672        || (mime_type == "application/octet-stream"
673            && lower_extension(path).as_deref() == Some("tga")
674            && !has_binary_control_chars(bytes))
675}
676
677fn detect_is_script(
678    path: &Path,
679    bytes: &[u8],
680    programming_language: Option<&str>,
681    is_text: bool,
682) -> bool {
683    if !is_text || is_makefile(path) {
684        return false;
685    }
686
687    bytes.starts_with(b"#!")
688        || lower_extension(path).as_deref().is_some_and(|ext| {
689            matches!(
690                ext,
691                "sh" | "bash" | "zsh" | "fish" | "ksh" | "ps1" | "psm1" | "psd1" | "awk"
692            )
693        })
694        || matches!(
695            programming_language,
696            Some(
697                "Shell"
698                    | "Bash"
699                    | "Zsh"
700                    | "Fish"
701                    | "Ksh"
702                    | "Python"
703                    | "Ruby"
704                    | "Perl"
705                    | "PHP"
706                    | "PowerShell"
707                    | "Awk"
708            )
709        )
710}
711
712fn detect_is_source(
713    path: &Path,
714    programming_language: Option<&str>,
715    is_text: bool,
716    is_script: bool,
717) -> bool {
718    if !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path) {
719        return false;
720    }
721
722    if is_c_like_source(path) || is_java_like_source(path) {
723        return true;
724    }
725
726    programming_language.is_some() || is_script
727}
728
729#[allow(clippy::too_many_arguments)]
730fn detect_file_type(
731    path: &Path,
732    bytes: &[u8],
733    detected_format: FileFormat,
734    mime_type: &str,
735    programming_language: Option<&str>,
736    is_binary: bool,
737    is_text: bool,
738    is_archive: bool,
739    is_media: bool,
740    is_script: bool,
741) -> String {
742    if bytes.is_empty() {
743        return "empty".to_string();
744    }
745
746    if looks_like_pdf(bytes) {
747        return "PDF document".to_string();
748    }
749
750    if let Some(file_type) = media_file_type_from_content(bytes) {
751        return file_type.to_string();
752    }
753
754    if is_archive {
755        return archive_file_type(path, bytes, detected_format);
756    }
757
758    if is_script {
759        return script_file_type(programming_language, bytes);
760    }
761
762    if is_text {
763        if lower_extension(path).as_deref() == Some("json") {
764            if has_valid_json_text(bytes) {
765                return "JSON text data".to_string();
766            }
767            return text_file_type(bytes);
768        }
769        if lower_extension(path).as_deref() == Some("xml") {
770            return "XML text data".to_string();
771        }
772        if matches!(lower_extension(path).as_deref(), Some("yaml" | "yml")) {
773            return "YAML text data".to_string();
774        }
775        if lower_extension(path).as_deref() == Some("toml") {
776            return "TOML text data".to_string();
777        }
778        if matches!(
779            lower_extension(path).as_deref(),
780            Some("ini" | "cfg" | "conf")
781        ) {
782            return "INI text data".to_string();
783        }
784        if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
785            return "Git configuration text".to_string();
786        }
787        if matches!(lower_extension(path).as_deref(), Some("md" | "markdown")) {
788            return text_file_type(bytes);
789        }
790        if programming_language.is_some() && !is_media {
791            return source_file_type(programming_language, bytes);
792        }
793        return text_file_type(bytes);
794    }
795
796    if let Some(file_type) = format_based_file_type(detected_format) {
797        return file_type;
798    }
799
800    if is_binary && mime_type == "application/octet-stream" {
801        return "data".to_string();
802    }
803
804    mime_type.to_string()
805}
806
807fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
808    if matches!(programming_language, Some(language) if is_source_like_language(language)) {
809        return true;
810    }
811
812    if matches!(
813        lower_file_name(path).as_str(),
814        "dockerfile"
815            | "containerfile"
816            | "containerfile.core"
817            | "apkbuild"
818            | "podfile"
819            | "jamfile"
820            | "jamroot"
821            | "meson.build"
822            | "build"
823            | "workspace"
824            | "buck"
825            | "default.nix"
826            | "flake.nix"
827            | "shell.nix"
828    ) {
829        return true;
830    }
831
832    path.extension()
833        .and_then(|ext| ext.to_str())
834        .is_some_and(|ext| {
835            matches!(
836                ext.to_ascii_lowercase().as_str(),
837                "rs" | "py"
838                    | "js"
839                    | "mjs"
840                    | "cjs"
841                    | "jsx"
842                    | "ts"
843                    | "mts"
844                    | "cts"
845                    | "tsx"
846                    | "c"
847                    | "cpp"
848                    | "cc"
849                    | "cxx"
850                    | "h"
851                    | "hpp"
852                    | "m"
853                    | "mm"
854                    | "s"
855                    | "asm"
856                    | "java"
857                    | "go"
858                    | "rb"
859                    | "php"
860                    | "pl"
861                    | "swift"
862                    | "sh"
863                    | "bash"
864                    | "zsh"
865                    | "fish"
866                    | "ksh"
867                    | "ps1"
868                    | "psm1"
869                    | "psd1"
870                    | "awk"
871                    | "kt"
872                    | "kts"
873                    | "dart"
874                    | "scala"
875                    | "groovy"
876                    | "gradle"
877                    | "gvy"
878                    | "gy"
879                    | "gsh"
880                    | "cs"
881                    | "fs"
882                    | "fsx"
883                    | "r"
884                    | "lua"
885                    | "jl"
886                    | "ex"
887                    | "exs"
888                    | "clj"
889                    | "cljs"
890                    | "cljc"
891                    | "hs"
892                    | "erl"
893                    | "nix"
894                    | "zig"
895                    | "bzl"
896                    | "bazel"
897                    | "star"
898                    | "sky"
899                    | "ml"
900                    | "mli"
901                    | "tex"
902            )
903        })
904}
905
/// Report whether a detected language name (matched exactly, case-sensitively) is one
/// whose files are treated as textual source.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}
952
/// Return the path's extension as a `&str`, or `None` when there is no extension or it
/// is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    path.extension()?.to_str()
}
956
957fn lower_extension(path: &Path) -> Option<String> {
958    extension(path).map(|ext| ext.to_ascii_lowercase())
959}
960
/// Return the final path component in ASCII lowercase, or an empty string when the
/// path has no file name or the name is not valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}
967
968fn is_plain_text(path: &Path) -> bool {
969    lower_extension(path)
970        .as_deref()
971        .is_some_and(|ext| PLAIN_TEXT_EXTENSIONS.contains(&ext))
972}
973
974fn is_makefile(path: &Path) -> bool {
975    matches!(lower_file_name(path).as_str(), "makefile" | "makefile.inc")
976}
977
/// Report whether the path ends in `.js.map` or `.css.map` (case-insensitive).
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}
982
983fn is_c_like_source(path: &Path) -> bool {
984    lower_extension(path).as_deref().is_some_and(|ext| {
985        matches!(
986            ext,
987            "c" | "cc"
988                | "cp"
989                | "cpp"
990                | "cxx"
991                | "c++"
992                | "h"
993                | "hh"
994                | "hpp"
995                | "hxx"
996                | "h++"
997                | "i"
998                | "ii"
999                | "m"
1000                | "s"
1001                | "asm"
1002        )
1003    })
1004}
1005
1006fn is_java_like_source(path: &Path) -> bool {
1007    lower_extension(path)
1008        .as_deref()
1009        .is_some_and(|ext| matches!(ext, "java" | "aj" | "jad" | "ajt"))
1010}
1011
1012fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
1013    match detected_format {
1014        FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText => None,
1015        format if format.short_name() == Some("PDF") => Some("PDF document".to_string()),
1016        format => Some(match format.kind() {
1017            FileFormatKind::Image => short_name_or_name(&format, "image data"),
1018            FileFormatKind::Audio => short_name_or_name(&format, "audio data"),
1019            FileFormatKind::Video => short_name_or_name(&format, "video data"),
1020            _ => format.name().to_string(),
1021        }),
1022    }
1023}
1024
1025fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
1026    format
1027        .short_name()
1028        .map(|short_name| format!("{short_name} {suffix}"))
1029        .unwrap_or_else(|| format!("{} {suffix}", format.name()))
1030}
1031
1032fn detect_zip_like_mime(path: &Path) -> String {
1033    match extension(path).map(|ext| ext.to_ascii_lowercase()) {
1034        Some(ext) if ext == "apk" => "application/vnd.android.package-archive".to_string(),
1035        Some(ext) if matches!(ext.as_str(), "jar" | "war" | "ear") => {
1036            "application/java-archive".to_string()
1037        }
1038        _ => "application/zip".to_string(),
1039    }
1040}
1041
/// Maps the leading magic bytes of `bytes` to an image MIME type.
///
/// Recognises PNG, JPEG, TIFF (both byte orders) and WebP (a `RIFF` header
/// with `WEBP` at offset 8); returns `None` for anything else.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("image/png"),
        [0xff, 0xd8, 0xff, ..] => Some("image/jpeg"),
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("image/tiff"),
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => Some("image/webp"),
        _ => None,
    }
}
1055
/// Maps the leading magic bytes of `bytes` to a descriptive image file type.
///
/// Recognises PNG, JPEG, TIFF (both byte orders) and WebP (a `RIFF` header
/// with `WEBP` at offset 8); returns `None` for anything else.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PNG_MAGIC: &[u8] = b"\x89PNG\r\n\x1a\n";
    const JPEG_MAGIC: &[u8] = &[0xff, 0xd8, 0xff];
    const TIFF_LITTLE_ENDIAN_MAGIC: &[u8] = b"II\x2a\x00";
    const TIFF_BIG_ENDIAN_MAGIC: &[u8] = b"MM\x00\x2a";

    if bytes.starts_with(PNG_MAGIC) {
        return Some("PNG image data");
    }
    if bytes.starts_with(JPEG_MAGIC) {
        return Some("JPEG image data");
    }
    if bytes.starts_with(TIFF_LITTLE_ENDIAN_MAGIC) || bytes.starts_with(TIFF_BIG_ENDIAN_MAGIC) {
        return Some("TIFF image data");
    }

    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);
    if is_webp {
        Some("WebP image data")
    } else {
        None
    }
}
1069
/// Reports whether `bytes` begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}
1073
/// Reports whether the input should be treated as RTF: either `ext` is exactly
/// `rtf`, or `bytes` begins with `{\rtf`.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if let Some("rtf") = ext {
        return true;
    }
    bytes.starts_with(b"{\\rtf")
}
1077
/// Best-effort extraction of readable text from RTF markup.
///
/// The input is decoded lossily as UTF-8 and scanned once:
/// - group braces are dropped;
/// - `\\`, `\{` and `\}` yield the escaped character;
/// - `\'hh` yields the character whose code point equals the hex byte `hh`;
/// - `\par`, `\line`, `\tab`, the dash, bullet and quote control words are
///   replaced by their text equivalents;
/// - `\uN` yields the Unicode character `N` (negative values are offset by
///   65 536) and the fallback that follows it is skipped;
/// - every other control word or control symbol is dropped.
///
/// Group contents are not interpreted, so plain text inside any group is
/// emitted. Finally `\r` and form feeds become newlines and trailing
/// whitespace is trimmed from each line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    let text = String::from_utf8_lossy(bytes);
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;

    while index < chars.len() {
        match chars[index] {
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                if index >= chars.len() {
                    break;
                }

                match chars[index] {
                    '\\' | '{' | '}' => {
                        output.push(chars[index]);
                        index += 1;
                    }
                    '\'' => {
                        // `\'hh`: two hex digits naming a single byte.
                        let decoded = chars
                            .get(index + 1..index + 3)
                            .map(|digits| digits.iter().collect::<String>())
                            .and_then(|hex| u8::from_str_radix(&hex, 16).ok());
                        match decoded {
                            Some(value) => {
                                output.push(char::from(value));
                                index += 3;
                            }
                            // Malformed escape: drop the quote and rescan what follows.
                            None => index += 1,
                        }
                    }
                    control if control.is_ascii_alphabetic() => {
                        let start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[start..index].iter().collect();

                        // Optional numeric parameter, possibly negative.
                        let number_start = index;
                        if index < chars.len()
                            && (chars[index] == '-' || chars[index].is_ascii_digit())
                        {
                            index += 1;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space after a control word is a delimiter, not text.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            "u" => {
                                if let Some(ch) = rtf_unicode_char(&parameter) {
                                    output.push(ch);
                                }
                                index = skip_rtf_unicode_fallback(&chars, index);
                            }
                            _ => {}
                        }
                    }
                    _ => {
                        index += 1;
                    }
                }
            }
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}

/// Converts the numeric parameter of a `\uN` control word into a character.
///
/// Negative values are offset by 65 536 before conversion. Returns `None`
/// when the parameter is not a number or does not name a valid `char`.
fn rtf_unicode_char(parameter: &str) -> Option<char> {
    let codepoint = parameter.parse::<i32>().ok()?;
    let normalized = if codepoint < 0 {
        codepoint + 65_536
    } else {
        codepoint
    };
    u32::try_from(normalized).ok().and_then(char::from_u32)
}

/// Returns the index just past the fallback that follows a `\uN` escape.
///
/// The fallback is either one plain character or one `\'hh` hex escape.
/// Braces, line breaks and any other backslash sequence are left in place so
/// the main loop handles them normally.
fn skip_rtf_unicode_fallback(chars: &[char], index: usize) -> usize {
    match chars.get(index) {
        Some('\\') => {
            let is_hex_escape = chars.get(index + 1) == Some(&'\'')
                && chars.get(index + 2).is_some_and(char::is_ascii_hexdigit)
                && chars.get(index + 3).is_some_and(char::is_ascii_hexdigit);
            if is_hex_escape { index + 4 } else { index }
        }
        Some('{' | '}' | '\n' | '\r') | None => index,
        Some(_) => index + 1,
    }
}
1184
/// Reports whether `bytes` begins with the two-byte gzip magic `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}
1188
/// Reports whether `bytes` begins with the bzip2 magic `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    const BZIP2_MAGIC: &[u8] = b"BZh";

    bytes.get(..BZIP2_MAGIC.len()) == Some(BZIP2_MAGIC)
}
1192
/// Reports whether `bytes` begins with the six-byte XZ magic
/// `fd '7' 'z' 'X' 'Z' 00`.
fn looks_like_xz(bytes: &[u8]) -> bool {
    matches!(bytes, [0xfd, b'7', b'z', b'X', b'Z', 0x00, ..])
}
1196
1197fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
1198    lower_extension(path).as_deref() == Some("deb") && bytes.starts_with(b"!<arch>\n")
1199}
1200
1201fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
1202    lower_extension(path).as_deref() == Some("rpm") && bytes.starts_with(&[0xed, 0xab, 0xee, 0xdb])
1203}
1204
1205fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
1206    lower_extension(path)
1207        .as_deref()
1208        .is_some_and(|ext| matches!(ext, "sqs" | "squashfs"))
1209        && (bytes.starts_with(&[0x68, 0x73, 0x71, 0x73])
1210            || bytes.starts_with(&[0x73, 0x71, 0x73, 0x68]))
1211}
1212
1213fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
1214    if looks_like_deb(bytes, path) {
1215        "debian binary package (format 2.0)".to_string()
1216    } else if looks_like_rpm(bytes, path) {
1217        "RPM package".to_string()
1218    } else if looks_like_squashfs(bytes, path) {
1219        "Squashfs filesystem".to_string()
1220    } else if looks_like_gzip(bytes) {
1221        "gzip compressed data".to_string()
1222    } else if looks_like_bzip2(bytes) {
1223        "bzip2 compressed data".to_string()
1224    } else if looks_like_xz(bytes) {
1225        "XZ compressed data".to_string()
1226    } else if is_zip_archive(bytes) {
1227        "Zip archive data".to_string()
1228    } else if lower_extension(path).as_deref() == Some("gem") {
1229        "POSIX tar archive".to_string()
1230    } else if let Some(file_type) = format_based_file_type(detected_format) {
1231        file_type
1232    } else {
1233        "archive data".to_string()
1234    }
1235}
1236
1237fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1238    let suffix = text_executable_label(bytes);
1239
1240    match programming_language {
1241        Some("Python") => format!("python script, {suffix}"),
1242        Some("Ruby") => format!("ruby script, {suffix}"),
1243        Some("Perl") => format!("perl script, {suffix}"),
1244        Some("PHP") => format!("php script, {suffix}"),
1245        Some("Shell") => format!("shell script, {suffix}"),
1246        Some("Bash") => format!("bash script, {suffix}"),
1247        Some("Zsh") => format!("zsh script, {suffix}"),
1248        Some("Fish") => format!("fish script, {suffix}"),
1249        Some("Ksh") => format!("ksh script, {suffix}"),
1250        Some("JavaScript") => format!("javascript script, {suffix}"),
1251        Some("TypeScript") => format!("typescript script, {suffix}"),
1252        Some("PowerShell") => format!("powershell script, {suffix}"),
1253        Some("Awk") => format!("awk script, {suffix}"),
1254        _ => format!("script, {suffix}"),
1255    }
1256}
1257
1258fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
1259    let suffix = text_label(bytes);
1260    match programming_language {
1261        Some("C") => format!("C source, {suffix}"),
1262        Some("C++") => format!("C++ source, {suffix}"),
1263        Some("Java") => format!("Java source, {suffix}"),
1264        Some("C#") => format!("C# source, {suffix}"),
1265        Some("F#") => format!("F# source, {suffix}"),
1266        Some("Go") => format!("Go source, {suffix}"),
1267        Some("Rust") => format!("Rust source, {suffix}"),
1268        Some("Starlark") => format!("Starlark source, {suffix}"),
1269        Some("CMake") => format!("CMake source, {suffix}"),
1270        Some("Meson") => format!("Meson source, {suffix}"),
1271        Some("Nix") => format!("Nix source, {suffix}"),
1272        Some("Groovy") => format!("Groovy source, {suffix}"),
1273        Some("Makefile") => format!("Makefile source, {suffix}"),
1274        Some("Dockerfile") => format!("Dockerfile source, {suffix}"),
1275        Some("Jamfile") => format!("Jamfile source, {suffix}"),
1276        Some("Batchfile") => format!("Batchfile source, {suffix}"),
1277        Some(language) => format!("{language} source, {suffix}"),
1278        None => text_file_type(bytes),
1279    }
1280}
1281
1282fn text_file_type(bytes: &[u8]) -> String {
1283    text_label(bytes).to_string()
1284}
1285
/// Describes text content by two properties: whether `bytes` is valid UTF-8
/// and whether it contains at least one `\n`.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}
1299
/// Describes executable text content by two properties: whether `bytes` is
/// valid UTF-8 and whether it contains at least one `\n`.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let has_line_break = bytes.iter().any(|&byte| byte == b'\n');

    if std::str::from_utf8(bytes).is_err() {
        return if has_line_break {
            "text executable"
        } else {
            "text executable, with no line terminators"
        };
    }

    if has_line_break {
        "UTF-8 Unicode text executable"
    } else {
        "UTF-8 Unicode text executable, with no line terminators"
    }
}
1313
1314fn supported_image_metadata_format(
1315    ext: Option<&str>,
1316    detected_format: FileFormat,
1317) -> Option<ImageFormat> {
1318    match ext {
1319        Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
1320        Some("png") => Some(ImageFormat::Png),
1321        Some("tif" | "tiff") => Some(ImageFormat::Tiff),
1322        Some("webp") => Some(ImageFormat::WebP),
1323        _ => match detected_format.media_type() {
1324            "image/jpeg" => Some(ImageFormat::Jpeg),
1325            "image/png" => Some(ImageFormat::Png),
1326            "image/tiff" => Some(ImageFormat::Tiff),
1327            "image/webp" => Some(ImageFormat::WebP),
1328            _ => None,
1329        },
1330    }
1331}
1332
1333fn should_skip_binary_string_extraction(
1334    path: &Path,
1335    bytes: &[u8],
1336    detected_format: FileFormat,
1337) -> bool {
1338    matches!(lower_extension(path).as_deref(), Some("pdf"))
1339        || supported_image_metadata_format(lower_extension(path).as_deref(), detected_format)
1340            .is_some()
1341        || (matches!(
1342            detected_format.kind(),
1343            FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
1344        ) && !is_textual_format(detected_format))
1345        || media_mime_from_content(bytes).is_some()
1346        || is_zip_archive(bytes)
1347        || looks_like_gzip(bytes)
1348        || looks_like_bzip2(bytes)
1349        || looks_like_xz(bytes)
1350        || looks_like_deb(bytes, path)
1351        || looks_like_rpm(bytes, path)
1352        || looks_like_squashfs(bytes, path)
1353}
1354
1355fn should_skip_large_opaque_binary_text_extraction(
1356    _path: &Path,
1357    bytes: &[u8],
1358    detected_format: FileFormat,
1359) -> bool {
1360    is_large_opaque_binary_candidate(bytes, detected_format)
1361        && !sample_has_promising_printable_strings(bytes)
1362}
1363
1364fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
1365    bytes.len() >= LARGE_OPAQUE_BINARY_SKIP_BYTES
1366        && !is_textual_format(detected_format)
1367        && !matches!(
1368            detected_format.kind(),
1369            FileFormatKind::Archive
1370                | FileFormatKind::Compressed
1371                | FileFormatKind::Package
1372                | FileFormatKind::Audio
1373                | FileFormatKind::Image
1374                | FileFormatKind::Video
1375        )
1376}
1377
/// Computes up to three `(start, end)` byte ranges to sample from a buffer of
/// `len` bytes: a head window, a window centred on the midpoint, and a tail
/// window, each at most 64 KiB long.
///
/// The middle window is only produced when `len` exceeds two windows, and the
/// tail window only when `len` exceeds one. Empty and duplicate ranges are
/// dropped.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = (0, len.min(SAMPLE_WINDOW_BYTES));
    let middle = if len > SAMPLE_WINDOW_BYTES * 2 {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        Some((start, (start + SAMPLE_WINDOW_BYTES).min(len)))
    } else {
        None
    };
    let tail = if len > SAMPLE_WINDOW_BYTES {
        Some((len - SAMPLE_WINDOW_BYTES, len))
    } else {
        None
    };

    let mut ranges: Vec<(usize, usize)> = Vec::with_capacity(3);
    for window in Some(head).into_iter().chain(middle).chain(tail) {
        let (start, end) = window;
        if start < end && !ranges.contains(&window) {
            ranges.push(window);
        }
    }
    ranges
}
1400
1401fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
1402    let mut structured_signal_seen = false;
1403    let promising_license_windows = sampled_printable_window_ranges(bytes.len())
1404        .into_iter()
1405        .filter(|&(start, end)| {
1406            let window = &bytes[start..end];
1407            if has_strong_structured_text_signal(window) {
1408                structured_signal_seen = true;
1409            }
1410            has_license_or_notice_signal(window)
1411        })
1412        .count();
1413
1414    structured_signal_seen || promising_license_windows >= 2
1415}
1416
1417fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
1418    let mut combined_lines = BTreeSet::new();
1419
1420    for (start, end) in sampled_printable_window_ranges(bytes.len()) {
1421        let window_text = extract_printable_strings(&bytes[start..end]);
1422        for line in window_text
1423            .lines()
1424            .map(str::trim)
1425            .filter(|line| !line.is_empty())
1426        {
1427            combined_lines.insert(line.to_string());
1428        }
1429    }
1430
1431    combined_lines.into_iter().collect::<Vec<_>>().join("\n")
1432}
1433
1434fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
1435    let strings = extract_printable_strings(bytes);
1436    if strings.is_empty() {
1437        return false;
1438    }
1439
1440    let lower = strings.to_ascii_lowercase();
1441    [
1442        "copyright",
1443        "license",
1444        "licensed under",
1445        "all rights reserved",
1446        "permission is hereby granted",
1447        "redistribution and use",
1448        "spdx-license-identifier",
1449    ]
1450    .iter()
1451    .any(|marker| lower.contains(marker))
1452}
1453
1454fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
1455    let strings = extract_printable_strings(bytes);
1456    if strings.is_empty() {
1457        return false;
1458    }
1459
1460    let email_markers = strings.matches('@').count();
1461    let url_markers = strings.matches("http://").count() + strings.matches("https://").count();
1462
1463    email_markers + url_markers >= 3
1464}
1465
1466fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
1467    match format {
1468        ImageFormat::Png => bytes.starts_with(b"\x89PNG\r\n\x1a\n"),
1469        ImageFormat::Jpeg => bytes.starts_with(&[0xff, 0xd8, 0xff]),
1470        ImageFormat::Tiff => bytes.starts_with(b"II\x2a\x00") || bytes.starts_with(b"MM\x00\x2a"),
1471        ImageFormat::WebP => {
1472            bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP"
1473        }
1474        _ => false,
1475    }
1476}
1477
1478fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
1479    let mut values = Vec::new();
1480    values.extend(extract_exif_metadata_values(bytes));
1481    values.extend(extract_xmp_metadata_values(bytes, format));
1482    values_to_text(values)
1483}
1484
1485fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
1486    let mut cursor = BufReader::new(Cursor::new(bytes));
1487    let exif = match exif::Reader::new().read_from_container(&mut cursor) {
1488        Ok(exif) => exif,
1489        Err(_) => return Vec::new(),
1490    };
1491
1492    let mut values = Vec::new();
1493    for field in exif.fields() {
1494        let rendered = match field.tag {
1495            exif::Tag::ImageDescription | exif::Tag::Copyright | exif::Tag::UserComment => {
1496                Some(field.display_value().with_unit(&exif).to_string())
1497            }
1498            exif::Tag::Artist => Some(format!(
1499                "Author: {}",
1500                field.display_value().with_unit(&exif)
1501            )),
1502            _ => None,
1503        };
1504
1505        if let Some(rendered) = rendered {
1506            values.push(rendered);
1507        }
1508    }
1509
1510    values
1511}
1512
1513fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
1514    let xmp = match extract_raw_xmp_packet(bytes, format) {
1515        Some(xmp) => xmp,
1516        None => return Vec::new(),
1517    };
1518
1519    parse_xmp_values(&xmp)
1520}
1521
1522fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
1523    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
1524    if let Ok(mut decoder) = reader.into_decoder()
1525        && let Ok(Some(xmp)) = decoder.xmp_metadata()
1526    {
1527        return Some(xmp);
1528    }
1529
1530    match format {
1531        ImageFormat::Png => extract_png_xmp_packet(bytes),
1532        _ => None,
1533    }
1534}
1535
1536fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
1537    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
1538
1539    if bytes.len() < PNG_SIGNATURE.len() || &bytes[..PNG_SIGNATURE.len()] != PNG_SIGNATURE {
1540        return None;
1541    }
1542
1543    let mut offset = PNG_SIGNATURE.len();
1544    while offset + 12 <= bytes.len() {
1545        let length = u32::from_be_bytes([
1546            bytes[offset],
1547            bytes[offset + 1],
1548            bytes[offset + 2],
1549            bytes[offset + 3],
1550        ]) as usize;
1551        let chunk_start = offset + 8;
1552        let chunk_end = chunk_start + length;
1553        if chunk_end + 4 > bytes.len() {
1554            return None;
1555        }
1556
1557        let chunk_type = &bytes[offset + 4..offset + 8];
1558        if chunk_type == b"iTXt" {
1559            let data = &bytes[chunk_start..chunk_end];
1560            if let Some(xmp) = parse_png_itxt_xmp(data) {
1561                return Some(xmp);
1562            }
1563        }
1564
1565        offset = chunk_end + 4;
1566    }
1567
1568    None
1569}
1570
1571fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
1572    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";
1573
1574    let keyword_end = data.iter().position(|&b| b == 0)?;
1575    if &data[..keyword_end] != XMP_KEYWORD {
1576        return None;
1577    }
1578
1579    let mut cursor = keyword_end + 1;
1580    let compression_flag = *data.get(cursor)?;
1581    cursor += 1;
1582    let compression_method = *data.get(cursor)?;
1583    cursor += 1;
1584    if compression_flag > 1 || (compression_flag == 1 && compression_method != 0) {
1585        return None;
1586    }
1587
1588    let language_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1589    cursor = language_end + 1;
1590
1591    let translated_end = cursor + data[cursor..].iter().position(|&b| b == 0)?;
1592    cursor = translated_end + 1;
1593
1594    let text_bytes = &data[cursor..];
1595    if compression_flag == 1 {
1596        let mut decoder = ZlibDecoder::new(text_bytes);
1597        let mut decoded = Vec::new();
1598        decoder.read_to_end(&mut decoded).ok()?;
1599        Some(decoded)
1600    } else {
1601        Some(text_bytes.to_vec())
1602    }
1603}
1604
1605fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
1606    let mut reader = XmlReader::from_reader(xmp);
1607    reader.config_mut().trim_text(true);
1608
1609    let mut buf = Vec::new();
1610    let mut stack: Vec<String> = Vec::new();
1611    let mut values = Vec::new();
1612
1613    loop {
1614        match reader.read_event_into(&mut buf) {
1615            Ok(Event::Start(e)) => {
1616                stack.push(local_xml_name(e.name().as_ref()));
1617            }
1618            Ok(Event::End(_)) => {
1619                stack.pop();
1620            }
1621            Ok(Event::Empty(_)) => {}
1622            Ok(Event::Text(text)) => {
1623                if let Some(field) = stack
1624                    .iter()
1625                    .rev()
1626                    .find_map(|name| allowed_xmp_field(name.as_str()))
1627                    && let Ok(decoded) = text.decode()
1628                {
1629                    let decoded = decoded.into_owned();
1630                    if !decoded.trim().is_empty() {
1631                        values.push(format_xmp_value(field, &decoded));
1632                    }
1633                }
1634            }
1635            Ok(Event::CData(text)) => {
1636                if let Some(field) = stack
1637                    .iter()
1638                    .rev()
1639                    .find_map(|name| allowed_xmp_field(name.as_str()))
1640                    && let Ok(decoded) = text.decode()
1641                {
1642                    let decoded = decoded.into_owned();
1643                    if !decoded.trim().is_empty() {
1644                        values.push(format_xmp_value(field, &decoded));
1645                    }
1646                }
1647            }
1648            Ok(Event::Eof) | Err(_) => break,
1649            _ => {}
1650        }
1651        buf.clear();
1652    }
1653
1654    values
1655}
1656
/// Returns the part of an XML name after its last `:`, or the whole name when
/// it has no prefix. Names that are not valid UTF-8 yield an empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    let local = match qualified.rsplit_once(':') {
        Some((_, local)) => local,
        None => qualified,
    };
    local.to_owned()
}
1661
/// Maps an XMP element's local name to the internal field identifier, or
/// `None` when the element is not one whose text should be collected.
/// Matching is case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    // (element local name, field identifier)
    const ALLOWED_FIELDS: &[(&str, &str)] = &[
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    ALLOWED_FIELDS
        .iter()
        .find(|entry| entry.0 == name)
        .map(|entry| entry.1)
}
1674
/// Renders an XMP value for output: `creator` values are prefixed with
/// `Author: `, and all other fields are returned unchanged.
fn format_xmp_value(field: &str, value: &str) -> String {
    if field == "creator" {
        format!("Author: {value}")
    } else {
        value.to_owned()
    }
}
1681
1682fn values_to_text(values: Vec<String>) -> String {
1683    let mut seen = BTreeSet::new();
1684    let mut lines = Vec::new();
1685    let mut total_bytes = 0usize;
1686
1687    for value in values {
1688        if lines.len() >= MAX_IMAGE_METADATA_VALUES {
1689            break;
1690        }
1691
1692        let normalized = normalize_metadata_value(&value);
1693        if normalized.is_empty() || !seen.insert(normalized.clone()) {
1694            continue;
1695        }
1696
1697        let added_bytes = normalized.len() + usize::from(!lines.is_empty());
1698        if total_bytes + added_bytes > MAX_IMAGE_METADATA_TEXT_BYTES {
1699            break;
1700        }
1701
1702        total_bytes += added_bytes;
1703        lines.push(normalized);
1704    }
1705
1706    lines.join("\n")
1707}
1708
/// Normalizes a metadata value: NUL characters are removed, every run of
/// whitespace collapses to a single space, and leading and trailing
/// whitespace is dropped.
fn normalize_metadata_value(value: &str) -> String {
    let without_nuls = value.replace('\0', "");

    let mut normalized = String::with_capacity(without_nuls.len());
    for word in without_nuls.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
1720
1721fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
1722    if bytes.len() < 5 || &bytes[..5] != b"%PDF-" {
1723        return (String::new(), None);
1724    }
1725
1726    let mut failures = Vec::new();
1727    let mut saw_success = false;
1728
1729    let extracted = catch_unwind(AssertUnwindSafe(
1730        || -> Result<String, Box<dyn std::error::Error>> {
1731            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1732            extract_first_pdf_page_text(&mut document)
1733        },
1734    ));
1735    match extracted {
1736        Ok(Ok(text)) => {
1737            saw_success = true;
1738            if let Some(normalized) = normalize_pdf_text(text) {
1739                return (normalized, None);
1740            }
1741        }
1742        Ok(Err(err)) => failures.push(format!("from-bytes first-page: {err}")),
1743        Err(payload) => failures.push(format!(
1744            "from-bytes first-page panic: {}",
1745            panic_payload_to_string(payload.as_ref())
1746        )),
1747    }
1748
1749    let extracted = catch_unwind(AssertUnwindSafe(
1750        || -> Result<String, Box<dyn std::error::Error>> {
1751            let mut document = pdf_oxide::document::PdfDocument::open(path)?;
1752            extract_pdf_text_from_document(&mut document)
1753        },
1754    ));
1755    match extracted {
1756        Ok(Ok(text)) => {
1757            saw_success = true;
1758            if let Some(normalized) = normalize_pdf_text(text) {
1759                return (normalized, None);
1760            }
1761        }
1762        Ok(Err(err)) => failures.push(format!("open full-document: {err}")),
1763        Err(payload) => failures.push(format!(
1764            "open full-document panic: {}",
1765            panic_payload_to_string(payload.as_ref())
1766        )),
1767    }
1768
1769    let extracted = catch_unwind(AssertUnwindSafe(
1770        || -> Result<String, Box<dyn std::error::Error>> {
1771            let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
1772            extract_pdf_text_from_document(&mut document)
1773        },
1774    ));
1775    match extracted {
1776        Ok(Ok(text)) => {
1777            saw_success = true;
1778            if let Some(normalized) = normalize_pdf_text(text) {
1779                return (normalized, None);
1780            }
1781        }
1782        Ok(Err(err)) => failures.push(format!("from-bytes full-document: {err}")),
1783        Err(payload) => failures.push(format!(
1784            "from-bytes full-document panic: {}",
1785            panic_payload_to_string(payload.as_ref())
1786        )),
1787    }
1788
1789    if saw_success || is_non_actionable_pdf_failure(&failures) {
1790        (String::new(), None)
1791    } else {
1792        (
1793            String::new(),
1794            Some(format!(
1795                "PDF text extraction failed after {} attempts: {}",
1796                failures.len(),
1797                failures.join("; ")
1798            )),
1799        )
1800    }
1801}
1802
/// Reports whether every recorded failure contains one of the messages below
/// (password or encryption problems, or a broken cross-reference table).
/// An empty failure list is never considered non-actionable.
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(*marker))
    })
}
1813
/// Renders a panic payload as text. `&str` and `String` payloads are returned
/// verbatim; any other payload type yields `unknown panic payload`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}
1823
1824fn extract_first_pdf_page_text(
1825    document: &mut pdf_oxide::document::PdfDocument,
1826) -> Result<String, Box<dyn std::error::Error>> {
1827    if document.page_count()? == 0 {
1828        return Ok(String::new());
1829    }
1830
1831    let extracted_text = document.extract_text(0)?;
1832    let markdown_text =
1833        document.to_markdown(0, &pdf_oxide::converters::ConversionOptions::default())?;
1834    if pdf_markdown_heading_lines(&markdown_text).is_empty() {
1835        return Ok(extracted_text);
1836    }
1837
1838    let pipeline_text =
1839        document.to_plain_text(0, &pdf_oxide::converters::ConversionOptions::default())?;
1840
1841    Ok(merge_pdf_first_page_text(
1842        &extracted_text,
1843        &markdown_text,
1844        &pipeline_text,
1845    ))
1846}
1847
1848fn extract_pdf_text_from_document(
1849    document: &mut pdf_oxide::document::PdfDocument,
1850) -> Result<String, Box<dyn std::error::Error>> {
1851    Ok(document.to_plain_text_all(&pdf_oxide::converters::ConversionOptions::default())?)
1852}
1853
/// Replaces every carriage return and form feed in `text` with a newline.
///
/// Returns `None` when the result contains nothing but whitespace, otherwise
/// the converted text (untrimmed).
fn normalize_pdf_text(text: String) -> Option<String> {
    let unified: String = text
        .chars()
        .map(|ch| if matches!(ch, '\r' | '\u{0c}') { '\n' } else { ch })
        .collect();

    if unified.trim().is_empty() {
        None
    } else {
        Some(unified)
    }
}
1858
1859fn merge_pdf_first_page_text(
1860    _extracted_text: &str,
1861    markdown_text: &str,
1862    pipeline_text: &str,
1863) -> String {
1864    let pipeline = pipeline_text.trim();
1865    if pipeline.is_empty() {
1866        return String::new();
1867    }
1868
1869    let prefix = pdf_first_page_heading_prefix(markdown_text);
1870    let Some(prefix) = prefix else {
1871        return pipeline_text.to_string();
1872    };
1873
1874    if pdf_text_contains_heading_prefix(pipeline, &prefix) {
1875        pipeline_text.to_string()
1876    } else {
1877        format!("{prefix}\n\n{pipeline}")
1878    }
1879}
1880
1881fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
1882    normalize_pdf_heading_comparison_text(text)
1883        .contains(&normalize_pdf_heading_comparison_text(prefix))
1884}
1885
/// Produces a comparison key for heading text: whitespace-separated words,
/// ASCII-lowercased, joined by single spaces. Non-ASCII characters keep their
/// case.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut normalized = String::with_capacity(text.len());

    for word in text.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first word.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(&word.to_ascii_lowercase());
    }

    normalized
}
1892
1893fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
1894    let mut lines = Vec::new();
1895
1896    for line in pdf_markdown_heading_lines(markdown_text) {
1897        push_unique_line(&mut lines, line);
1898    }
1899
1900    (!lines.is_empty()).then(|| lines.join("\n"))
1901}
1902
1903fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
1904    text.lines()
1905        .map(str::trim)
1906        .filter_map(|line| line.strip_prefix('#').map(str::trim_start))
1907        .map(|line| line.trim_matches('#').trim())
1908        .filter(|line| !line.is_empty())
1909        .filter(|line| !looks_like_numbered_section_heading(line))
1910        .take(4)
1911        .map(ToOwned::to_owned)
1912        .collect()
1913}
1914
/// Appends `line` to `lines` unless an equal entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}
1920
/// Reports whether `line` opens with a section number such as `1.`, `2.3`, or
/// `10.`.
///
/// The check is purely lexical: one or more leading ASCII digits immediately
/// followed by a `.`. Multi-digit numbers are accepted so that a heading like
/// `10. Termination` is classified the same way as `1. Definitions`; checking
/// only a single leading digit would let such headings through.
///
/// Returns `false` for an empty line, for a line that does not start with a
/// digit, and for digits that are not directly followed by a `.`.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // A shorter remainder proves at least one digit was consumed before the dot.
    after_digits.len() < line.len() && after_digits.starts_with('.')
}
1933
/// Reports whether `bytes` begins with one of three ZIP signatures:
/// `PK\x03\x04`, `PK\x05\x06`, or `PK\x07\x08`.
fn is_zip_archive(bytes: &[u8]) -> bool {
    matches!(
        bytes,
        [b'P', b'K', 0x03, 0x04, ..]
            | [b'P', b'K', 0x05, 0x06, ..]
            | [b'P', b'K', 0x07, 0x08, ..]
    )
}
1939
/// Collects printable-ASCII strings embedded in `bytes`, one per output line.
///
/// Three passes append to the same output, in this order:
///
/// 1. Runs of consecutive printable bytes (`0x20..=0x7E`).
/// 2. Byte pairs starting at offset 0, where the first byte is printable and
///    the second is zero (UTF-16LE-style text).
/// 3. Byte pairs starting at offset 1, where the second byte is printable and
///    the first is zero.
///
/// Only runs of at least four characters are kept. The output budget is the
/// input length clamped to 2,000,000..=16,000,000 bytes; it is checked after a
/// run is closed, so the final run may push the output past the budget before
/// the function returns early.
///
/// NOTE(review): pass 3 matches the same byte layout as even-aligned UTF-16LE
/// text, so such strings appear twice (the second time without their first
/// character). Confirm whether big-endian or odd-aligned coverage was intended.
pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_RUN_LEN: usize = 4;
    const OUTPUT_FLOOR_BYTES: usize = 2_000_000;
    const OUTPUT_CEILING_BYTES: usize = 16_000_000;

    fn is_printable(byte: u8) -> bool {
        (0x20..=0x7E).contains(&byte)
    }

    /// Moves `pending` into `output` (newline-separated) if it is long enough,
    /// then resets `pending` either way.
    fn commit_run(output: &mut String, pending: &mut String) {
        if pending.len() >= MIN_RUN_LEN {
            if !output.is_empty() {
                output.push('\n');
            }
            output.push_str(pending.as_str());
        }
        pending.clear();
    }

    let output_budget = bytes
        .len()
        .clamp(OUTPUT_FLOOR_BYTES, OUTPUT_CEILING_BYTES);
    let mut output = String::new();
    // Only printable ASCII is ever pushed here, so a `String` is a lossless buffer.
    let mut pending = String::new();

    // Pass 1: single-byte runs.
    for &byte in bytes {
        if is_printable(byte) {
            pending.push(char::from(byte));
            continue;
        }
        commit_run(&mut output, &mut pending);
        if output.len() >= output_budget {
            return output;
        }
    }
    commit_run(&mut output, &mut pending);
    if output.len() >= output_budget {
        return output;
    }

    // Passes 2 and 3: two-byte units starting at offset 0, then offset 1.
    for offset in [0_usize, 1] {
        pending.clear();

        let window = bytes.get(offset..).unwrap_or_default();
        for pair in window.chunks_exact(2) {
            let (candidate, padding) = if offset == 0 {
                (pair[0], pair[1])
            } else {
                (pair[1], pair[0])
            };

            if padding == 0 && is_printable(candidate) {
                pending.push(char::from(candidate));
            } else {
                commit_run(&mut output, &mut pending);
                if output.len() >= output_budget {
                    return output;
                }
            }
        }

        commit_run(&mut output, &mut pending);
        if output.len() >= output_budget {
            return output;
        }
    }

    output
}
2004
2005#[cfg(test)]
2006mod tests {
2007    use std::path::Path;
2008
2009    use super::{
2010        ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, classify_file_info,
2011        extract_printable_strings, extract_text_for_detection,
2012        extract_text_for_detection_with_diagnostics, is_non_actionable_pdf_failure,
2013        normalize_mime_type, normalize_pdf_heading_comparison_text,
2014        windows_metadata_or_empty_result,
2015    };
2016
2017    #[test]
2018    fn test_extract_text_for_detection_skips_jar_archives() {
2019        let path = Path::new(
2020            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
2021        );
2022        let bytes = std::fs::read(path).expect("failed to read jar fixture");
2023
2024        let (text, kind) = extract_text_for_detection(path, &bytes);
2025
2026        assert!(text.is_empty());
2027        assert_eq!(kind, ExtractedTextKind::None);
2028    }
2029
2030    #[test]
2031    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
2032        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2033        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2034
2035        let (text, kind) = extract_text_for_detection(path, &bytes);
2036
2037        assert_eq!(kind, ExtractedTextKind::Pdf);
2038        assert!(text.contains("Redistribution and use in source and binary forms"));
2039    }
2040
2041    #[test]
2042    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
2043        let path =
2044            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2045        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2046
2047        let (text, kind) = extract_text_for_detection(path, &bytes);
2048
2049        assert_eq!(kind, ExtractedTextKind::Pdf);
2050        assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
2051        assert!(!text.contains("DISCLAIMER OF WARRANTY"));
2052    }
2053
2054    #[test]
2055    fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
2056        let path =
2057            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
2058        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2059
2060        let (text, kind) = extract_text_for_detection(path, &bytes);
2061
2062        assert_eq!(kind, ExtractedTextKind::Pdf);
2063
2064        let normalized = normalize_pdf_heading_comparison_text(&text);
2065        let heading =
2066            normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
2067        assert_eq!(normalized.matches(&heading).count(), 1);
2068    }
2069
2070    #[test]
2071    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
2072        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
2073        let bytes = std::fs::read(path).expect("failed to read pdf fixture");
2074
2075        let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);
2076
2077        assert_eq!(kind, ExtractedTextKind::Pdf);
2078        assert!(text.contains("Redistribution and use in source and binary forms"));
2079    }
2080
2081    #[test]
2082    fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
2083        let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";
2084
2085        let (text, kind, scan_error) =
2086            extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);
2087
2088        assert!(text.is_empty());
2089        assert_eq!(kind, ExtractedTextKind::None);
2090        let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
2091        assert!(scan_error.contains("PDF text extraction failed after"));
2092    }
2093
2094    #[test]
2095    fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
2096        let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2097
2098        let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);
2099
2100        assert!(text.is_empty());
2101        assert_eq!(kind, ExtractedTextKind::None);
2102    }
2103
2104    #[test]
2105    fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
2106        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2107        let text = b"Copyright 2026 Example Project!!!";
2108        bytes[..text.len()].copy_from_slice(text);
2109        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2110        bytes[second_offset..second_offset + text.len()].copy_from_slice(text);
2111
2112        let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);
2113
2114        assert_ne!(kind, ExtractedTextKind::None);
2115        assert!(text.contains("Copyright 2026 Example Project"));
2116    }
2117
2118    #[test]
2119    fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
2120        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2121        let noise = b"(c) $1234567890ABCDEF[]{}--==++";
2122        bytes[..noise.len()].copy_from_slice(noise);
2123        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
2124        bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);
2125
2126        let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);
2127
2128        assert!(text.is_empty());
2129        assert_eq!(kind, ExtractedTextKind::None);
2130    }
2131
2132    #[test]
2133    fn test_extract_text_for_detection_uses_windows_executable_metadata() {
2134        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2135        let bytes = std::fs::read(path).expect("read PE fixture");
2136
2137        let (text, kind) = extract_text_for_detection(path, &bytes);
2138
2139        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
2140        assert!(text.contains("License: This program is free software"));
2141        assert!(text.contains("LegalCopyright:"));
2142    }
2143
2144    #[test]
2145    fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
2146    {
2147        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
2148        let mut bytes = std::fs::read(path).expect("read PE fixture");
2149        bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);
2150
2151        let (text, kind) = extract_text_for_detection(path, &bytes);
2152
2153        assert_ne!(kind, ExtractedTextKind::None);
2154        assert!(!text.trim().is_empty());
2155    }
2156
2157    #[test]
2158    fn test_windows_metadata_or_empty_result_preserves_metadata() {
2159        let (text, kind, scan_error) =
2160            windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));
2161
2162        assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
2163        assert_eq!(text, "LegalCopyright: Example Corp");
2164        assert!(scan_error.is_none());
2165    }
2166
2167    #[test]
2168    fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
2169        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2170        let text = b"Copyright 2026 Example Project!!!";
2171        bytes[..text.len()].copy_from_slice(text);
2172
2173        let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);
2174
2175        assert!(text.is_empty());
2176        assert_eq!(kind, ExtractedTextKind::None);
2177    }
2178
2179    #[test]
2180    fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
2181        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
2182        let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
2183        bytes[..text.len()].copy_from_slice(text);
2184
2185        let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);
2186
2187        assert_ne!(kind, ExtractedTextKind::None);
2188        assert!(text.contains("asn@redhat.com"));
2189        assert!(text.contains("https://publicsuffix.org/"));
2190    }
2191
2192    #[test]
2193    fn test_non_actionable_pdf_failures_are_suppressed() {
2194        assert!(is_non_actionable_pdf_failure(&[
2195            "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
2196            "open full-document: PDF is encrypted and requires a password".to_string(),
2197        ]));
2198        assert!(is_non_actionable_pdf_failure(&[
2199            "from-bytes first-page: Invalid cross-reference table".to_string(),
2200            "open full-document: Invalid cross-reference table".to_string(),
2201        ]));
2202        assert!(is_non_actionable_pdf_failure(&[
2203            "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O".to_string(),
2204            "open full-document: Invalid PDF: security handler cannot be found".to_string(),
2205        ]));
2206        assert!(!is_non_actionable_pdf_failure(&[
2207            "from-bytes first-page: some other parser failure".to_string(),
2208        ]));
2209    }
2210
2211    #[test]
2212    fn test_extract_text_for_detection_skips_zip_like_archives() {
2213        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";
2214
2215        let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
2216        let (crate_text, crate_kind) =
2217            extract_text_for_detection(Path::new("demo.crate"), zip_bytes);
2218
2219        assert!(whl_text.is_empty());
2220        assert_eq!(whl_kind, ExtractedTextKind::None);
2221        assert!(crate_text.is_empty());
2222        assert_eq!(crate_kind, ExtractedTextKind::None);
2223    }
2224
2225    #[test]
2226    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
2227        let path =
2228            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
2229        let bytes = std::fs::read(path).expect("failed to read lib fixture");
2230
2231        let (text, kind) = extract_text_for_detection(path, &bytes);
2232
2233        assert_ne!(kind, ExtractedTextKind::None);
2234        assert!(text.contains("Copyright nexB and others (c) 2012"));
2235    }
2236
2237    #[test]
2238    fn test_extract_text_for_detection_reads_font_metadata() {
2239        let path = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
2240        let bytes = std::fs::read(path).expect("failed to read font fixture");
2241
2242        let (text, kind) = extract_text_for_detection(path, &bytes);
2243
2244        assert_eq!(kind, ExtractedTextKind::FontMetadata);
2245        assert!(text.contains("License Description:"), "{text}");
2246        assert!(
2247            text.contains("Open Font License") || text.contains("OFL"),
2248            "{text}"
2249        );
2250    }
2251
2252    #[test]
2253    fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
2254        let bytes = b"abcd\0".repeat(525_000);
2255
2256        let text = extract_printable_strings(&bytes);
2257
2258        assert!(
2259            text.len() > 2_000_000,
2260            "unexpected truncation at {}",
2261            text.len()
2262        );
2263        assert!(text.ends_with("abcd"));
2264    }
2265
2266    #[test]
2267    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
2268        let path = Path::new(
2269            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
2270        );
2271        let bytes = std::fs::read(path).expect("failed to read svg fixture");
2272
2273        let (text, kind) = extract_text_for_detection(path, &bytes);
2274
2275        assert_eq!(kind, ExtractedTextKind::Decoded);
2276        assert!(text.contains("creativecommons.org/licenses/publicdomain"));
2277    }
2278
2279    #[test]
2280    fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
2281        let path = Path::new(
2282            "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
2283        );
2284        let bytes = std::fs::read(path).expect("failed to read rtf fixture");
2285
2286        let (text, kind) = extract_text_for_detection(path, &bytes);
2287
2288        assert_eq!(kind, ExtractedTextKind::Decoded);
2289        assert!(text.contains("GNU Lesser General Public"));
2290        assert!(text.contains("version"));
2291        assert!(text.contains("2.1 of the License"));
2292    }
2293
2294    #[test]
2295    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
2296        assert_eq!(
2297            normalize_mime_type(
2298                Path::new("main.ts"),
2299                b"export const answer = 42;\n",
2300                Some("TypeScript"),
2301                "video/mp2t",
2302            ),
2303            "text/plain"
2304        );
2305    }
2306
2307    #[test]
2308    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
2309        assert_eq!(
2310            normalize_mime_type(
2311                Path::new("main.js"),
2312                b"console.log('hello');\n",
2313                Some("JavaScript"),
2314                "application/octet-stream",
2315            ),
2316            "text/plain"
2317        );
2318    }
2319
2320    #[test]
2321    fn test_normalize_mime_type_preserves_binary_video_guess() {
2322        assert_eq!(
2323            normalize_mime_type(
2324                Path::new("main.ts"),
2325                &[0, 159, 146, 150, 0, 1, 2, 3],
2326                Some("TypeScript"),
2327                "video/mp2t",
2328            ),
2329            "video/mp2t"
2330        );
2331    }
2332
2333    #[test]
2334    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
2335        assert_eq!(
2336            normalize_mime_type(
2337                Path::new("main.ts"),
2338                &[0, 159, 146, 150],
2339                Some("TypeScript"),
2340                "application/octet-stream",
2341            ),
2342            "application/octet-stream"
2343        );
2344    }
2345
2346    #[test]
2347    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
2348        let classification = classify_file_info(Path::new("test.txt"), b"");
2349
2350        assert_eq!(classification.mime_type, "inode/x-empty");
2351        assert_eq!(classification.file_type, "empty");
2352        assert!(!classification.is_binary);
2353        assert!(classification.is_text);
2354        assert!(!classification.is_source);
2355        assert_eq!(classification.programming_language, None);
2356    }
2357
2358    #[test]
2359    fn test_classify_file_info_keeps_json_out_of_programming_language() {
2360        let classification = classify_file_info(Path::new("package.json"), br#"{"name":"demo"}"#);
2361
2362        assert_eq!(classification.mime_type, "application/json");
2363        assert_eq!(classification.file_type, "JSON text data");
2364        assert!(classification.is_text);
2365        assert!(!classification.is_source);
2366        assert_eq!(classification.programming_language, None);
2367    }
2368
2369    #[test]
2370    fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
2371        let classification =
2372            classify_file_info(Path::new("broken.json"), b"{ definitely not json\n");
2373
2374        assert_eq!(classification.mime_type, "text/plain");
2375        assert_eq!(classification.file_type, "UTF-8 Unicode text");
2376        assert!(classification.is_text);
2377        assert!(!classification.is_binary);
2378    }
2379
2380    #[test]
2381    fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
2382        let classification =
2383            classify_file_info(Path::new("broken.json"), &[0xff, 0x00, 0x01, 0x02]);
2384
2385        assert_eq!(classification.mime_type, "application/octet-stream");
2386        assert_eq!(classification.file_type, "data");
2387        assert!(classification.is_binary);
2388        assert!(!classification.is_text);
2389    }
2390
2391    #[test]
2392    fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
2393        let classification = classify_file_info(
2394            Path::new("utf16.json"),
2395            &[
2396                0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
2397            ],
2398        );
2399
2400        assert!(!classification.is_binary);
2401        assert!(classification.is_text);
2402        assert_eq!(classification.mime_type, "application/json");
2403        assert_eq!(classification.file_type, "JSON text data");
2404    }
2405
2406    #[test]
2407    fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
2408        let classification = classify_file_info(Path::new("true.json"), b"true");
2409
2410        assert!(!classification.is_binary);
2411        assert!(classification.is_text);
2412        assert_eq!(classification.mime_type, "application/json");
2413        assert_eq!(classification.file_type, "JSON text data");
2414    }
2415
2416    #[test]
2417    fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
2418        let classification = classify_file_info(
2419            Path::new("wrapped.json"),
2420            &[0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D],
2421        );
2422
2423        assert!(!classification.is_binary);
2424        assert!(classification.is_text);
2425        assert_eq!(classification.mime_type, "text/plain");
2426        assert_eq!(classification.file_type, "text, with no line terminators");
2427    }
2428
2429    #[test]
2430    fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
2431        let classification =
2432            classify_file_info(Path::new("lone-ff.json"), &[0x5B, 0x22, 0xFF, 0x22, 0x5D]);
2433
2434        assert!(classification.is_binary);
2435        assert!(!classification.is_text);
2436        assert_eq!(classification.mime_type, "application/octet-stream");
2437        assert_eq!(classification.file_type, "data");
2438    }
2439
2440    #[test]
2441    fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
2442        let classification = classify_file_info(
2443            Path::new("crash.json"),
2444            &[
2445                0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
2446            ],
2447        );
2448
2449        assert!(classification.is_binary);
2450        assert!(!classification.is_text);
2451        assert_eq!(classification.mime_type, "application/octet-stream");
2452    }
2453
2454    #[test]
2455    fn test_classify_file_info_treats_dockerfile_as_source() {
2456        let classification = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");
2457
2458        assert_eq!(
2459            classification.programming_language.as_deref(),
2460            Some("Dockerfile")
2461        );
2462        assert!(classification.is_source);
2463        assert!(!classification.is_script);
2464        assert_eq!(
2465            classification.file_type,
2466            "Dockerfile source, UTF-8 Unicode text"
2467        );
2468    }
2469
2470    #[test]
2471    fn test_classify_file_info_treats_makefile_as_text_not_source() {
2472        let classification = classify_file_info(Path::new("Makefile"), b"all:\n\techo hi\n");
2473
2474        assert_eq!(classification.programming_language, None);
2475        assert!(classification.is_text);
2476        assert!(!classification.is_source);
2477        assert!(!classification.is_script);
2478        assert_eq!(classification.file_type, "UTF-8 Unicode text");
2479    }
2480
2481    #[test]
2482    fn test_classify_file_info_marks_supported_package_archives() {
2483        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00";
2484
2485        let egg = classify_file_info(Path::new("demo.egg"), zip_bytes);
2486        let nupkg = classify_file_info(Path::new("demo.nupkg"), zip_bytes);
2487
2488        assert!(egg.is_archive);
2489        assert_eq!(egg.mime_type, "application/zip");
2490        assert_eq!(egg.file_type, "Zip archive data");
2491        assert!(nupkg.is_archive);
2492        assert_eq!(nupkg.mime_type, "application/zip");
2493        assert_eq!(nupkg.file_type, "Zip archive data");
2494    }
2495
2496    #[test]
2497    fn test_classify_file_info_marks_png_as_binary_media() {
2498        let png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";
2499
2500        let classification = classify_file_info(Path::new("logo.png"), png_bytes);
2501
2502        assert_eq!(classification.mime_type, "image/png");
2503        assert_eq!(classification.file_type, "PNG image data");
2504        assert!(classification.is_binary);
2505        assert!(!classification.is_text);
2506        assert!(classification.is_media);
2507        assert!(!classification.is_archive);
2508        assert!(!classification.is_source);
2509    }
2510
2511    #[test]
2512    fn test_classify_file_info_marks_pdf_as_binary_document() {
2513        let pdf_bytes = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";
2514
2515        let classification = classify_file_info(Path::new("report.pdf"), pdf_bytes);
2516
2517        assert_eq!(classification.mime_type, "application/pdf");
2518        assert_eq!(classification.file_type, "PDF document");
2519        assert!(classification.is_binary);
2520        assert!(!classification.is_text);
2521        assert!(!classification.is_archive);
2522        assert!(!classification.is_media);
2523    }
2524
2525    #[test]
2526    fn test_classify_file_info_marks_binary_blobs_as_binary() {
2527        let classification =
2528            classify_file_info(Path::new("blob.bin"), &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5]);
2529
2530        assert!(classification.is_binary);
2531        assert!(!classification.is_text);
2532        assert!(!classification.is_source);
2533        assert_eq!(classification.programming_language, None);
2534    }
2535
2536    #[test]
2537    fn test_classify_file_info_treats_yaml_as_text_not_source() {
2538        let classification = classify_file_info(Path::new("config.yaml"), b"key: value\n");
2539
2540        assert_eq!(classification.programming_language, None);
2541        assert!(classification.is_text);
2542        assert!(!classification.is_source);
2543        assert_eq!(classification.file_type, "YAML text data");
2544    }
2545
2546    #[test]
2547    fn test_classify_file_info_classifies_common_build_manifests() {
2548        let gradle = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
2549        let flake = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
2550        let cmake = classify_file_info(
2551            Path::new("toolchain.cmake"),
2552            b"set(CMAKE_CXX_STANDARD 20)\n",
2553        );
2554        let gitmodules = classify_file_info(
2555            Path::new(".gitmodules"),
2556            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
2557        );
2558
2559        assert_eq!(gradle.programming_language.as_deref(), Some("Groovy"));
2560        assert!(gradle.is_source);
2561        assert_eq!(gradle.mime_type, "text/plain");
2562        assert_eq!(gradle.file_type, "Groovy source, UTF-8 Unicode text");
2563
2564        assert_eq!(flake.programming_language.as_deref(), Some("Nix"));
2565        assert!(flake.is_source);
2566        assert_eq!(flake.mime_type, "text/plain");
2567        assert_eq!(flake.file_type, "Nix source, UTF-8 Unicode text");
2568
2569        assert_eq!(cmake.programming_language.as_deref(), Some("CMake"));
2570        assert!(cmake.is_source);
2571        assert_eq!(cmake.file_type, "CMake source, UTF-8 Unicode text");
2572
2573        assert_eq!(gitmodules.programming_language, None);
2574        assert!(gitmodules.is_text);
2575        assert!(!gitmodules.is_source);
2576        assert_eq!(gitmodules.file_type, "Git configuration text");
2577    }
2578
2579    #[test]
2580    fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
2581        let header = classify_file_info(
2582            Path::new("include/demo.hpp"),
2583            b"#pragma once\nclass Demo {};\n",
2584        );
2585        let ipp = classify_file_info(
2586            Path::new("include/detail/demo.ipp"),
2587            b"template <class T> void parse() {}\n",
2588        );
2589
2590        assert_eq!(header.programming_language.as_deref(), Some("C++"));
2591        assert!(header.is_source);
2592        assert!(!header.is_script);
2593        assert_eq!(header.file_type, "C++ source, UTF-8 Unicode text");
2594
2595        assert_eq!(ipp.programming_language, None);
2596        assert!(!ipp.is_source);
2597        assert!(!ipp.is_script);
2598        assert_eq!(ipp.file_type, "UTF-8 Unicode text");
2599    }
2600
2601    #[test]
2602    fn test_classify_file_info_preserves_specific_shell_family_labels() {
2603        let bash = classify_file_info(Path::new("bin/run"), b"#!/usr/bin/env bash\necho hi\n");
2604
2605        assert_eq!(bash.programming_language.as_deref(), Some("Bash"));
2606        assert!(bash.is_script);
2607        assert_eq!(bash.file_type, "bash script, UTF-8 Unicode text executable");
2608    }
2609
2610    #[test]
2611    fn test_classify_file_info_marks_jamfile_as_source() {
2612        let jamfile = classify_file_info(Path::new("Jamfile"), b"lib boost_json ;\n");
2613
2614        assert_eq!(jamfile.programming_language.as_deref(), Some("Jamfile"));
2615        assert!(jamfile.is_source);
2616        assert!(!jamfile.is_script);
2617        assert_eq!(jamfile.file_type, "Jamfile source, UTF-8 Unicode text");
2618    }
2619
2620    #[test]
2621    fn test_classify_file_info_labels_javascript_shebang_scripts() {
2622        let classification = classify_file_info(
2623            Path::new("bin/run"),
2624            b"#!/usr/bin/env node\nconsole.log('hello');\n",
2625        );
2626
2627        assert_eq!(
2628            classification.programming_language.as_deref(),
2629            Some("JavaScript")
2630        );
2631        assert!(classification.is_script);
2632        assert_eq!(
2633            classification.file_type,
2634            "javascript script, UTF-8 Unicode text executable"
2635        );
2636    }
2637
2638    #[test]
2639    fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
2640        let classification = classify_file_info(
2641            Path::new("script.py"),
2642            b"# coding: latin-1\nprint(\"caf\xe9\")\n",
2643        );
2644
2645        assert_eq!(
2646            classification.programming_language.as_deref(),
2647            Some("Python")
2648        );
2649        assert!(classification.is_script);
2650        assert_eq!(classification.file_type, "python script, text executable");
2651    }
2652
2653    #[test]
2654    fn test_classify_file_info_treats_textual_tga_as_media() {
2655        let classification = classify_file_info(Path::new("texture.tga"), b"not really a tga\n");
2656
2657        assert!(classification.is_media);
2658        assert!(classification.is_text);
2659        assert!(!classification.is_binary);
2660    }
2661
2662    #[test]
2663    fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
2664        let classification =
2665            classify_file_info(Path::new("main.ts"), &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85]);
2666
2667        assert!(classification.is_binary);
2668        assert!(!classification.is_text);
2669        assert!(!classification.is_source);
2670        assert_eq!(classification.programming_language, None);
2671    }
2672
2673    #[test]
2674    fn test_extract_text_for_detection_skips_unsupported_image_formats() {
2675        let gif_bytes = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";
2676
2677        let (text, kind) = extract_text_for_detection(Path::new("tiny.gif"), gif_bytes);
2678
2679        assert!(text.is_empty());
2680        assert_eq!(kind, ExtractedTextKind::None);
2681    }
2682
2683    #[test]
2684    fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
2685        let cases = [
2686            (
2687                Path::new("bin/run"),
2688                b"#!/usr/bin/env node\nconsole.log('hello');\n".as_slice(),
2689                Some("JavaScript"),
2690                true,
2691                true,
2692            ),
2693            (
2694                Path::new("Dockerfile"),
2695                b"FROM scratch\n".as_slice(),
2696                Some("Dockerfile"),
2697                true,
2698                false,
2699            ),
2700            (
2701                Path::new("package.json"),
2702                br#"{"name":"demo"}"#.as_slice(),
2703                None,
2704                false,
2705                false,
2706            ),
2707            (
2708                Path::new("config.yaml"),
2709                b"key: value\n".as_slice(),
2710                None,
2711                false,
2712                false,
2713            ),
2714            (
2715                Path::new("Makefile"),
2716                b"all:\n\techo hi\n".as_slice(),
2717                None,
2718                false,
2719                false,
2720            ),
2721        ];
2722
2723        for (path, bytes, expected_language, expected_is_source, expected_is_script) in cases {
2724            let classification = classify_file_info(path, bytes);
2725
2726            assert_eq!(
2727                classification.programming_language.as_deref(),
2728                expected_language,
2729                "unexpected language for {}",
2730                path.display()
2731            );
2732            assert_eq!(
2733                classification.is_source,
2734                expected_is_source,
2735                "unexpected is_source for {}",
2736                path.display()
2737            );
2738            assert_eq!(
2739                classification.is_script,
2740                expected_is_script,
2741                "unexpected is_script for {}",
2742                path.display()
2743            );
2744        }
2745    }
2746}