provenant-cli 0.1.0

// SPDX-FileCopyrightText: Provenant contributors
// SPDX-License-Identifier: Apache-2.0

use std::borrow::Cow;
use std::collections::BTreeSet;
use std::fs;
use std::io::{BufReader, Cursor, Read};
use std::panic::{AssertUnwindSafe, catch_unwind};
use std::path::Path;

use chrono::{TimeZone, Utc};
use file_format::{FileFormat, Kind as FileFormatKind};
use flate2::read::ZlibDecoder;
use glob::Pattern;
use image::{ImageDecoder, ImageFormat, ImageReader};
use mime_guess::from_path;
use object::FileKind;
use quick_xml::events::Event;
use quick_xml::reader::Reader as XmlReader;

use crate::parsers::windows_executable::extract_windows_executable_metadata_text;
use crate::utils::font::extract_font_metadata_text;
use crate::utils::language::detect_language;

/// Describes how the text returned by the extraction functions in this module
/// was obtained from a file's raw bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractedTextKind {
    /// No usable text was produced.
    None,
    /// The bytes were decoded as text (this also covers text recovered from RTF input).
    Decoded,
    /// Text came from font metadata, optionally followed by printable strings.
    FontMetadata,
    /// Text was extracted from a PDF document.
    Pdf,
    /// Printable strings pulled out of binary content.
    BinaryStrings,
    /// Text was extracted from image metadata.
    ImageMetadata,
    /// Only Windows executable metadata text was available.
    WindowsExecutableMetadata,
}

/// Result of [`classify_file_info`]: the detected MIME type, a human-readable
/// file type, and a set of boolean classification flags for one file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileInfoClassification {
    /// MIME type as computed by `detect_mime_type`.
    pub mime_type: String,
    /// Human-readable file type description as computed by `detect_file_type`.
    pub file_type: String,
    /// Detected language; only populated when `is_source` is true.
    pub programming_language: Option<String>,
    /// Whether the content is treated as binary.
    pub is_binary: bool,
    /// Always the negation of `is_binary`.
    pub is_text: bool,
    /// Whether the file is an archive, compressed file, or package (never true for text).
    pub is_archive: bool,
    /// Whether the file is image, audio, or video content.
    pub is_media: bool,
    /// Whether the file is source code.
    pub is_source: bool,
    /// Whether the file is a script (for example shebang-prefixed or a shell extension).
    pub is_script: bool,
}

// Limits and lookup tables used by the extraction and classification helpers.
// NOTE(review): several of these are consumed by helpers that are not shown next
// to these definitions; the descriptions of those are inferred from the names
// and should be confirmed against the call sites.

/// Presumably the maximum number of metadata values collected per image.
const MAX_IMAGE_METADATA_VALUES: usize = 64;
/// Presumably the byte budget for text assembled from image metadata (32 KiB).
const MAX_IMAGE_METADATA_TEXT_BYTES: usize = 32 * 1024;
/// Divisor for the control-byte ratio test: content counts as binary when the
/// number of control bytes exceeds `len / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR`.
const BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR: usize = 10;
/// Presumably the size threshold (512 KiB) for treating a binary as "large opaque".
const LARGE_OPAQUE_BINARY_SKIP_BYTES: usize = 512 * 1024;
/// Presumably the window size (64 KiB) scanned around a legal marker in large Mach-O files.
const LARGE_MACHO_LEGAL_WINDOW_BYTES: usize = 64 * 1024;
/// Presumably the total number of marker windows examined per file.
const LARGE_MACHO_LEGAL_MAX_WINDOWS: usize = 24;
/// Presumably the number of windows examined per individual marker.
const LARGE_MACHO_LEGAL_MAX_WINDOWS_PER_MARKER: usize = 4;
/// Presumably the overall byte budget (2 MiB) for bounded Mach-O legal text extraction.
const LARGE_MACHO_LEGAL_MAX_EXTRACT_BYTES: usize = 2 * 1024 * 1024;
/// Inputs larger than this (4 MiB) are never parsed for JSON validity.
const JSON_VALIDATION_MAX_BYTES: usize = 4 * 1024 * 1024;
/// Presumably the maximum size (256 KiB) of an XMP packet that will be parsed.
const MAX_XMP_PACKET_BYTES: usize = 256 * 1024;
/// Presumably the maximum PDF size (32 MiB) for which text extraction is attempted.
const MAX_PDF_TEXT_EXTRACTION_BYTES: usize = 32 * 1024 * 1024;
/// Lowercase extensions whose presence suggests the file holds plain text.
const PLAIN_TEXT_EXTENSIONS: &[&str] = &[
    "rst", "rest", "md", "txt", "log", "json", "xml", "yaml", "yml", "toml", "ini",
];
/// Lowercase extensions that force a binary classification unless the detected
/// format is already textual.
const BINARY_EXTENSIONS: &[&str] = &[
    "pyc", "pyo", "pgm", "pbm", "ppm", "mp3", "mp4", "mpeg", "mpg", "emf",
];
/// Lowercase extensions that mark a non-text file as an archive.
const ARCHIVE_EXTENSIONS: &[&str] = &[
    "zip", "jar", "war", "ear", "tar", "gz", "tgz", "bz2", "xz", "7z", "rar", "apk", "deb", "rpm",
    "whl", "crate", "egg", "gem", "nupkg", "sqs", "squashfs",
];
/// Byte sequences that signal license or copyright text; presumably searched for
/// when sampling large Mach-O binaries. Both capitalizations of some phrases are
/// listed, which suggests the search is case-sensitive; confirm at the call site.
const LARGE_MACHO_LEGAL_MARKERS: &[&[u8]] = &[
    b"Unicode, Inc.",
    b"http://www.unicode.org/copyright.html",
    b"https://www.unicode.org/copyright.html",
    b"SPDX-License-Identifier:",
    b"Licensed under",
    b"licensed under",
    b"Apache License",
    b"http://www.apache.org/licenses/",
    b"https://www.apache.org/licenses/",
    b"Permission is hereby granted",
    b"permission is hereby granted",
    b"Redistribution and use in source and binary forms",
    b"redistribution and use in source and binary forms",
    b"Permission to use, copy, modify, and/or distribute this software",
    b"The MIT License",
    b"GNU GENERAL PUBLIC LICENSE",
    b"GNU LESSER GENERAL PUBLIC LICENSE",
    b"Mozilla Public License",
];

/// Get the last modified date of a file as a `YYYY-MM-DD` string (UTC).
///
/// Despite the function name, the value is derived from the file's
/// modification time, not its creation time.
///
/// Returns `None` when the platform cannot report a modification time.
/// Modification times that precede the Unix epoch are handled instead of
/// panicking. If the timestamp cannot be represented by `chrono`, the current
/// date is used as a fallback.
pub fn get_creation_date(metadata: &fs::Metadata) -> Option<String> {
    let modified = metadata.modified().ok()?;

    let seconds_since_epoch = match modified.duration_since(std::time::UNIX_EPOCH) {
        // Saturate rather than wrap so an absurd value falls through to the
        // out-of-range fallback below.
        Ok(elapsed) => i64::try_from(elapsed.as_secs()).unwrap_or(i64::MAX),
        Err(error) => {
            // The file was modified before 1970-01-01: build a negative
            // timestamp, rounding toward negative infinity so that sub-second
            // offsets still land on the correct calendar day.
            let before_epoch = error.duration();
            let whole_seconds = i64::try_from(before_epoch.as_secs()).unwrap_or(i64::MAX);
            let rounded_up = if before_epoch.subsec_nanos() > 0 {
                whole_seconds.saturating_add(1)
            } else {
                whole_seconds
            };
            rounded_up.saturating_neg()
        }
    };

    Some(
        Utc.timestamp_opt(seconds_since_epoch, 0)
            .single()
            .unwrap_or_else(Utc::now)
            .format("%Y-%m-%d")
            .to_string(),
    )
}

/// Report whether `path` matches any of the supplied exclusion globs.
///
/// Each pattern is tried against the full (lossily converted) path and against
/// the final path component on its own. A path without a final component is
/// matched with an empty name.
pub fn is_path_excluded(path: &Path, exclude_patterns: &[Pattern]) -> bool {
    let full_path = path.to_string_lossy();
    let base_name = path
        .file_name()
        .map_or_else(Default::default, |name| name.to_string_lossy());

    exclude_patterns
        .iter()
        .any(|candidate| candidate.matches(&full_path) || candidate.matches(&base_name))
}

/// Turn a byte buffer into a `String` using a fixed cascade of encodings.
///
/// 1. UTF-16, when a BOM or the byte layout strongly suggests it.
/// 2. UTF-8, when the whole buffer is valid.
/// 3. Latin-1 (ISO-8859-1), which maps every byte 0x00-0xFF onto U+0000-U+00FF
///    and therefore never fails. This mirrors Python ScanCode's `UnicodeDammit`
///    usage, which also ends with a Latin-1 fallback.
///
/// Before the Latin-1 step the buffer is checked for a high share of binary
/// control bytes; such input yields an empty string instead of mojibake.
pub fn decode_bytes_to_string(bytes: &[u8]) -> String {
    if let Some(utf16) = decode_utf16_text(bytes) {
        return utf16;
    }

    if let Ok(utf8) = std::str::from_utf8(bytes) {
        return utf8.to_owned();
    }

    if has_binary_control_chars(bytes) {
        String::new()
    } else {
        // Latin-1: every byte is its own code point.
        bytes.iter().copied().map(char::from).collect()
    }
}

/// Extract detection text from `bytes`, returning the text and how it was obtained.
///
/// Thin wrapper over `extract_text_for_detection_with_diagnostics` that drops
/// the optional diagnostic message.
pub fn extract_text_for_detection(path: &Path, bytes: &[u8]) -> (String, ExtractedTextKind) {
    let outcome = extract_text_for_detection_with_diagnostics(path, bytes);
    (outcome.0, outcome.1)
}

/// Append canonical license phrases to Markdown/HTML text when it contains
/// well-known shorthand references.
///
/// Only files with an `md`, `markdown`, `html` or `htm` extension
/// (case-insensitive) are considered. The original text is returned borrowed
/// when nothing is added; otherwise the hints are appended after a blank line,
/// one per line. Apache and shields.io badge hints are suppressed when the text
/// carries a dual-license notice.
pub(crate) fn augment_license_detection_text<'a>(path: &Path, text: &'a str) -> Cow<'a, str> {
    let is_markup_document = path
        .extension()
        .and_then(|ext| ext.to_str())
        .map(str::to_ascii_lowercase)
        .is_some_and(|ext| matches!(ext.as_str(), "md" | "markdown" | "html" | "htm"));
    if !is_markup_document {
        return Cow::Borrowed(text);
    }

    let dual_licensed = has_dual_license_notice_text(text);
    let mut hints: Vec<String> = Vec::new();

    if text.contains("CC BY 4.0") || text.contains("creativecommons.org/licenses/by/4.0") {
        hints.push("Creative Commons Attribution 4.0 International License".to_string());
    }

    if !dual_licensed {
        let mentions_apache = text.contains("Apache License (Version 2.0)")
            || text.contains("Apache License, Version 2.0");
        if mentions_apache {
            hints.push(
                "Licensed under the Apache License, Version 2.0. http://www.apache.org/licenses/LICENSE-2.0"
                    .to_string(),
            );
        }
        hints.extend(extract_shields_license_badge_hints(text));
    }

    if hints.is_empty() {
        return Cow::Borrowed(text);
    }

    let hint_block = hints.join("\n");
    let mut augmented = String::with_capacity(text.len() + hint_block.len() + 2);
    augmented.push_str(text);
    augmented.push_str("\n\n");
    augmented.push_str(&hint_block);
    Cow::Owned(augmented)
}

/// Collect license hints from `img.shields.io/badge/license-…​.svg` references.
///
/// For every badge the part between `license-` and `.svg` is split on `-`, the
/// last non-empty segment is discarded, and the remainder (with `%20` turned
/// into a space and `_` into `-`) is converted to a canonical phrase. Badges
/// that do not end in `.svg`, or that have fewer than two segments, are ignored.
/// The result is sorted and deduplicated.
fn extract_shields_license_badge_hints(text: &str) -> Vec<String> {
    const BADGE_PREFIX: &str = "img.shields.io/badge/license-";
    const TERMINATORS: [char; 6] = [')', ']', '"', '\'', ' ', '\n'];

    let mut found = Vec::new();
    let mut remaining = text;

    while let Some((_, after_prefix)) = remaining.split_once(BADGE_PREFIX) {
        let badge_len = after_prefix
            .find(TERMINATORS)
            .unwrap_or(after_prefix.len());
        let (badge, tail) = after_prefix.split_at(badge_len);
        // Continue scanning after this badge regardless of whether it is usable.
        remaining = tail;

        let Some(stem) = badge.strip_suffix(".svg") else {
            continue;
        };

        let mut parts: Vec<&str> = stem.split('-').filter(|part| !part.is_empty()).collect();
        if parts.len() < 2 {
            continue;
        }
        // The trailing segment is not part of the license name.
        parts.pop();

        let name = parts.join("-").replace("%20", " ").replace('_', "-");
        if !name.is_empty() {
            found.push(canonical_shields_license_hint(&name));
        }
    }

    found.sort();
    found.dedup();
    found
}

/// Detect a dual-license notice, ignoring ASCII case.
///
/// Matches either "dual-licensed under" / "dual licensed under", or the pair
/// "licensed under either of" together with "at your option".
fn has_dual_license_notice_text(text: &str) -> bool {
    let normalized = text.to_ascii_lowercase();
    let mentions = |phrase: &str| normalized.contains(phrase);

    if mentions("dual-licensed under") || mentions("dual licensed under") {
        return true;
    }
    mentions("licensed under either of") && mentions("at your option")
}

/// Map a license name taken from a badge to a phrase suitable for detection.
///
/// `MIT` and the two Apache 2.0 spellings get fixed phrases; any other name
/// (after trimming) is suffixed with " License".
fn canonical_shields_license_hint(candidate: &str) -> String {
    let name = candidate.trim();
    if name == "MIT" {
        String::from("The MIT License")
    } else if matches!(name, "Apache-2.0" | "Apache 2.0") {
        String::from("Apache License 2.0")
    } else {
        format!("{name} License")
    }
}

/// Extract the text that should be fed to detection from a file's raw bytes.
///
/// Returns `(text, kind, diagnostic)` where `kind` records how the text was
/// obtained and `diagnostic` carries an optional error message (only the PDF
/// path produces one here, and only when no text was extracted).
///
/// The extraction strategies are tried in a fixed order and the first one that
/// applies decides the result: RTF, PDF, supported image metadata, font
/// metadata, the large-opaque-binary short-circuits, decoded text, and finally
/// printable strings.
pub(crate) fn extract_text_for_detection_with_diagnostics(
    path: &Path,
    bytes: &[u8],
) -> (String, ExtractedTextKind, Option<String>) {
    // Lowercased extension (if any, and if valid UTF-8) plus content-sniffed format.
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(|s| s.to_ascii_lowercase());
    let detected_format = detect_file_format(bytes);

    // RTF: report the extracted text as plain decoded text, or nothing if blank.
    if looks_like_rtf(bytes, ext.as_deref()) {
        let text = extract_rtf_text(bytes);
        return if text.trim().is_empty() {
            (String::new(), ExtractedTextKind::None, None)
        } else {
            (text, ExtractedTextKind::Decoded, None)
        };
    }

    // PDF: the scan error is surfaced only when no text came out.
    if looks_like_pdf(bytes) || detected_format.short_name() == Some("PDF") {
        let (text, scan_error) = extract_pdf_text(path, bytes);
        return if text.is_empty() {
            (String::new(), ExtractedTextKind::None, scan_error)
        } else {
            (text, ExtractedTextKind::Pdf, None)
        };
    }

    // Images with a supported metadata format.
    if let Some(format) = supported_image_metadata_format(ext.as_deref(), detected_format) {
        let text = extract_image_metadata_text(bytes, format);
        return if text.is_empty() {
            if is_supported_image_container(bytes, format) {
                // A genuine image without metadata text: nothing to report.
                (String::new(), ExtractedTextKind::None, None)
            } else {
                // Not actually an image container (e.g. a mislabelled file):
                // fall back to decoding the bytes as text.
                let decoded = decode_bytes_to_string(bytes);
                if decoded.is_empty() {
                    (String::new(), ExtractedTextKind::None, None)
                } else {
                    (decoded, ExtractedTextKind::Decoded, None)
                }
            }
        } else {
            (text, ExtractedTextKind::ImageMetadata, None)
        };
    }

    // Fonts: metadata text first, followed by any printable strings.
    if let Some(text) = extract_font_metadata_text(path, bytes) {
        let strings = extract_printable_strings(bytes);
        let combined = if strings.is_empty() {
            text
        } else {
            combine_extracted_text_fragments(Some(text), strings)
        };
        return (combined, ExtractedTextKind::FontMetadata, None);
    }

    // Windows executable metadata is gathered up front because it is prepended
    // to whatever text the remaining strategies produce.
    let windows_executable_metadata_text = extract_windows_executable_metadata_text(bytes);
    // A file with Windows executable metadata is never treated as a large opaque binary.
    let large_opaque_binary = windows_executable_metadata_text.is_none()
        && is_large_opaque_binary_candidate(bytes, detected_format);
    let bounded_macho_legal_text = if large_opaque_binary {
        extract_bounded_macho_legal_strings(bytes)
    } else {
        String::new()
    };
    let skip_large_opaque_binary_text =
        should_skip_large_opaque_binary_text_extraction(path, bytes, detected_format);

    // When full extraction is skipped, keep only the bounded legal strings (if
    // any); otherwise fall back to the Windows metadata or an empty result.
    if skip_large_opaque_binary_text {
        if !bounded_macho_legal_text.is_empty() {
            return (
                combine_extracted_text_fragments(
                    windows_executable_metadata_text,
                    bounded_macho_legal_text,
                ),
                ExtractedTextKind::BinaryStrings,
                None,
            );
        }
        return windows_metadata_or_empty_result(windows_executable_metadata_text);
    }

    // NOTE(review): this path discards any Windows metadata text gathered
    // above — confirm that is intended.
    if should_skip_binary_string_extraction(path, bytes, detected_format) {
        return (String::new(), ExtractedTextKind::None, None);
    }

    // SVG is always attempted as decoded text, by extension or sniffed media type.
    let is_svg_text = lower_extension(path).as_deref() == Some("svg")
        || detected_format.media_type() == "image/svg+xml";
    let should_try_decoded_text = looks_like_textual_bytes(bytes) || is_svg_text;
    let decoded_is_utf8 = std::str::from_utf8(bytes).is_ok();
    // The path hints at text when it has a plain-text extension or a language
    // is detected; paths without a usable extension never count.
    let path_suggests_text = ext.as_deref().is_some_and(|extension| {
        PLAIN_TEXT_EXTENSIONS.contains(&extension) || detect_language(path, bytes).is_some()
    });

    // Decoded text is accepted only when at least one signal backs it up.
    if !large_opaque_binary && should_try_decoded_text {
        let decoded = decode_bytes_to_string(bytes);
        if !decoded.is_empty()
            && (is_svg_text
                || decoded_is_utf8
                || path_suggests_text
                || looks_like_decoded_text(&decoded))
        {
            let combined =
                combine_extracted_text_fragments(windows_executable_metadata_text, decoded);
            return (combined, ExtractedTextKind::Decoded, None);
        }
    }

    // Last resort: printable strings. Large opaque binaries are only sampled,
    // with the bounded legal strings appended after the sample.
    let text = if large_opaque_binary {
        let sampled_text = extract_sampled_printable_strings(bytes);
        if bounded_macho_legal_text.is_empty() {
            sampled_text
        } else {
            combine_extracted_text_fragments(Some(sampled_text), bounded_macho_legal_text)
        }
    } else {
        extract_printable_strings(bytes)
    };
    if text.is_empty() {
        windows_metadata_or_empty_result(windows_executable_metadata_text)
    } else {
        (
            combine_extracted_text_fragments(windows_executable_metadata_text, text),
            ExtractedTextKind::BinaryStrings,
            None,
        )
    }
}

/// Join an optional leading fragment and a trailing fragment with a newline.
///
/// A missing or empty `prefix` yields `suffix` unchanged; an empty `suffix`
/// yields the prefix alone, so no stray newline is ever produced.
fn combine_extracted_text_fragments(prefix: Option<String>, suffix: String) -> String {
    let Some(head) = prefix.filter(|fragment| !fragment.is_empty()) else {
        return suffix;
    };

    if suffix.is_empty() {
        head
    } else {
        format!("{head}\n{suffix}")
    }
}

/// Build the extraction result for the case where only Windows executable
/// metadata (if anything) is available.
///
/// Returns the metadata text tagged as `WindowsExecutableMetadata`, or an
/// empty `None` result when there is no metadata. The diagnostic slot is
/// always `None`.
fn windows_metadata_or_empty_result(
    windows_executable_metadata_text: Option<String>,
) -> (String, ExtractedTextKind, Option<String>) {
    match windows_executable_metadata_text {
        Some(metadata_text) => (
            metadata_text,
            ExtractedTextKind::WindowsExecutableMetadata,
            None,
        ),
        None => (String::new(), ExtractedTextKind::None, None),
    }
}

/// Classify a file from its path and contents.
///
/// Sniffs the content format and language once, then derives the binary/text
/// split, MIME type, archive/media/script/source flags, and finally the
/// human-readable file type. The programming language is reported only for
/// files classified as source.
pub fn classify_file_info(path: &Path, bytes: &[u8]) -> FileInfoClassification {
    let detected_format = detect_file_format(bytes);
    let detected_language = detect_language(path, bytes);
    let language_hint = detected_language.as_deref();

    let is_binary = detect_is_binary(path, bytes, detected_format, language_hint);
    let is_text = !is_binary;
    let mime_type = detect_mime_type(path, bytes, detected_format, language_hint);
    let is_archive = detect_is_archive(path, bytes, &mime_type, is_text, detected_format);
    let is_media = detect_is_media(path, bytes, &mime_type, detected_format);
    let is_script = detect_is_script(path, bytes, language_hint, is_text);
    let is_source = detect_is_source(path, language_hint, is_text, is_script);

    // Non-source files never expose a language, even if one was detected.
    let programming_language = if is_source { detected_language } else { None };

    let file_type = detect_file_type(
        path,
        bytes,
        detected_format,
        &mime_type,
        programming_language.as_deref(),
        is_binary,
        is_text,
        is_archive,
        is_media,
        is_script,
    );

    FileInfoClassification {
        mime_type,
        file_type,
        programming_language,
        is_binary,
        is_text,
        is_archive,
        is_media,
        is_source,
        is_script,
    }
}

/// Sniff the file format from content, falling back to
/// `FileFormat::ArbitraryBinaryData` when detection reports an error.
fn detect_file_format(bytes: &[u8]) -> FileFormat {
    match FileFormat::from_reader(Cursor::new(bytes)) {
        Ok(format) => format,
        Err(_) => FileFormat::ArbitraryBinaryData,
    }
}

/// Two consecutive UTF-8 encodings of U+FFFD (the replacement character).
/// Presumably what a two-byte UTF-16 BOM becomes after a lossy UTF-8
/// re-encoding; it is stripped before UTF-16 heuristics are applied.
const CORRUPTED_UTF16_BOM_PREFIX: &[u8] = &[0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD];

/// Report whether the whole buffer is valid UTF-8 (an empty buffer is valid).
fn is_utf8_text(bytes: &[u8]) -> bool {
    matches!(std::str::from_utf8(bytes), Ok(_))
}

/// Drop a leading `CORRUPTED_UTF16_BOM_PREFIX` if present; otherwise return
/// the input unchanged.
fn strip_corrupted_utf16_bom_prefix(bytes: &[u8]) -> &[u8] {
    match bytes.strip_prefix(CORRUPTED_UTF16_BOM_PREFIX) {
        Some(remainder) => remainder,
        None => bytes,
    }
}

/// Decode `bytes` as UTF-16 with the given endianness.
///
/// Returns `None` for empty or odd-length input and for invalid UTF-16 (such
/// as unpaired surrogates). With `require_text_shape` the decoded string must
/// also pass `looks_like_decoded_text`; without it, the only extra requirement
/// is that the result contains no NUL character.
fn decode_utf16_units(bytes: &[u8], is_le: bool, require_text_shape: bool) -> Option<String> {
    if bytes.is_empty() || !bytes.len().is_multiple_of(2) {
        return None;
    }

    let to_unit: fn([u8; 2]) -> u16 = if is_le {
        u16::from_le_bytes
    } else {
        u16::from_be_bytes
    };
    let units = bytes
        .chunks_exact(2)
        .map(|pair| to_unit([pair[0], pair[1]]));

    let decoded = std::char::decode_utf16(units)
        .collect::<Result<String, _>>()
        .ok()?;

    let acceptable = if require_text_shape {
        looks_like_decoded_text(&decoded)
    } else {
        !decoded.contains('\0')
    };
    acceptable.then_some(decoded)
}

/// Heuristic check that a decoded string reads like text rather than noise.
///
/// The string is rejected if it contains any control character other than
/// newline, carriage return or tab, or if it has fewer than three characters.
/// Otherwise letters, whitespace and a fixed set of structural punctuation are
/// counted as "textish"; at most one fifth of the characters (integer
/// division) may fall outside those classes, and the string must contain at
/// least one letter or two punctuation characters.
fn looks_like_decoded_text(decoded: &str) -> bool {
    let is_structural_punctuation = |ch: char| {
        matches!(
            ch,
            '{' | '}'
                | '['
                | ']'
                | '<'
                | '>'
                | '('
                | ')'
                | ':'
                | ';'
                | ','
                | '"'
                | '\''
                | '/'
                | '='
                | '-'
                | '_'
                | '#'
                | '!'
        )
    };

    let mut total = 0usize;
    let mut alpha = 0usize;
    let mut punctuation = 0usize;
    let mut whitespace = 0usize;

    for ch in decoded.chars() {
        // Any control character besides the usual line/tab whitespace
        // (this includes NUL) disqualifies the string outright.
        if ch.is_control() && !matches!(ch, '\n' | '\r' | '\t') {
            return false;
        }
        total += 1;
        alpha += usize::from(ch.is_alphabetic());
        punctuation += usize::from(is_structural_punctuation(ch));
        whitespace += usize::from(ch.is_whitespace());
    }

    if total < 3 {
        return false;
    }

    let textish = alpha + punctuation + whitespace;
    textish + (total / 5) >= total && (alpha > 0 || punctuation >= 2)
}

/// Guess the UTF-16 byte order from the distribution of zero bytes.
///
/// Returns `Some(true)` for little-endian, `Some(false)` for big-endian and
/// `None` when the input (after removing a corrupted BOM prefix) is shorter
/// than four bytes, has odd length, or shows no clear pattern. An encoding is
/// assumed when at least a third of the code units have a zero byte in the
/// "high" position and at most a sixth have one in the other position.
/// Little-endian wins if both shapes match.
fn detect_utf16_endianness(bytes: &[u8]) -> Option<bool> {
    let payload = strip_corrupted_utf16_bom_prefix(bytes);
    if payload.len() < 4 || !payload.len().is_multiple_of(2) {
        return None;
    }

    let pairs = payload.len() / 2;
    let (first_zero, second_zero) =
        payload
            .chunks_exact(2)
            .fold((0usize, 0usize), |(first, second), pair| {
                (
                    first + usize::from(pair[0] == 0),
                    second + usize::from(pair[1] == 0),
                )
            });

    let big_endian_shape = first_zero * 3 >= pairs && second_zero * 6 <= pairs;
    let little_endian_shape = second_zero * 3 >= pairs && first_zero * 6 <= pairs;

    if little_endian_shape {
        Some(true)
    } else if big_endian_shape {
        Some(false)
    } else {
        None
    }
}

/// Decode UTF-16 text, preferring an explicit BOM and otherwise relying on the
/// zero-byte heuristic. The decoded result must look like text in both cases.
fn decode_utf16_text(bytes: &[u8]) -> Option<String> {
    decode_utf16_bom_text(bytes).or_else(|| {
        let is_le = detect_utf16_endianness(bytes)?;
        decode_utf16_units(strip_corrupted_utf16_bom_prefix(bytes), is_le, true)
    })
}

/// Decode UTF-16 input for JSON validation.
///
/// Unlike `decode_utf16_text`, the decoded string is not required to look like
/// text; it only has to be valid UTF-16 without NUL characters. A BOM decides
/// the byte order when present; otherwise the zero-byte heuristic is used.
fn decode_utf16_json_text(bytes: &[u8]) -> Option<String> {
    if bytes.len() < 2 {
        return None;
    }

    let bom = match bytes {
        [0xFF, 0xFE, rest @ ..] => Some((true, rest)),
        [0xFE, 0xFF, rest @ ..] => Some((false, rest)),
        _ => None,
    };

    let Some((is_le, body)) = bom else {
        // No BOM: infer the byte order from the byte layout.
        let is_le = detect_utf16_endianness(bytes)?;
        return decode_utf16_units(strip_corrupted_utf16_bom_prefix(bytes), is_le, false);
    };

    if body.is_empty() || !body.len().is_multiple_of(2) {
        return None;
    }
    decode_utf16_units(body, is_le, false)
}

/// Decode text that starts with a UTF-16 byte-order mark.
///
/// Returns `None` when the input is shorter than two bytes, has odd length,
/// lacks a BOM, has nothing after the BOM, or does not decode to text-like
/// content.
fn decode_utf16_bom_text(bytes: &[u8]) -> Option<String> {
    if bytes.len() < 2 || !bytes.len().is_multiple_of(2) {
        return None;
    }

    let (bom, body) = bytes.split_at(2);
    let is_le = match bom {
        [0xFF, 0xFE] => true,
        [0xFE, 0xFF] => false,
        _ => return None,
    };

    if body.is_empty() || !body.len().is_multiple_of(2) {
        return None;
    }
    decode_utf16_units(body, is_le, true)
}

/// Report whether control bytes make up too large a share of the buffer.
///
/// Counted bytes are 0x00-0x08 and 0x0E-0x1F (tab, LF, VT, FF and CR are
/// excluded). The buffer is binary when the count exceeds
/// `len / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR` (integer division).
fn has_binary_control_chars(bytes: &[u8]) -> bool {
    let is_binary_control = |byte: u8| matches!(byte, 0x00..=0x08 | 0x0E..=0x1F);
    let controls = bytes
        .iter()
        .copied()
        .filter(|&byte| is_binary_control(byte))
        .count();
    controls > bytes.len() / BINARY_CONTROL_CHAR_THRESHOLD_DIVISOR
}

/// Report whether the buffer can be decoded to text by any supported route:
/// it is empty, valid UTF-8, decodable as UTF-16, or free enough of control
/// bytes for the Latin-1 fallback.
fn has_decodable_text(bytes: &[u8]) -> bool {
    if bytes.is_empty() || is_utf8_text(bytes) {
        return true;
    }
    decode_utf16_text(bytes).is_some() || !has_binary_control_chars(bytes)
}

/// Heuristic check that raw bytes are textual.
///
/// Empty and valid UTF-8 input is textual. Input that decodes as UTF-16 is
/// textual when it contains at least one non-control character (newline,
/// carriage return and tab count too). Anything else is textual when at least
/// half of its bytes are printable ASCII or newline/carriage return/tab.
fn looks_like_textual_bytes(bytes: &[u8]) -> bool {
    if bytes.is_empty() || is_utf8_text(bytes) {
        return true;
    }

    match decode_utf16_text(bytes) {
        Some(utf16) => utf16
            .chars()
            .any(|ch| matches!(ch, '\n' | '\r' | '\t') || !ch.is_control()),
        None => {
            let printable = bytes
                .iter()
                .filter(|byte| matches!(**byte, b'\n' | b'\r' | b'\t' | 0x20..=0x7e))
                .count();
            printable * 2 >= bytes.len()
        }
    }
}

/// Report whether a media type denotes textual content: anything under
/// `text/`, plain JSON or XML, or a structured-syntax suffix of `+json`/`+xml`.
fn is_textual_media_type(media_type: &str) -> bool {
    const EXACT_MATCHES: [&str; 3] = ["application/json", "application/xml", "text/xml"];
    const TEXTUAL_SUFFIXES: [&str; 2] = ["+json", "+xml"];

    media_type.starts_with("text/")
        || EXACT_MATCHES.contains(&media_type)
        || TEXTUAL_SUFFIXES
            .iter()
            .any(|suffix| media_type.ends_with(*suffix))
}

/// Report whether a sniffed format is textual: empty, plain text, or any
/// format whose media type is textual.
fn is_textual_format(detected_format: FileFormat) -> bool {
    match detected_format {
        FileFormat::Empty | FileFormat::PlainText => true,
        other => is_textual_media_type(other.media_type()),
    }
}

/// Report whether a sniffed format is a recognized binary format, i.e. neither
/// the "arbitrary binary data" catch-all nor a textual format.
fn is_known_binary_format(detected_format: FileFormat) -> bool {
    if matches!(detected_format, FileFormat::ArbitraryBinaryData) {
        return false;
    }
    !is_textual_format(detected_format)
}

/// Determine the MIME type of a file from its path and contents.
///
/// Resolution order:
/// 1. Empty content is `inode/x-empty`.
/// 2. `.json` files are decided entirely by JSON-specific checks.
/// 3. ZIP containers, Debian packages and RPMs are recognized by signature.
/// 4. Otherwise the sniffed format and the extension-based guess are
///    reconciled, and the result is normalized for textual source files.
pub fn detect_mime_type(
    path: &Path,
    bytes: &[u8],
    detected_format: FileFormat,
    programming_language: Option<&str>,
) -> String {
    const OCTET_STREAM: &str = "application/octet-stream";
    const PLAIN_TEXT: &str = "text/plain";
    const JSON: &str = "application/json";

    if bytes.is_empty() {
        return "inode/x-empty".to_string();
    }

    if lower_extension(path).as_deref() == Some("json") {
        let json_mime = match json_binary_override(bytes) {
            Some(true) => OCTET_STREAM,
            Some(false) if has_valid_json_text(bytes) => JSON,
            Some(false) => PLAIN_TEXT,
            None if has_valid_json_text(bytes) => JSON,
            None if has_decodable_text(bytes) && looks_like_textual_bytes(bytes) => PLAIN_TEXT,
            None => OCTET_STREAM,
        };
        return json_mime.to_string();
    }

    if is_zip_archive(bytes) {
        return detect_zip_like_mime(path);
    }
    if looks_like_deb(bytes, path) {
        return "application/vnd.debian.binary-package".to_string();
    }
    if looks_like_rpm(bytes, path) {
        return "application/x-rpm".to_string();
    }

    let guessed_mime = from_path(path)
        .first_or_octet_stream()
        .essence_str()
        .to_string();

    let mime_type = match detected_format {
        FileFormat::Empty => "inode/x-empty".to_string(),
        // Plain text content overrides an unknown or video guess from the extension.
        FileFormat::PlainText
            if guessed_mime == OCTET_STREAM || guessed_mime.starts_with("video/") =>
        {
            PLAIN_TEXT.to_string()
        }
        FileFormat::PlainText => guessed_mime,
        other => {
            let detected_mime = other.media_type();
            // Prefer the extension guess only when sniffing found nothing specific.
            if detected_mime == OCTET_STREAM && guessed_mime != OCTET_STREAM {
                guessed_mime
            } else {
                detected_mime.to_string()
            }
        }
    };

    normalize_mime_type(path, bytes, programming_language, &mime_type)
}

/// Replace `mime_type` with `text/plain` when the file is textual source that
/// was mislabelled; otherwise return `mime_type` unchanged.
fn normalize_mime_type(
    path: &Path,
    bytes: &[u8],
    programming_language: Option<&str>,
    mime_type: &str,
) -> String {
    let chosen = if should_prefer_text_mime(path, bytes, programming_language, mime_type) {
        "text/plain"
    } else {
        mime_type
    };
    chosen.to_string()
}

/// Decide whether a `video/*` or `application/octet-stream` MIME type should be
/// overridden with `text/plain`: the bytes must decode and look textual, and
/// the file must be a textual source candidate.
fn should_prefer_text_mime(
    path: &Path,
    bytes: &[u8],
    programming_language: Option<&str>,
    mime_type: &str,
) -> bool {
    if !has_decodable_text(bytes) || !looks_like_textual_bytes(bytes) {
        return false;
    }
    if !is_textual_source_candidate(path, programming_language) {
        return false;
    }
    mime_type.starts_with("video/") || mime_type == "application/octet-stream"
}

/// Report whether the buffer parses as JSON, either directly or after UTF-16
/// decoding. Buffers larger than `JSON_VALIDATION_MAX_BYTES` are never parsed
/// and count as invalid.
fn has_valid_json_text(bytes: &[u8]) -> bool {
    if bytes.len() > JSON_VALIDATION_MAX_BYTES {
        return false;
    }

    if serde_json::from_slice::<serde_json::Value>(bytes).is_ok() {
        return true;
    }

    decode_utf16_json_text(bytes)
        .is_some_and(|text| serde_json::from_str::<serde_json::Value>(&text).is_ok())
}

/// Recognize a buffer shaped like a one-element JSON string array: `["…"]`.
///
/// The buffer must be at least eight bytes long, start with `["`, end with
/// `"]`, and contain neither a NUL nor a 0xFF byte.
fn is_wrapped_invalid_json_string_text(bytes: &[u8]) -> bool {
    if bytes.len() < 8 {
        return false;
    }

    let wrapped = bytes.starts_with(b"[\"") && bytes.ends_with(b"\"]");
    wrapped && !bytes.iter().any(|&byte| byte == 0 || byte == 0xFF)
}

/// Decide, for a `.json` file, whether the binary/text verdict is forced.
///
/// Returns `Some(false)` (text) for valid JSON and for invalid content shaped
/// like `["…"]`. Returns `Some(true)` (binary) when the content holds a NUL
/// byte, or a 0xFF byte in a buffer of at most 5 or more than 1024 bytes.
/// Returns `None` when no override applies.
fn json_binary_override(bytes: &[u8]) -> Option<bool> {
    if has_valid_json_text(bytes) {
        return Some(false);
    }

    let has_nul = bytes.contains(&0);
    let has_suspicious_ff =
        bytes.contains(&0xFF) && (bytes.len() <= 5 || bytes.len() > 1024);
    if has_nul || has_suspicious_ff {
        return Some(true);
    }

    is_wrapped_invalid_json_string_text(bytes).then_some(false)
}

/// Decide whether a file should be treated as binary.
///
/// Checks run in priority order: the JSON-specific override, a textual sniffed
/// format (text), a known binary extension (binary), textual source or shebang
/// content (text), and finally control-byte density, a known binary format, or
/// unidentified content that does not look textual (binary).
fn detect_is_binary(
    path: &Path,
    bytes: &[u8],
    detected_format: FileFormat,
    programming_language: Option<&str>,
) -> bool {
    let extension = lower_extension(path);

    if extension.as_deref() == Some("json")
        && let Some(verdict) = json_binary_override(bytes)
    {
        return verdict;
    }

    if is_textual_format(detected_format) {
        return false;
    }

    let has_binary_extension = extension
        .as_deref()
        .is_some_and(|ext| BINARY_EXTENSIONS.contains(&ext));
    if has_binary_extension {
        return true;
    }

    if should_treat_binary_bytes_as_text(path, bytes, programming_language) {
        return false;
    }

    if has_binary_control_chars(bytes) || is_known_binary_format(detected_format) {
        return true;
    }

    matches!(detected_format, FileFormat::ArbitraryBinaryData) && !looks_like_textual_bytes(bytes)
}

/// Report whether content that was not sniffed as textual should still count
/// as text: it must decode and look textual, and either start with a shebang
/// or belong to a textual source candidate.
fn should_treat_binary_bytes_as_text(
    path: &Path,
    bytes: &[u8],
    programming_language: Option<&str>,
) -> bool {
    if !has_decodable_text(bytes) || !looks_like_textual_bytes(bytes) {
        return false;
    }
    bytes.starts_with(b"#!") || is_textual_source_candidate(path, programming_language)
}

/// Decide whether a file is an archive, compressed file, or package.
///
/// Text files are never archives. Otherwise any one of these is sufficient: a
/// known archive extension, an archive-like sniffed format kind, a recognized
/// container signature, or an archive-related marker in the MIME type.
fn detect_is_archive(
    path: &Path,
    bytes: &[u8],
    mime_type: &str,
    is_text: bool,
    detected_format: FileFormat,
) -> bool {
    const ARCHIVE_MIME_MARKERS: [&str; 5] = ["zip", "compressed", "tar", "x-rpm", "debian"];

    if is_text {
        return false;
    }

    let has_archive_extension = lower_extension(path)
        .as_deref()
        .is_some_and(|ext| ARCHIVE_EXTENSIONS.contains(&ext));
    if has_archive_extension {
        return true;
    }

    if matches!(
        detected_format.kind(),
        FileFormatKind::Archive | FileFormatKind::Compressed | FileFormatKind::Package
    ) {
        return true;
    }

    let has_archive_signature = is_zip_archive(bytes)
        || looks_like_gzip(bytes)
        || looks_like_bzip2(bytes)
        || looks_like_xz(bytes)
        || looks_like_deb(bytes, path)
        || looks_like_rpm(bytes, path)
        || looks_like_squashfs(bytes, path);
    if has_archive_signature {
        return true;
    }

    ARCHIVE_MIME_MARKERS
        .iter()
        .any(|marker| mime_type.contains(*marker))
}

/// Decide whether a file is image, audio, or video content.
///
/// Any one signal suffices: media recognized from content, a media format
/// kind, a media MIME type, or a `.tga` file with an unknown MIME type and a
/// low share of control bytes.
fn detect_is_media(
    path: &Path,
    bytes: &[u8],
    mime_type: &str,
    detected_format: FileFormat,
) -> bool {
    if media_mime_from_content(bytes).is_some() {
        return true;
    }

    if matches!(
        detected_format.kind(),
        FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
    ) {
        return true;
    }

    let has_media_mime = ["image/", "audio/", "video/"]
        .iter()
        .any(|prefix| mime_type.starts_with(*prefix));
    if has_media_mime {
        return true;
    }

    mime_type == "application/octet-stream"
        && lower_extension(path).as_deref() == Some("tga")
        && !has_binary_control_chars(bytes)
}

/// Decide whether a text file is a script.
///
/// Binary files and makefiles are never scripts. Otherwise a shebang, a shell
/// or PowerShell or awk extension, or a detected scripting language suffices.
fn detect_is_script(
    path: &Path,
    bytes: &[u8],
    programming_language: Option<&str>,
    is_text: bool,
) -> bool {
    const SCRIPT_EXTENSIONS: [&str; 9] = [
        "sh", "bash", "zsh", "fish", "ksh", "ps1", "psm1", "psd1", "awk",
    ];
    const SCRIPT_LANGUAGES: [&str; 11] = [
        "Shell",
        "Bash",
        "Zsh",
        "Fish",
        "Ksh",
        "Python",
        "Ruby",
        "Perl",
        "PHP",
        "PowerShell",
        "Awk",
    ];

    if !is_text || is_makefile(path) {
        return false;
    }

    if bytes.starts_with(b"#!") {
        return true;
    }

    let has_script_extension = lower_extension(path)
        .as_deref()
        .is_some_and(|ext| SCRIPT_EXTENSIONS.contains(&ext));
    if has_script_extension {
        return true;
    }

    programming_language.is_some_and(|language| SCRIPT_LANGUAGES.contains(&language))
}

/// Decide whether a file is source code.
///
/// Binary files, plain-text documents, makefiles and source maps are excluded
/// up front. Of the remainder, C-like and Java-like sources always qualify, as
/// does anything with a detected language or a script classification.
fn detect_is_source(
    path: &Path,
    programming_language: Option<&str>,
    is_text: bool,
    is_script: bool,
) -> bool {
    let excluded = !is_text || is_plain_text(path) || is_makefile(path) || is_source_map(path);
    if excluded {
        false
    } else {
        is_c_like_source(path)
            || is_java_like_source(path)
            || programming_language.is_some()
            || is_script
    }
}

/// Produce a human-readable file type description.
///
/// Precedence: empty file, PDF, media recognized from content, archive,
/// script, text (by extension, well-known file name, or language), a
/// description derived from the sniffed format, and finally `data` for
/// unidentified binaries or the MIME type itself.
// Every argument is a classification result already computed by the caller,
// so they are passed through individually.
#[allow(clippy::too_many_arguments)]
fn detect_file_type(
    path: &Path,
    bytes: &[u8],
    detected_format: FileFormat,
    mime_type: &str,
    programming_language: Option<&str>,
    is_binary: bool,
    is_text: bool,
    is_archive: bool,
    is_media: bool,
    is_script: bool,
) -> String {
    if bytes.is_empty() {
        return "empty".to_string();
    }

    if looks_like_pdf(bytes) {
        return "PDF document".to_string();
    }

    if let Some(media_type) = media_file_type_from_content(bytes) {
        return media_type.to_string();
    }

    if is_archive {
        return archive_file_type(path, bytes, detected_format);
    }

    if is_script {
        return script_file_type(programming_language, bytes);
    }

    if is_text {
        let extension = lower_extension(path);

        // Structured-text extensions get a fixed description.
        match extension.as_deref() {
            Some("json") => {
                return if has_valid_json_text(bytes) {
                    "JSON text data".to_string()
                } else {
                    text_file_type(bytes)
                };
            }
            Some("xml") => return "XML text data".to_string(),
            Some("yaml" | "yml") => return "YAML text data".to_string(),
            Some("toml") => return "TOML text data".to_string(),
            Some("ini" | "cfg" | "conf") => return "INI text data".to_string(),
            _ => {}
        }

        if matches!(lower_file_name(path).as_str(), ".gitmodules" | ".gitconfig") {
            return "Git configuration text".to_string();
        }

        // Markdown is described as generic text even when a language was detected.
        if matches!(extension.as_deref(), Some("md" | "markdown")) {
            return text_file_type(bytes);
        }

        if programming_language.is_some() && !is_media {
            return source_file_type(programming_language, bytes);
        }

        return text_file_type(bytes);
    }

    if let Some(format_type) = format_based_file_type(detected_format) {
        return format_type;
    }

    if is_binary && mime_type == "application/octet-stream" {
        "data".to_string()
    } else {
        mime_type.to_string()
    }
}

fn is_textual_source_candidate(path: &Path, programming_language: Option<&str>) -> bool {
    if matches!(programming_language, Some(language) if is_source_like_language(language)) {
        return true;
    }

    if matches!(
        lower_file_name(path).as_str(),
        "dockerfile"
            | "containerfile"
            | "containerfile.core"
            | "apkbuild"
            | "podfile"
            | "jamfile"
            | "jamroot"
            | "meson.build"
            | "build"
            | "workspace"
            | "buck"
            | "default.nix"
            | "flake.nix"
            | "shell.nix"
    ) {
        return true;
    }

    path.extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| {
            matches!(
                ext.to_ascii_lowercase().as_str(),
                "rs" | "py"
                    | "js"
                    | "mjs"
                    | "cjs"
                    | "jsx"
                    | "ts"
                    | "mts"
                    | "cts"
                    | "tsx"
                    | "c"
                    | "cpp"
                    | "cc"
                    | "cxx"
                    | "h"
                    | "hpp"
                    | "m"
                    | "mm"
                    | "s"
                    | "asm"
                    | "java"
                    | "go"
                    | "rb"
                    | "php"
                    | "pl"
                    | "swift"
                    | "sh"
                    | "bash"
                    | "zsh"
                    | "fish"
                    | "ksh"
                    | "ps1"
                    | "psm1"
                    | "psd1"
                    | "awk"
                    | "kt"
                    | "kts"
                    | "dart"
                    | "scala"
                    | "groovy"
                    | "gradle"
                    | "gvy"
                    | "gy"
                    | "gsh"
                    | "cs"
                    | "fs"
                    | "fsx"
                    | "r"
                    | "lua"
                    | "jl"
                    | "ex"
                    | "exs"
                    | "clj"
                    | "cljs"
                    | "cljc"
                    | "hs"
                    | "erl"
                    | "nix"
                    | "zig"
                    | "bzl"
                    | "bazel"
                    | "star"
                    | "sky"
                    | "ml"
                    | "mli"
                    | "tex"
            )
        })
}

/// Returns `true` when `language` is one of the language names treated as
/// source code. The comparison is exact and case-sensitive.
fn is_source_like_language(language: &str) -> bool {
    const SOURCE_LIKE_LANGUAGES: &[&str] = &[
        "Rust",
        "Python",
        "JavaScript",
        "TypeScript",
        "JavaScript/TypeScript",
        "C",
        "C++",
        "Objective-C",
        "Objective-C++",
        "GAS",
        "Java",
        "Go",
        "Ruby",
        "PHP",
        "Perl",
        "Swift",
        "Shell",
        "PowerShell",
        "Awk",
        "Kotlin",
        "Dart",
        "Scala",
        "C#",
        "F#",
        "R",
        "Lua",
        "Julia",
        "Elixir",
        "Clojure",
        "Haskell",
        "Erlang",
        "Groovy",
        "Nix",
        "Zig",
        "Starlark",
        "OCaml",
        "Meson",
        "TeX",
        "Dockerfile",
        "Makefile",
        "Jamfile",
    ];

    SOURCE_LIKE_LANGUAGES.contains(&language)
}

/// Returns the extension of `path` as `&str`, or `None` when the path has no
/// extension or the extension is not valid UTF-8.
fn extension(path: &Path) -> Option<&str> {
    let raw = path.extension()?;
    raw.to_str()
}

/// Returns the ASCII-lower-cased extension of `path`, if `extension` yields one.
fn lower_extension(path: &Path) -> Option<String> {
    let ext = extension(path)?;
    Some(ext.to_ascii_lowercase())
}

/// Returns the final path component, ASCII-lower-cased.
///
/// Yields an empty string when the path has no file name or the name is not
/// valid UTF-8.
fn lower_file_name(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_ascii_lowercase(),
        None => String::new(),
    }
}

/// Returns `true` when the lower-cased extension of `path` is listed in
/// `PLAIN_TEXT_EXTENSIONS`.
fn is_plain_text(path: &Path) -> bool {
    match lower_extension(path) {
        Some(ext) => PLAIN_TEXT_EXTENSIONS.iter().any(|known| *known == ext),
        None => false,
    }
}

/// Returns `true` when the file name, compared case-insensitively (ASCII), is
/// `makefile` or `makefile.inc`.
fn is_makefile(path: &Path) -> bool {
    let name = lower_file_name(path);
    name == "makefile" || name == "makefile.inc"
}

/// Returns `true` when the whole path, lower-cased (ASCII), ends with
/// `.js.map` or `.css.map`.
fn is_source_map(path: &Path) -> bool {
    let lowered = path.to_string_lossy().to_ascii_lowercase();
    [".js.map", ".css.map"]
        .iter()
        .any(|suffix| lowered.ends_with(*suffix))
}

/// Returns `true` when the lower-cased extension of `path` is one of the
/// C-family, preprocessed-C, Objective-C or assembly extensions listed below.
fn is_c_like_source(path: &Path) -> bool {
    const C_LIKE_EXTENSIONS: [&str; 16] = [
        "c", "cc", "cp", "cpp", "cxx", "c++", "h", "hh", "hpp", "hxx", "h++", "i", "ii", "m", "s",
        "asm",
    ];

    lower_extension(path).is_some_and(|ext| C_LIKE_EXTENSIONS.contains(&ext.as_str()))
}

/// Returns `true` when the lower-cased extension of `path` is `java`, `aj`,
/// `jad` or `ajt`.
fn is_java_like_source(path: &Path) -> bool {
    matches!(
        lower_extension(path).as_deref(),
        Some("java" | "aj" | "jad" | "ajt")
    )
}

/// Derives a human-readable file type from the detected format.
///
/// Returns `None` for arbitrary binary data, empty input and plain text.
/// A format whose short name is `PDF` is reported as `PDF document`. Image,
/// audio and video formats get a `"<name> <kind> data"` label built by
/// `short_name_or_name`; every other format is reported by its full name.
fn format_based_file_type(detected_format: FileFormat) -> Option<String> {
    if matches!(
        detected_format,
        FileFormat::ArbitraryBinaryData | FileFormat::Empty | FileFormat::PlainText
    ) {
        return None;
    }

    if detected_format.short_name() == Some("PDF") {
        return Some("PDF document".to_string());
    }

    let description = match detected_format.kind() {
        FileFormatKind::Image => short_name_or_name(&detected_format, "image data"),
        FileFormatKind::Audio => short_name_or_name(&detected_format, "audio data"),
        FileFormatKind::Video => short_name_or_name(&detected_format, "video data"),
        _ => detected_format.name().to_string(),
    };
    Some(description)
}

/// Builds `"<label> <suffix>"`, where the label is the format's short name
/// when it has one and its full name otherwise.
fn short_name_or_name(format: &FileFormat, suffix: &str) -> String {
    let label = match format.short_name() {
        Some(short_name) => short_name,
        None => format.name(),
    };
    format!("{label} {suffix}")
}

/// Picks the MIME type for a ZIP-based container from the file extension
/// (compared case-insensitively): APK, Java archive (`jar`/`war`/`ear`), or
/// plain `application/zip` for everything else.
fn detect_zip_like_mime(path: &Path) -> String {
    let ext = extension(path).map(str::to_ascii_lowercase);
    let mime = match ext.as_deref() {
        Some("apk") => "application/vnd.android.package-archive",
        Some("jar" | "war" | "ear") => "application/java-archive",
        _ => "application/zip",
    };
    mime.to_string()
}

/// Sniffs the leading magic bytes and returns the MIME type for PNG, JPEG,
/// TIFF (either byte order) or WebP content; `None` for anything else.
fn media_mime_from_content(bytes: &[u8]) -> Option<&'static str> {
    match bytes {
        [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..] => Some("image/png"),
        [0xff, 0xd8, 0xff, ..] => Some("image/jpeg"),
        [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..] => Some("image/tiff"),
        // RIFF container: bytes 4..8 hold the size, bytes 8..12 the form type.
        [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..] => Some("image/webp"),
        _ => None,
    }
}

/// Sniffs the leading magic bytes and returns a descriptive file type for
/// PNG, JPEG, TIFF (either byte order) or WebP content; `None` otherwise.
fn media_file_type_from_content(bytes: &[u8]) -> Option<&'static str> {
    const PNG_MAGIC: &[u8] = b"\x89PNG\r\n\x1a\n";
    const JPEG_MAGIC: &[u8] = &[0xff, 0xd8, 0xff];
    const TIFF_LITTLE_ENDIAN: &[u8] = b"II\x2a\x00";
    const TIFF_BIG_ENDIAN: &[u8] = b"MM\x00\x2a";

    // RIFF container whose form type (bytes 8..12) is WEBP.
    let is_webp = bytes.starts_with(b"RIFF") && bytes.get(8..12) == Some(&b"WEBP"[..]);

    let label = if bytes.starts_with(PNG_MAGIC) {
        "PNG image data"
    } else if bytes.starts_with(JPEG_MAGIC) {
        "JPEG image data"
    } else if bytes.starts_with(TIFF_LITTLE_ENDIAN) || bytes.starts_with(TIFF_BIG_ENDIAN) {
        "TIFF image data"
    } else if is_webp {
        "WebP image data"
    } else {
        return None;
    };
    Some(label)
}

/// Returns `true` when the content begins with the `%PDF-` header.
fn looks_like_pdf(bytes: &[u8]) -> bool {
    matches!(bytes, [b'%', b'P', b'D', b'F', b'-', ..])
}

/// Returns `true` when `ext` is exactly `rtf` or the content begins with the
/// `{\rtf` header.
fn looks_like_rtf(bytes: &[u8], ext: Option<&str>) -> bool {
    if matches!(ext, Some("rtf")) {
        return true;
    }
    matches!(bytes, [b'{', b'\\', b'r', b't', b'f', ..])
}

/// Extracts readable text from RTF content with a small hand-rolled scanner.
///
/// The input is decoded lossily as UTF-8 and walked character by character:
/// group braces are dropped, a handful of control words are translated to
/// text, all other control words are discarded, and every other character is
/// copied through. Afterwards carriage returns and form feeds become
/// newlines and trailing whitespace is trimmed from each line.
fn extract_rtf_text(bytes: &[u8]) -> String {
    let text = String::from_utf8_lossy(bytes);
    // Indexable view of the text so the scanner can look ahead by position.
    let chars: Vec<char> = text.chars().collect();
    let mut output = String::new();
    let mut index = 0usize;

    while index < chars.len() {
        match chars[index] {
            // Group delimiters carry no text of their own.
            '{' | '}' => {
                index += 1;
            }
            '\\' => {
                index += 1;
                // A trailing lone backslash ends the scan.
                if index >= chars.len() {
                    break;
                }

                match chars[index] {
                    // Escaped literal: `\\`, `\{`, `\}`.
                    '\\' | '{' | '}' => {
                        output.push(chars[index]);
                        index += 1;
                    }
                    // `\'hh`: two hex digits giving a byte value, emitted as
                    // the character with that same code point.
                    '\'' => {
                        if index + 2 < chars.len() {
                            let hex = [chars[index + 1], chars[index + 2]];
                            let hex: String = hex.iter().collect();
                            if let Ok(value) = u8::from_str_radix(&hex, 16) {
                                output.push(value as char);
                                index += 3;
                                continue;
                            }
                        }
                        // Not a valid hex escape: skip only the apostrophe.
                        index += 1;
                    }
                    // Control word: ASCII letters, an optional numeric
                    // parameter, and at most one delimiting space.
                    control if control.is_ascii_alphabetic() => {
                        let start = index;
                        while index < chars.len() && chars[index].is_ascii_alphabetic() {
                            index += 1;
                        }
                        let control_word: String = chars[start..index].iter().collect();

                        // Optional parameter: a leading `-` or digit followed by digits.
                        let number_start = index;
                        if index < chars.len()
                            && (chars[index] == '-' || chars[index].is_ascii_digit())
                        {
                            index += 1;
                            while index < chars.len() && chars[index].is_ascii_digit() {
                                index += 1;
                            }
                        }
                        let parameter: String = chars[number_start..index].iter().collect();

                        // A single space after the control word is consumed as its delimiter.
                        if index < chars.len() && chars[index] == ' ' {
                            index += 1;
                        }

                        match control_word.as_str() {
                            "par" | "line" => output.push('\n'),
                            "tab" => output.push('\t'),
                            "emdash" => output.push('—'),
                            "endash" => output.push('–'),
                            "bullet" => output.push('•'),
                            "lquote" | "rquote" => output.push('\''),
                            "ldblquote" | "rdblquote" => output.push('"'),
                            // `\uN`: N is a code point; negative values are
                            // shifted up by 65536 before conversion.
                            "u" => {
                                if let Ok(codepoint) = parameter.parse::<i32>() {
                                    let normalized = if codepoint < 0 {
                                        codepoint + 65_536
                                    } else {
                                        codepoint
                                    };
                                    if let Ok(normalized) = u32::try_from(normalized)
                                        && let Some(ch) = char::from_u32(normalized)
                                    {
                                        output.push(ch);
                                    }
                                }

                                // Skip the single character that follows `\uN`,
                                // unless it starts another construct or is a
                                // line break.
                                if index < chars.len()
                                    && !matches!(chars[index], '\\' | '{' | '}' | '\n' | '\r')
                                {
                                    index += 1;
                                }
                            }
                            // Every other control word produces no output.
                            _ => {}
                        }
                    }
                    // Any other character after a backslash is skipped.
                    _ => {
                        index += 1;
                    }
                }
            }
            // Plain text is copied through unchanged.
            ch => {
                output.push(ch);
                index += 1;
            }
        }
    }

    // Normalize line breaks and strip trailing whitespace per line.
    output
        .replace(['\r', '\u{0c}'], "\n")
        .lines()
        .map(str::trim_end)
        .collect::<Vec<_>>()
        .join("\n")
}

/// Returns `true` when the content begins with the gzip magic `1f 8b`.
fn looks_like_gzip(bytes: &[u8]) -> bool {
    matches!(bytes, [0x1f, 0x8b, ..])
}

/// Returns `true` when the content begins with the bzip2 magic `BZh`.
fn looks_like_bzip2(bytes: &[u8]) -> bool {
    matches!(bytes, [b'B', b'Z', b'h', ..])
}

/// Returns `true` when the content begins with the six-byte XZ magic
/// `fd 37 7a 58 5a 00`.
fn looks_like_xz(bytes: &[u8]) -> bool {
    matches!(bytes, [0xfd, b'7', b'z', b'X', b'Z', 0x00, ..])
}

/// Returns `true` when the content begins with the `ar` archive magic and the
/// path carries a `.deb` extension (case-insensitive).
fn looks_like_deb(bytes: &[u8], path: &Path) -> bool {
    const AR_MAGIC: &[u8] = b"!<arch>\n";

    if !bytes.starts_with(AR_MAGIC) {
        return false;
    }
    matches!(lower_extension(path).as_deref(), Some("deb"))
}

/// Returns `true` when the content begins with the RPM lead magic
/// `ed ab ee db` and the path carries an `.rpm` extension (case-insensitive).
fn looks_like_rpm(bytes: &[u8], path: &Path) -> bool {
    let has_rpm_magic = matches!(bytes, [0xed, 0xab, 0xee, 0xdb, ..]);
    let has_rpm_extension = matches!(lower_extension(path).as_deref(), Some("rpm"));
    has_rpm_extension && has_rpm_magic
}

/// Returns `true` when the path has a `.sqs` or `.squashfs` extension
/// (case-insensitive) and the content begins with either four-byte magic
/// `68 73 71 73` or `73 71 73 68`.
fn looks_like_squashfs(bytes: &[u8], path: &Path) -> bool {
    if !matches!(lower_extension(path).as_deref(), Some("sqs" | "squashfs")) {
        return false;
    }
    matches!(
        bytes,
        [0x68, 0x73, 0x71, 0x73, ..] | [0x73, 0x71, 0x73, 0x68, ..]
    )
}

/// Chooses the descriptive file type for something already classified as an
/// archive.
///
/// Content/extension sniffers are tried in a fixed order (deb, rpm, squashfs,
/// gzip, bzip2, xz, zip, then the `.gem` extension). When none matches, the
/// label comes from `format_based_file_type`, with `archive data` as the
/// last resort.
fn archive_file_type(path: &Path, bytes: &[u8], detected_format: FileFormat) -> String {
    let label = if looks_like_deb(bytes, path) {
        "debian binary package (format 2.0)"
    } else if looks_like_rpm(bytes, path) {
        "RPM package"
    } else if looks_like_squashfs(bytes, path) {
        "Squashfs filesystem"
    } else if looks_like_gzip(bytes) {
        "gzip compressed data"
    } else if looks_like_bzip2(bytes) {
        "bzip2 compressed data"
    } else if looks_like_xz(bytes) {
        "XZ compressed data"
    } else if is_zip_archive(bytes) {
        "Zip archive data"
    } else if lower_extension(path).as_deref() == Some("gem") {
        "POSIX tar archive"
    } else {
        return format_based_file_type(detected_format)
            .unwrap_or_else(|| "archive data".to_string());
    };

    label.to_string()
}

/// Builds the file type for a script: a language-specific prefix (or plain
/// `script` for any other or missing language) followed by the label from
/// `text_executable_label`.
fn script_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
    let kind = match programming_language {
        Some("Python") => "python script",
        Some("Ruby") => "ruby script",
        Some("Perl") => "perl script",
        Some("PHP") => "php script",
        Some("Shell") => "shell script",
        Some("Bash") => "bash script",
        Some("Zsh") => "zsh script",
        Some("Fish") => "fish script",
        Some("Ksh") => "ksh script",
        Some("JavaScript") => "javascript script",
        Some("TypeScript") => "typescript script",
        Some("PowerShell") => "powershell script",
        Some("Awk") => "awk script",
        _ => "script",
    };
    let suffix = text_executable_label(bytes);

    format!("{kind}, {suffix}")
}

/// Builds the file type for source code: `"<language> source, <text label>"`
/// when a language is known, otherwise the plain text file type.
fn source_file_type(programming_language: Option<&str>, bytes: &[u8]) -> String {
    match programming_language {
        // Every language uses its own name verbatim as the prefix.
        Some(language) => format!("{language} source, {}", text_label(bytes)),
        None => text_file_type(bytes),
    }
}

/// Returns the label from `text_label` as an owned `String`.
fn text_file_type(bytes: &[u8]) -> String {
    String::from(text_label(bytes))
}

/// Describes text content by two properties: whether the bytes are valid
/// UTF-8 and whether they contain at least one `\n`.
fn text_label(bytes: &[u8]) -> &'static str {
    let is_utf8 = std::str::from_utf8(bytes).is_ok();
    let has_newline = bytes.contains(&b'\n');

    match (is_utf8, has_newline) {
        (true, true) => "UTF-8 Unicode text",
        (true, false) => "UTF-8 Unicode text, with no line terminators",
        (false, true) => "text",
        (false, false) => "text, with no line terminators",
    }
}

/// Describes executable text content by two properties: whether the bytes
/// are valid UTF-8 and whether they contain at least one `\n`.
fn text_executable_label(bytes: &[u8]) -> &'static str {
    let has_newline = bytes.iter().any(|&byte| byte == b'\n');

    match (std::str::from_utf8(bytes), has_newline) {
        (Ok(_), true) => "UTF-8 Unicode text executable",
        (Ok(_), false) => "UTF-8 Unicode text executable, with no line terminators",
        (Err(_), true) => "text executable",
        (Err(_), false) => "text executable, with no line terminators",
    }
}

/// Resolves the image format used for metadata extraction.
///
/// The extension (expected to be lower-case already, as the comparison is
/// exact) takes precedence; when it names no supported format, the detected
/// format's media type is consulted. Only JPEG, PNG, TIFF and WebP are
/// supported.
fn supported_image_metadata_format(
    ext: Option<&str>,
    detected_format: FileFormat,
) -> Option<ImageFormat> {
    let from_extension = match ext {
        Some("jpg" | "jpeg") => Some(ImageFormat::Jpeg),
        Some("png") => Some(ImageFormat::Png),
        Some("tif" | "tiff") => Some(ImageFormat::Tiff),
        Some("webp") => Some(ImageFormat::WebP),
        _ => None,
    };

    from_extension.or_else(|| match detected_format.media_type() {
        "image/jpeg" => Some(ImageFormat::Jpeg),
        "image/png" => Some(ImageFormat::Png),
        "image/tiff" => Some(ImageFormat::Tiff),
        "image/webp" => Some(ImageFormat::WebP),
        _ => None,
    })
}

/// Decides whether printable-string extraction should be skipped for a file.
///
/// Extraction is skipped for `.pdf` files, for images with a supported
/// metadata format, for non-textual audio/image/video formats, for content
/// with a recognized media signature, and for zip, gzip, bzip2, xz, deb, rpm
/// and squashfs content.
fn should_skip_binary_string_extraction(
    path: &Path,
    bytes: &[u8],
    detected_format: FileFormat,
) -> bool {
    let ext = lower_extension(path);

    if ext.as_deref() == Some("pdf") {
        return true;
    }
    if supported_image_metadata_format(ext.as_deref(), detected_format).is_some() {
        return true;
    }

    let is_opaque_media = matches!(
        detected_format.kind(),
        FileFormatKind::Audio | FileFormatKind::Image | FileFormatKind::Video
    ) && !is_textual_format(detected_format);
    if is_opaque_media || media_mime_from_content(bytes).is_some() {
        return true;
    }

    let is_compressed_stream =
        looks_like_gzip(bytes) || looks_like_bzip2(bytes) || looks_like_xz(bytes);
    let is_package_or_image = looks_like_deb(bytes, path)
        || looks_like_rpm(bytes, path)
        || looks_like_squashfs(bytes, path);

    is_zip_archive(bytes) || is_compressed_stream || is_package_or_image
}

/// Returns `true` for a large opaque binary whose sampled windows show no
/// promising printable strings. `_path` is accepted but not consulted.
fn should_skip_large_opaque_binary_text_extraction(
    _path: &Path,
    bytes: &[u8],
    detected_format: FileFormat,
) -> bool {
    if !is_large_opaque_binary_candidate(bytes, detected_format) {
        return false;
    }
    !sample_has_promising_printable_strings(bytes)
}

/// Returns `true` when the content is at least
/// `LARGE_OPAQUE_BINARY_SKIP_BYTES` long, is not a textual format, and its
/// detected kind is none of archive, compressed, package, audio, image or
/// video.
fn is_large_opaque_binary_candidate(bytes: &[u8], detected_format: FileFormat) -> bool {
    if bytes.len() < LARGE_OPAQUE_BINARY_SKIP_BYTES || is_textual_format(detected_format) {
        return false;
    }

    let is_container_or_media = matches!(
        detected_format.kind(),
        FileFormatKind::Archive
            | FileFormatKind::Compressed
            | FileFormatKind::Package
            | FileFormatKind::Audio
            | FileFormatKind::Image
            | FileFormatKind::Video
    );
    !is_container_or_media
}

/// Chooses up to three `(start, end)` byte ranges to sample from a buffer of
/// `len` bytes: the head, a window centred on the midpoint (only when the
/// buffer exceeds two windows), and the tail (only when it exceeds one).
///
/// Empty ranges and exact duplicates are dropped; the order is head, middle,
/// tail.
fn sampled_printable_window_ranges(len: usize) -> Vec<(usize, usize)> {
    const SAMPLE_WINDOW_BYTES: usize = 64 * 1024;

    let head = (0, len.min(SAMPLE_WINDOW_BYTES));
    let middle = (len > SAMPLE_WINDOW_BYTES * 2).then(|| {
        let start = len / 2 - SAMPLE_WINDOW_BYTES / 2;
        (start, (start + SAMPLE_WINDOW_BYTES).min(len))
    });
    let tail = (len > SAMPLE_WINDOW_BYTES).then(|| (len - SAMPLE_WINDOW_BYTES, len));

    let mut ranges: Vec<(usize, usize)> = Vec::new();
    for candidate in std::iter::once(head).chain(middle).chain(tail) {
        let (start, end) = candidate;
        if start < end && !ranges.contains(&candidate) {
            ranges.push(candidate);
        }
    }

    ranges
}

/// Extracts printable strings from bounded windows around legal markers in a
/// Mach-O binary.
///
/// Returns an empty string when the content is not Mach-O (thin or fat) or no
/// marker is found. Windows are collected per marker, merged where they
/// overlap, and read until `LARGE_MACHO_LEGAL_MAX_EXTRACT_BYTES` bytes have
/// been consumed. The result is the set of distinct, trimmed, non-empty
/// lines in sorted order, joined with `\n`.
fn extract_bounded_macho_legal_strings(bytes: &[u8]) -> String {
    let is_macho = matches!(
        FileKind::parse(bytes),
        Ok(FileKind::MachO32 | FileKind::MachO64 | FileKind::MachOFat32 | FileKind::MachOFat64)
    );
    if !is_macho {
        return String::new();
    }

    let mut marker_windows = Vec::new();
    for marker in LARGE_MACHO_LEGAL_MARKERS {
        collect_marker_window_ranges(bytes, marker, &mut marker_windows);
        if marker_windows.len() >= LARGE_MACHO_LEGAL_MAX_WINDOWS {
            break;
        }
    }
    if marker_windows.is_empty() {
        return String::new();
    }

    let mut unique_lines = BTreeSet::new();
    // Bytes that may still be read before the extraction cap is reached.
    let mut budget = LARGE_MACHO_LEGAL_MAX_EXTRACT_BYTES;

    for (start, end) in merge_overlapping_ranges(marker_windows) {
        if budget == 0 {
            break;
        }
        // The last window may be clipped so the total never exceeds the cap.
        let take = (end - start).min(budget);
        let clipped_end = start.saturating_add(take);
        let window_text = extract_printable_strings(&bytes[start..clipped_end]);
        unique_lines.extend(
            window_text
                .lines()
                .map(str::trim)
                .filter(|line| !line.is_empty())
                .map(str::to_string),
        );
        budget -= clipped_end - start;
    }

    unique_lines.into_iter().collect::<Vec<_>>().join("\n")
}

/// Appends to `ranges` one window per non-overlapping occurrence of `marker`
/// in `bytes`.
///
/// Each window extends half of `LARGE_MACHO_LEGAL_WINDOW_BYTES` before and
/// after the match, clamped to the buffer. Collection stops once
/// `LARGE_MACHO_LEGAL_MAX_WINDOWS_PER_MARKER` hits were recorded for this
/// marker or `ranges` holds `LARGE_MACHO_LEGAL_MAX_WINDOWS` entries. An
/// empty marker adds nothing.
fn collect_marker_window_ranges(bytes: &[u8], marker: &[u8], ranges: &mut Vec<(usize, usize)>) {
    let Some(&first_byte) = marker.first() else {
        return;
    };
    let half_window = LARGE_MACHO_LEGAL_WINDOW_BYTES / 2;

    let mut cursor = 0usize;
    let mut hits = 0usize;

    while hits < LARGE_MACHO_LEGAL_MAX_WINDOWS_PER_MARKER
        && ranges.len() < LARGE_MACHO_LEGAL_MAX_WINDOWS
        && cursor + marker.len() <= bytes.len()
    {
        // Jump to the next byte that could start the marker.
        let candidate = match bytes[cursor..].iter().position(|&b| b == first_byte) {
            Some(offset) => cursor + offset,
            None => break,
        };

        if bytes[candidate..].starts_with(marker) {
            let match_end = candidate + marker.len();
            ranges.push((
                candidate.saturating_sub(half_window),
                (match_end + half_window).min(bytes.len()),
            ));
            hits += 1;
            // Resume after the match so occurrences never overlap.
            cursor = match_end;
        } else {
            cursor = candidate + 1;
        }
    }
}

/// Sorts the ranges and coalesces every pair that overlaps or touches
/// (a range starting exactly where the previous one ends is merged too).
fn merge_overlapping_ranges(mut ranges: Vec<(usize, usize)>) -> Vec<(usize, usize)> {
    ranges.sort_unstable();

    let mut merged: Vec<(usize, usize)> = Vec::with_capacity(ranges.len());
    for (start, end) in ranges {
        if let Some(last) = merged.last_mut() {
            if start <= last.1 {
                // Extend the open range instead of starting a new one.
                last.1 = last.1.max(end);
                continue;
            }
        }
        merged.push((start, end));
    }

    merged
}

/// Inspects every sampled window and reports whether the buffer looks worth
/// a full string extraction: either some window shows a strong structured
/// text signal, or at least two windows show a license/notice signal.
fn sample_has_promising_printable_strings(bytes: &[u8]) -> bool {
    let mut structured_signal_seen = false;
    let mut license_windows = 0usize;

    for (start, end) in sampled_printable_window_ranges(bytes.len()) {
        let window = &bytes[start..end];
        // Both checks run on every window; neither short-circuits the other.
        structured_signal_seen |= has_strong_structured_text_signal(window);
        if has_license_or_notice_signal(window) {
            license_windows += 1;
        }
    }

    structured_signal_seen || license_windows >= 2
}

/// Extracts printable strings from the sampled windows only and returns the
/// distinct, trimmed, non-empty lines in sorted order, joined with `\n`.
fn extract_sampled_printable_strings(bytes: &[u8]) -> String {
    let mut unique_lines = BTreeSet::new();

    for (start, end) in sampled_printable_window_ranges(bytes.len()) {
        let window_text = extract_printable_strings(&bytes[start..end]);
        unique_lines.extend(
            window_text
                .lines()
                .map(str::trim)
                .filter(|line| !line.is_empty())
                .map(str::to_string),
        );
    }

    let ordered: Vec<String> = unique_lines.into_iter().collect();
    ordered.join("\n")
}

/// Returns `true` when the printable strings of `bytes`, lower-cased (ASCII),
/// contain at least one of the license/notice marker phrases.
fn has_license_or_notice_signal(bytes: &[u8]) -> bool {
    const MARKERS: [&str; 7] = [
        "copyright",
        "license",
        "licensed under",
        "all rights reserved",
        "permission is hereby granted",
        "redistribution and use",
        "spdx-license-identifier",
    ];

    let strings = extract_printable_strings(bytes);
    if strings.is_empty() {
        return false;
    }

    let lowered = strings.to_ascii_lowercase();
    MARKERS.iter().any(|marker| lowered.contains(marker))
}

/// Returns `true` when the printable strings of `bytes` contain at least
/// three `@` characters and `http://` / `https://` occurrences combined.
fn has_strong_structured_text_signal(bytes: &[u8]) -> bool {
    let strings = extract_printable_strings(bytes);
    if strings.is_empty() {
        return false;
    }

    let email_markers = strings.matches('@').count();
    let url_markers: usize = ["http://", "https://"]
        .iter()
        .map(|scheme| strings.matches(*scheme).count())
        .sum();

    email_markers + url_markers >= 3
}

/// Verifies that `bytes` begin with the container signature expected for
/// `format`. Only PNG, JPEG, TIFF and WebP are supported; every other format
/// yields `false`.
fn is_supported_image_container(bytes: &[u8], format: ImageFormat) -> bool {
    match format {
        ImageFormat::Png => {
            matches!(bytes, [0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n', ..])
        }
        ImageFormat::Jpeg => matches!(bytes, [0xff, 0xd8, 0xff, ..]),
        ImageFormat::Tiff => matches!(
            bytes,
            [b'I', b'I', 0x2a, 0x00, ..] | [b'M', b'M', 0x00, 0x2a, ..]
        ),
        // RIFF container: bytes 4..8 hold the size, bytes 8..12 the form type.
        ImageFormat::WebP => matches!(
            bytes,
            [b'R', b'I', b'F', b'F', _, _, _, _, b'W', b'E', b'B', b'P', ..]
        ),
        _ => false,
    }
}

/// Collects EXIF values followed by XMP values and renders them through
/// `values_to_text`.
fn extract_image_metadata_text(bytes: &[u8], format: ImageFormat) -> String {
    let values: Vec<String> = extract_exif_metadata_values(bytes)
        .into_iter()
        .chain(extract_xmp_metadata_values(bytes, format))
        .collect();
    values_to_text(values)
}

/// Reads EXIF data from the container and returns `"Label: value"` lines for
/// the description, copyright, user comment and artist tags.
///
/// Returns an empty vector when the EXIF reader reports an error.
fn extract_exif_metadata_values(bytes: &[u8]) -> Vec<String> {
    let mut container = BufReader::new(Cursor::new(bytes));
    let Ok(exif) = exif::Reader::new().read_from_container(&mut container) else {
        return Vec::new();
    };

    exif.fields()
        .filter_map(|field| {
            // Only these four tags are surfaced; everything else is ignored.
            let label = match field.tag {
                exif::Tag::ImageDescription => "Description",
                exif::Tag::Copyright => "Copyright",
                exif::Tag::UserComment => "Comment",
                exif::Tag::Artist => "Author",
                _ => return None,
            };
            let rendered = field.display_value().with_unit(&exif).to_string();
            Some(format_metadata_field(label, &rendered))
        })
        .collect()
}

/// Locates the raw XMP packet for `format` and parses its values; returns an
/// empty vector when no packet is available.
fn extract_xmp_metadata_values(bytes: &[u8], format: ImageFormat) -> Vec<String> {
    extract_raw_xmp_packet(bytes, format)
        .map(|xmp| parse_xmp_values(&xmp))
        .unwrap_or_default()
}

/// Obtains the raw XMP packet for an image.
///
/// The image decoder is asked first; a packet it returns is accepted only
/// when it is no larger than `MAX_XMP_PACKET_BYTES` (an oversized packet
/// yields `None` without trying the fallback). When the decoder provides no
/// packet, PNG input falls back to scanning the chunks directly.
fn extract_raw_xmp_packet(bytes: &[u8], format: ImageFormat) -> Option<Vec<u8>> {
    let reader = ImageReader::with_format(BufReader::new(Cursor::new(bytes)), format);
    let decoder_xmp = reader
        .into_decoder()
        .ok()
        .and_then(|mut decoder| decoder.xmp_metadata().ok().flatten());

    if let Some(xmp) = decoder_xmp {
        return if xmp.len() <= MAX_XMP_PACKET_BYTES {
            Some(xmp)
        } else {
            None
        };
    }

    if matches!(format, ImageFormat::Png) {
        extract_png_xmp_packet(bytes)
    } else {
        None
    }
}

/// Scans a PNG byte stream for an `iTXt` chunk that carries an XMP packet.
///
/// Returns `None` when the signature is missing, when a chunk's declared
/// length runs past the end of the buffer, or when no `iTXt` chunk yields a
/// packet via `parse_png_itxt_xmp`. Chunk CRCs are skipped, not verified.
fn extract_png_xmp_packet(bytes: &[u8]) -> Option<Vec<u8>> {
    const PNG_SIGNATURE: &[u8; 8] = b"\x89PNG\r\n\x1a\n";
    // 4-byte big-endian data length followed by the 4-byte chunk type.
    const CHUNK_HEADER_BYTES: usize = 8;
    // CRC that trails every chunk's data.
    const CHUNK_CRC_BYTES: usize = 4;

    let mut remaining = bytes.strip_prefix(PNG_SIGNATURE)?;

    while remaining.len() >= CHUNK_HEADER_BYTES + CHUNK_CRC_BYTES {
        let (header, body) = remaining.split_at(CHUNK_HEADER_BYTES);
        let length = u32::from_be_bytes([header[0], header[1], header[2], header[3]]) as usize;
        let chunk_type = &header[4..];

        // The length comes straight from the file, so the framing arithmetic
        // must not wrap (possible on 32-bit targets) before the bounds check.
        let framed_len = length.checked_add(CHUNK_CRC_BYTES)?;
        if framed_len > body.len() {
            return None;
        }

        if chunk_type == b"iTXt" {
            if let Some(xmp) = parse_png_itxt_xmp(&body[..length]) {
                return Some(xmp);
            }
        }

        remaining = &body[framed_len..];
    }

    None
}

/// Parses the data of a PNG `iTXt` chunk and returns its text payload when
/// the keyword is `XML:com.adobe.xmp`.
///
/// Layout walked here: keyword, NUL, compression flag, compression method,
/// NUL-terminated language tag, NUL-terminated translated keyword, text.
/// A compressed payload (flag 1, method 0) is inflated with zlib. Returns
/// `None` for a different keyword, a truncated header, an unsupported
/// compression setting, a decode error, or a payload larger than
/// `MAX_XMP_PACKET_BYTES`.
fn parse_png_itxt_xmp(data: &[u8]) -> Option<Vec<u8>> {
    const XMP_KEYWORD: &[u8] = b"XML:com.adobe.xmp";

    /// Returns the bytes that follow the first NUL in `field`, if there is one.
    fn skip_nul_terminated(field: &[u8]) -> Option<&[u8]> {
        let nul = field.iter().position(|&b| b == 0)?;
        Some(&field[nul + 1..])
    }

    let keyword_len = data.iter().position(|&b| b == 0)?;
    let (keyword, after_keyword) = data.split_at(keyword_len);
    if keyword != XMP_KEYWORD {
        return None;
    }

    // `after_keyword` starts at the keyword's NUL terminator.
    let (compression_flag, compression_method, after_flags) = match after_keyword {
        [_nul, flag, method, rest @ ..] => (*flag, *method, rest),
        _ => return None,
    };
    let supported = match compression_flag {
        0 => true,
        1 => compression_method == 0,
        _ => false,
    };
    if !supported {
        return None;
    }

    let after_language = skip_nul_terminated(after_flags)?;
    let text_bytes = skip_nul_terminated(after_language)?;

    if compression_flag != 1 {
        if text_bytes.len() > MAX_XMP_PACKET_BYTES {
            return None;
        }
        return Some(text_bytes.to_vec());
    }

    // Read one byte past the cap so an oversized stream can be told apart
    // from one that is exactly at the limit.
    let mut inflated = Vec::new();
    ZlibDecoder::new(text_bytes)
        .take((MAX_XMP_PACKET_BYTES + 1) as u64)
        .read_to_end(&mut inflated)
        .ok()?;
    if inflated.len() > MAX_XMP_PACKET_BYTES {
        return None;
    }
    Some(inflated)
}

/// Walks an XMP packet and collects the text found inside allowed fields.
///
/// A stack of open element local names is maintained; text and CDATA are
/// attributed to the innermost open element that `allowed_xmp_field`
/// recognizes. Blank values and undecodable text are skipped. Parsing stops
/// at end of input or at the first reader error, keeping what was collected.
fn parse_xmp_values(xmp: &[u8]) -> Vec<String> {
    /// Finds the innermost open element that maps to an allowed XMP field.
    fn innermost_allowed_field(open_elements: &[String]) -> Option<&'static str> {
        open_elements
            .iter()
            .rev()
            .find_map(|name| allowed_xmp_field(name))
    }

    let mut reader = XmlReader::from_reader(xmp);
    reader.config_mut().trim_text(true);

    let mut buf = Vec::new();
    let mut open_elements: Vec<String> = Vec::new();
    let mut values = Vec::new();

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(element)) => {
                open_elements.push(local_xml_name(element.name().as_ref()));
            }
            Ok(Event::End(_)) => {
                open_elements.pop();
            }
            Ok(Event::Text(text)) => {
                if let Some(field) = innermost_allowed_field(&open_elements) {
                    if let Ok(decoded) = text.decode() {
                        if !decoded.trim().is_empty() {
                            values.push(format_xmp_value(field, &decoded));
                        }
                    }
                }
            }
            Ok(Event::CData(text)) => {
                if let Some(field) = innermost_allowed_field(&open_elements) {
                    if let Ok(decoded) = text.decode() {
                        if !decoded.trim().is_empty() {
                            values.push(format_xmp_value(field, &decoded));
                        }
                    }
                }
            }
            Ok(Event::Eof) | Err(_) => break,
            // Self-closing elements and all other events carry no text.
            _ => {}
        }
        buf.clear();
    }

    values
}

/// Returns the part of a qualified XML name after the last `:`, or the whole
/// name when it has no prefix. Names that are not valid UTF-8 become an
/// empty string.
fn local_xml_name(name: &[u8]) -> String {
    let qualified = std::str::from_utf8(name).unwrap_or_default();
    let local = match qualified.rsplit_once(':') {
        Some((_prefix, local)) => local,
        None => qualified,
    };
    local.to_string()
}

/// Maps an XML local name to the internal XMP field key, or `None` when the
/// element is not one of the fields that get extracted. Matching is exact
/// and case-sensitive.
fn allowed_xmp_field(name: &str) -> Option<&'static str> {
    // (XML local name, internal field key)
    const FIELDS: [(&str, &str); 7] = [
        ("creator", "creator"),
        ("rights", "rights"),
        ("description", "description"),
        ("title", "title"),
        ("subject", "subject"),
        ("UsageTerms", "usage_terms"),
        ("WebStatement", "web_statement"),
    ];

    FIELDS
        .iter()
        .find(|(xml_name, _)| *xml_name == name)
        .map(|&(_, field)| field)
}

fn format_xmp_value(field: &str, value: &str) -> String {
    match field {
        "creator" => format_metadata_field("Author", value),
        "rights" => format_metadata_field("Copyright", value),
        "description" => format_metadata_field("Description", value),
        "title" => format_metadata_field("Title", value),
        "subject" => format_metadata_field("Subject", value),
        "usage_terms" => format_metadata_field("UsageTerms", value),
        "web_statement" => format_metadata_field("WebStatement", value),
        _ => value.to_string(),
    }
}

/// Joins `label` and `value` into a single `label: value` line.
fn format_metadata_field(label: &str, value: &str) -> String {
    const SEPARATOR: &str = ": ";

    let mut line = String::with_capacity(label.len() + SEPARATOR.len() + value.len());
    line.push_str(label);
    line.push_str(SEPARATOR);
    line.push_str(value);
    line
}

/// Turns raw metadata values into newline-separated text.
///
/// Processing happens in three stages:
/// 1. every value is normalized; empty results and exact duplicates are
///    dropped, keeping first-seen order;
/// 2. the values of all `Author:` lines are collected;
/// 3. lines are emitted until either `MAX_IMAGE_METADATA_VALUES` lines or
///    `MAX_IMAGE_METADATA_TEXT_BYTES` bytes (including the joining newlines)
///    would be exceeded, skipping `Copyright:` lines that merely repeat an
///    author name.
fn values_to_text(values: Vec<String>) -> String {
    let mut already_seen = BTreeSet::new();
    let unique_lines: Vec<String> = values
        .into_iter()
        .map(|raw| normalize_metadata_value(&raw))
        .filter(|line| !line.is_empty() && already_seen.insert(line.clone()))
        .collect();

    let mut author_values: BTreeSet<String> = BTreeSet::new();
    for line in &unique_lines {
        let Some((label, value)) = split_metadata_field(line) else {
            continue;
        };
        if label.eq_ignore_ascii_case("Author") {
            author_values.insert(value.to_string());
        }
    }

    let mut kept: Vec<String> = Vec::new();
    let mut used_bytes = 0usize;

    for line in unique_lines {
        // The line-count limit is checked before suppression, so a suppressed
        // line never counts against it but also cannot bypass it.
        if kept.len() >= MAX_IMAGE_METADATA_VALUES {
            break;
        }

        if should_suppress_bare_copyright_metadata_line(&line, &author_values) {
            continue;
        }

        // Every line after the first costs one extra byte for the "\n" joiner.
        let separator_bytes = if kept.is_empty() { 0 } else { 1 };
        let cost = line.len() + separator_bytes;
        if used_bytes + cost > MAX_IMAGE_METADATA_TEXT_BYTES {
            break;
        }

        used_bytes += cost;
        kept.push(line);
    }

    kept.join("\n")
}

/// Splits a `label: value` line at its first `:` and trims both halves.
///
/// Returns `None` when the line contains no `:` at all.
fn split_metadata_field(line: &str) -> Option<(&str, &str)> {
    line.split_once(':')
        .map(|(raw_label, raw_value)| (raw_label.trim(), raw_value.trim()))
}

/// Decides whether a `Copyright:` metadata line should be dropped because it
/// only repeats an author name.
///
/// A line is suppressed when all of the following hold:
/// - it parses as `label: value` with a label equal to `Copyright`
///   (ASCII case-insensitive) and a non-empty value;
/// - the value is exactly one of `author_values`;
/// - the value carries no copyright-like marker (`copyright`, `(c)`, `©`,
///   `all rights`, matched case-insensitively for ASCII) and no ASCII digit.
fn should_suppress_bare_copyright_metadata_line(
    line: &str,
    author_values: &BTreeSet<String>,
) -> bool {
    const COPYRIGHT_MARKERS: [&str; 4] = ["copyright", "(c)", "©", "all rights"];

    match split_metadata_field(line) {
        Some((label, value))
            if label.eq_ignore_ascii_case("Copyright")
                && !value.is_empty()
                && author_values.contains(value) =>
        {
            let lowered = value.to_ascii_lowercase();
            let has_marker = COPYRIGHT_MARKERS
                .iter()
                .any(|marker| lowered.contains(marker));
            // ASCII digit bytes only ever occur as ASCII characters in UTF-8.
            let has_digit = value.bytes().any(|byte| byte.is_ascii_digit());

            !(has_marker || has_digit)
        }
        _ => false,
    }
}

/// Normalizes a metadata value for output and de-duplication.
///
/// NUL characters are removed outright (they do not act as separators), every
/// run of whitespace collapses to a single space, and leading/trailing
/// whitespace is dropped.
fn normalize_metadata_value(value: &str) -> String {
    let mut normalized = String::with_capacity(value.len());
    let mut space_pending = false;

    for ch in value.chars() {
        if ch == '\0' {
            continue;
        }

        if ch.is_whitespace() {
            // Only remember a separator once some text has been emitted, so
            // leading whitespace never produces a space.
            space_pending = !normalized.is_empty();
            continue;
        }

        if space_pending {
            normalized.push(' ');
            space_pending = false;
        }
        normalized.push(ch);
    }

    normalized
}

/// Extracts text from a PDF, returning `(text, diagnostic)`.
///
/// Input that does not start with `%PDF-` yields empty text and no
/// diagnostic. Input larger than `MAX_PDF_TEXT_EXTRACTION_BYTES` is skipped
/// with a diagnostic.
///
/// Otherwise up to three strategies are tried in order, each guarded by
/// `catch_unwind`:
/// 1. first page only, parsed from `bytes`;
/// 2. whole document, opened from `path`;
/// 3. whole document, parsed from `bytes`.
///
/// The first strategy that produces non-blank text wins. If none does, a
/// diagnostic listing every failure is returned, unless at least one strategy
/// ran without error or all failures are classified as non-actionable.
fn extract_pdf_text(path: &Path, bytes: &[u8]) -> (String, Option<String>) {
    type AttemptResult = Result<String, Box<dyn std::error::Error>>;

    if !bytes.starts_with(b"%PDF-") {
        return (String::new(), None);
    }

    if bytes.len() > MAX_PDF_TEXT_EXTRACTION_BYTES {
        let message = format!(
            "PDF text extraction skipped because file exceeds {} bytes",
            MAX_PDF_TEXT_EXTRACTION_BYTES
        );
        return (String::new(), Some(message));
    }

    let first_page_from_bytes: &dyn Fn() -> AttemptResult = &|| -> AttemptResult {
        let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
        extract_first_pdf_page_text(&mut document)
    };
    let full_document_from_path: &dyn Fn() -> AttemptResult = &|| -> AttemptResult {
        let mut document = pdf_oxide::document::PdfDocument::open(path)?;
        extract_pdf_text_from_document(&mut document)
    };
    let full_document_from_bytes: &dyn Fn() -> AttemptResult = &|| -> AttemptResult {
        let mut document = pdf_oxide::document::PdfDocument::from_bytes(bytes.to_vec())?;
        extract_pdf_text_from_document(&mut document)
    };

    // Order matters: later strategies only run when earlier ones gave no text.
    let attempts = [
        ("from-bytes first-page", first_page_from_bytes),
        ("open full-document", full_document_from_path),
        ("from-bytes full-document", full_document_from_bytes),
    ];

    let mut failures = Vec::new();
    let mut saw_success = false;

    for (label, attempt) in attempts {
        match catch_unwind(AssertUnwindSafe(attempt)) {
            Ok(Ok(text)) => {
                saw_success = true;
                if let Some(normalized) = normalize_pdf_text(text) {
                    return (normalized, None);
                }
            }
            Ok(Err(err)) => failures.push(format!("{label}: {err}")),
            Err(payload) => failures.push(format!(
                "{label} panic: {}",
                panic_payload_to_string(payload.as_ref())
            )),
        }
    }

    // A clean run that simply found no text is not an error worth reporting.
    if saw_success || is_non_actionable_pdf_failure(&failures) {
        return (String::new(), None);
    }

    let summary = format!(
        "PDF text extraction failed after {} attempts: {}",
        failures.len(),
        failures.join("; ")
    );
    (String::new(), Some(summary))
}

/// Returns `true` when there is at least one failure and every failure message
/// contains one of the known non-actionable markers (password-protected or
/// encryption-related documents, or an invalid cross-reference table).
fn is_non_actionable_pdf_failure(failures: &[String]) -> bool {
    const NON_ACTIONABLE_MARKERS: [&str; 5] = [
        "requires a password",
        "Encrypt dictionary missing /O",
        "Encrypt dictionary missing /U",
        "security handler cannot be found",
        "Invalid cross-reference table",
    ];

    if failures.is_empty() {
        return false;
    }

    failures.iter().all(|failure| {
        NON_ACTIONABLE_MARKERS
            .iter()
            .any(|marker| failure.contains(marker))
    })
}

/// Extracts a human-readable message from a panic payload.
///
/// `&str` and `String` payloads are returned as-is; any other payload type
/// yields the fixed text `unknown panic payload`.
fn panic_payload_to_string(payload: &(dyn std::any::Any + Send)) -> String {
    payload
        .downcast_ref::<&str>()
        .map(|message| (*message).to_string())
        .or_else(|| payload.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic payload".to_string())
}

/// Extracts text for page index 0 of `document`.
///
/// Returns an empty string when the document reports zero pages. When the
/// page's markdown rendering contains no usable heading lines, the plain
/// `extract_text` output is returned as-is; otherwise the plain-text pipeline
/// output is merged with the markdown headings via
/// `merge_pdf_first_page_text`.
///
/// Any error from the underlying `pdf_oxide` calls is propagated.
fn extract_first_pdf_page_text(
    document: &mut pdf_oxide::document::PdfDocument,
) -> Result<String, Box<dyn std::error::Error>> {
    const FIRST_PAGE: usize = 0;

    if document.page_count()? == 0 {
        return Ok(String::new());
    }

    let raw_text = document.extract_text(FIRST_PAGE)?;

    let options = pdf_oxide::converters::ConversionOptions::default();
    let markdown = document.to_markdown(FIRST_PAGE, &options)?;

    let has_headings = !pdf_markdown_heading_lines(&markdown).is_empty();
    if !has_headings {
        return Ok(raw_text);
    }

    // Only pay for the plain-text pipeline when there are headings to merge.
    let plain = document.to_plain_text(FIRST_PAGE, &options)?;
    let merged = merge_pdf_first_page_text(&raw_text, &markdown, &plain);
    Ok(merged)
}

/// Converts the whole `document` to plain text using default conversion
/// options, propagating any conversion error.
fn extract_pdf_text_from_document(
    document: &mut pdf_oxide::document::PdfDocument,
) -> Result<String, Box<dyn std::error::Error>> {
    let options = pdf_oxide::converters::ConversionOptions::default();
    let full_text = document.to_plain_text_all(&options)?;
    Ok(full_text)
}

/// Replaces every carriage return and form feed in `text` with a newline.
///
/// Returns `None` when the result is blank (empty or whitespace only). The
/// returned text is otherwise not trimmed.
fn normalize_pdf_text(text: String) -> Option<String> {
    let normalized: String = text
        .chars()
        .map(|ch| match ch {
            '\r' | '\u{0c}' => '\n',
            other => other,
        })
        .collect();

    if normalized.trim().is_empty() {
        None
    } else {
        Some(normalized)
    }
}

/// Combines the first page's plain-text output with headings recovered from
/// its markdown rendering.
///
/// - Blank `pipeline_text` yields an empty string.
/// - If the markdown has no usable headings, or the pipeline text already
///   contains them (compared case- and whitespace-insensitively),
///   `pipeline_text` is returned unchanged.
/// - Otherwise the headings are prepended, separated from the trimmed
///   pipeline text by a blank line.
///
/// `_extracted_text` is accepted for signature compatibility and is not used.
fn merge_pdf_first_page_text(
    _extracted_text: &str,
    markdown_text: &str,
    pipeline_text: &str,
) -> String {
    let trimmed_pipeline = pipeline_text.trim();
    if trimmed_pipeline.is_empty() {
        return String::new();
    }

    match pdf_first_page_heading_prefix(markdown_text) {
        Some(headings) if !pdf_text_contains_heading_prefix(trimmed_pipeline, &headings) => {
            format!("{headings}\n\n{trimmed_pipeline}")
        }
        _ => pipeline_text.to_string(),
    }
}

/// Reports whether `text` contains `prefix` once both have been normalized
/// with `normalize_pdf_heading_comparison_text`.
fn pdf_text_contains_heading_prefix(text: &str, prefix: &str) -> bool {
    let haystack = normalize_pdf_heading_comparison_text(text);
    let needle = normalize_pdf_heading_comparison_text(prefix);
    haystack.contains(needle.as_str())
}

/// Produces a comparison key for heading text: whitespace-separated words are
/// ASCII-lowercased and re-joined with single spaces.
fn normalize_pdf_heading_comparison_text(text: &str) -> String {
    let mut key = String::with_capacity(text.len());

    for word in text.split_whitespace() {
        if !key.is_empty() {
            key.push(' ');
        }
        key.push_str(&word.to_ascii_lowercase());
    }

    key
}

/// Builds the heading block for a first page from its markdown rendering.
///
/// Heading lines are de-duplicated in first-seen order and joined with
/// newlines. Returns `None` when the markdown has no usable headings.
fn pdf_first_page_heading_prefix(markdown_text: &str) -> Option<String> {
    let unique_headings = pdf_markdown_heading_lines(markdown_text).into_iter().fold(
        Vec::new(),
        |mut collected, heading| {
            push_unique_line(&mut collected, heading);
            collected
        },
    );

    if unique_headings.is_empty() {
        None
    } else {
        Some(unique_headings.join("\n"))
    }
}

/// Collects up to four heading titles from markdown `text`.
///
/// A heading is any line that, once trimmed, starts with `#`. Surrounding `#`
/// characters and whitespace are stripped from the title. Empty titles and
/// titles that look like numbered section headings are skipped and do not
/// count toward the limit.
fn pdf_markdown_heading_lines(text: &str) -> Vec<String> {
    const MAX_HEADINGS: usize = 4;

    let mut headings = Vec::new();

    for raw_line in text.lines() {
        if headings.len() == MAX_HEADINGS {
            break;
        }

        let Some(after_hash) = raw_line.trim().strip_prefix('#') else {
            continue;
        };

        let title = after_hash.trim_start().trim_matches('#').trim();
        if title.is_empty() || looks_like_numbered_section_heading(title) {
            continue;
        }

        headings.push(title.to_owned());
    }

    headings
}

/// Appends `line` to `lines` unless an identical entry is already present.
fn push_unique_line(lines: &mut Vec<String>, line: String) {
    if lines.contains(&line) {
        return;
    }
    lines.push(line);
}

/// Returns `true` when `line` starts with one or more ASCII digits that are
/// immediately followed by a `.`, e.g. `1. Definitions`, `10. Warranty` or
/// `12.3 Scope`.
///
/// Section numbers of any length are recognized; checking only a single
/// leading digit would let headings from section 10 onwards slip through as
/// if they were document titles.
fn looks_like_numbered_section_heading(line: &str) -> bool {
    let after_digits = line.trim_start_matches(|ch: char| ch.is_ascii_digit());

    // At least one digit must have been consumed before the dot.
    let has_leading_digits = after_digits.len() < line.len();
    has_leading_digits && after_digits.starts_with('.')
}

/// Reports whether `bytes` begins with one of the three recognized `PK`
/// ZIP magic prefixes.
fn is_zip_archive(bytes: &[u8]) -> bool {
    const ZIP_SIGNATURES: [&[u8]; 3] = [b"PK\x03\x04", b"PK\x05\x06", b"PK\x07\x08"];

    ZIP_SIGNATURES
        .iter()
        .any(|signature| bytes.starts_with(signature))
}

pub fn extract_printable_strings(bytes: &[u8]) -> String {
    const MIN_LEN: usize = 4;
    const MIN_OUTPUT_BYTES: usize = 2_000_000;
    const MAX_OUTPUT_BYTES_CAP: usize = 16_000_000;

    let max_output_bytes = bytes.len().clamp(MIN_OUTPUT_BYTES, MAX_OUTPUT_BYTES_CAP);

    fn is_printable_ascii(b: u8) -> bool {
        matches!(b, 0x20..=0x7E)
    }

    let mut out = String::new();
    let mut run: Vec<u8> = Vec::new();

    let flush_run = |out: &mut String, run: &mut Vec<u8>| {
        if run.len() >= MIN_LEN {
            if !out.is_empty() {
                out.push('\n');
            }
            out.push_str(&String::from_utf8_lossy(run));
        }
        run.clear();
    };

    for &b in bytes {
        if is_printable_ascii(b) {
            run.push(b);
        } else {
            flush_run(&mut out, &mut run);
            if out.len() >= max_output_bytes {
                return out;
            }
        }
    }
    flush_run(&mut out, &mut run);
    if out.len() >= max_output_bytes {
        return out;
    }

    for start in 0..=1 {
        run.clear();
        let mut i = start;
        while i + 1 < bytes.len() {
            let b0 = bytes[i];
            let b1 = bytes[i + 1];
            let (ch, zero) = if start == 0 { (b0, b1) } else { (b1, b0) };
            if is_printable_ascii(ch) && zero == 0 {
                run.push(ch);
            } else {
                flush_run(&mut out, &mut run);
                if out.len() >= max_output_bytes {
                    return out;
                }
            }
            i += 2;
        }
        flush_run(&mut out, &mut run);
        if out.len() >= max_output_bytes {
            return out;
        }
    }

    out
}

#[cfg(test)]
mod tests {
    use image::ImageFormat;
    use std::path::Path;

    use crate::copyright::detect_copyrights;

    use super::{
        ExtractedTextKind, LARGE_OPAQUE_BINARY_SKIP_BYTES, MAX_PDF_TEXT_EXTRACTION_BYTES,
        MAX_XMP_PACKET_BYTES, classify_file_info, extract_printable_strings,
        extract_raw_xmp_packet, extract_text_for_detection,
        extract_text_for_detection_with_diagnostics, format_metadata_field, format_xmp_value,
        is_non_actionable_pdf_failure, normalize_mime_type, normalize_pdf_heading_comparison_text,
        values_to_text, windows_metadata_or_empty_result,
    };

    fn png_chunk(chunk_type: &[u8; 4], data: &[u8]) -> Vec<u8> {
        let mut out = Vec::new();
        out.extend_from_slice(&(data.len() as u32).to_be_bytes());
        out.extend_from_slice(chunk_type);
        out.extend_from_slice(data);
        out.extend_from_slice(&0u32.to_be_bytes());
        out
    }

    fn build_png_with_xmp(xmp: &str) -> Vec<u8> {
        let mut bytes = Vec::new();
        bytes.extend_from_slice(b"\x89PNG\r\n\x1a\n");

        let ihdr = [
            0, 0, 0, 1, // width
            0, 0, 0, 1, // height
            8, // bit depth
            2, // color type
            0, // compression
            0, // filter
            0, // interlace
        ];
        bytes.extend_from_slice(&png_chunk(b"IHDR", &ihdr));

        let mut itxt = Vec::new();
        itxt.extend_from_slice(b"XML:com.adobe.xmp");
        itxt.push(0); // keyword terminator
        itxt.push(0); // compression flag
        itxt.push(0); // compression method
        itxt.push(0); // language tag terminator
        itxt.push(0); // translated keyword terminator
        itxt.extend_from_slice(xmp.as_bytes());
        bytes.extend_from_slice(&png_chunk(b"iTXt", &itxt));

        bytes.extend_from_slice(&png_chunk(b"IEND", &[]));
        bytes
    }

    #[test]
    fn test_extract_text_for_detection_skips_jar_archives() {
        let path = Path::new(
            "testdata/license-golden/datadriven/lic1/do-not_detect-licenses-in-archive.jar",
        );
        let bytes = std::fs::read(path).expect("failed to read jar fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }

    #[test]
    fn test_extract_text_for_detection_reads_pdf_fixture_text() {
        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);
        assert!(text.contains("Redistribution and use in source and binary forms"));
    }

    #[test]
    fn test_extract_text_for_detection_prefers_first_pdf_page_before_full_document() {
        let path =
            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);
        assert!(text.contains("SUN INDUSTRY STANDARDS SOURCE LICENSE"));
        assert!(!text.contains("DISCLAIMER OF WARRANTY"));
    }

    #[test]
    fn test_extract_text_for_detection_does_not_duplicate_pdf_heading_prefix() {
        let path =
            Path::new("testdata/license-golden/datadriven/lic4/should_detect_something_5.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);

        let normalized = normalize_pdf_heading_comparison_text(&text);
        let heading =
            normalize_pdf_heading_comparison_text("SUN INDUSTRY STANDARDS SOURCE LICENSE");
        assert_eq!(normalized.matches(&heading).count(), 1);
    }

    #[test]
    fn test_extract_text_for_detection_reads_pdf_fixture_without_pdf_extension() {
        let path = Path::new("testdata/license-golden/datadriven/lic2/bsd-new_156.pdf");
        let bytes = std::fs::read(path).expect("failed to read pdf fixture");

        let (text, kind) = extract_text_for_detection(Path::new("renamed.bin"), &bytes);

        assert_eq!(kind, ExtractedTextKind::Pdf);
        assert!(text.contains("Redistribution and use in source and binary forms"));
    }

    #[test]
    fn test_extract_text_for_detection_skips_oversized_pdf_payload() {
        let mut bytes = b"%PDF-1.7\n".to_vec();
        bytes.resize(MAX_PDF_TEXT_EXTRACTION_BYTES + 1, b'0');

        let (text, kind, scan_error) =
            extract_text_for_detection_with_diagnostics(Path::new("oversized.pdf"), &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
        assert!(
            scan_error
                .as_deref()
                .is_some_and(|message| message.contains("PDF text extraction skipped"))
        );
    }

    #[test]
    fn test_extract_text_for_detection_reports_terminal_pdf_failure() {
        let malformed = b"%PDF-1.7\nthis is not a valid pdf object graph\n";

        let (text, kind, scan_error) =
            extract_text_for_detection_with_diagnostics(Path::new("broken.pdf"), malformed);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
        let scan_error = scan_error.expect("terminal pdf failure should be surfaced");
        assert!(scan_error.contains("PDF text extraction failed after"));
    }

    #[test]
    fn test_extract_text_for_detection_skips_large_opaque_binary_blobs() {
        let bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];

        let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }

    #[test]
    fn test_extract_text_for_detection_keeps_large_binaries_with_promising_strings() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
        let text = b"Copyright 2026 Example Project!!!";
        bytes[..text.len()].copy_from_slice(text);
        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
        bytes[second_offset..second_offset + text.len()].copy_from_slice(text);

        let (text, kind) = extract_text_for_detection(Path::new("weights.bin"), &bytes);

        assert_ne!(kind, ExtractedTextKind::None);
        assert!(text.contains("Copyright 2026 Example Project"));
    }

    #[test]
    fn test_extract_text_for_detection_skips_large_binary_with_unstructured_runs() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
        let noise = b"(c) $1234567890ABCDEF[]{}--==++";
        bytes[..noise.len()].copy_from_slice(noise);
        let second_offset = LARGE_OPAQUE_BINARY_SKIP_BYTES / 2;
        bytes[second_offset..second_offset + noise.len()].copy_from_slice(noise);

        let (text, kind) = extract_text_for_detection(Path::new("tensor.bin"), &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }

    #[test]
    fn test_extract_text_for_detection_uses_windows_executable_metadata() {
        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
        let bytes = std::fs::read(path).expect("read PE fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
        assert!(text.contains("License: This program is free software"));
        assert!(text.contains("LegalCopyright:"));
    }

    #[test]
    fn test_extract_text_for_detection_keeps_windows_metadata_for_large_pe_without_sampled_signal()
    {
        let path = Path::new("testdata/compiled-binary-golden/win_pe/libiconv2.dll");
        let mut bytes = std::fs::read(path).expect("read PE fixture");
        bytes.resize(LARGE_OPAQUE_BINARY_SKIP_BYTES + 8, 0);

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_ne!(kind, ExtractedTextKind::None);
        assert!(!text.trim().is_empty());
    }

    #[test]
    fn test_windows_metadata_or_empty_result_preserves_metadata() {
        let (text, kind, scan_error) =
            windows_metadata_or_empty_result(Some("LegalCopyright: Example Corp".to_string()));

        assert_eq!(kind, ExtractedTextKind::WindowsExecutableMetadata);
        assert_eq!(text, "LegalCopyright: Example Corp");
        assert!(scan_error.is_none());
    }

    #[test]
    fn test_format_xmp_value_labels_creator_and_title_fields() {
        assert_eq!(
            format_xmp_value("creator", "Chinmay Garde"),
            "Author: Chinmay Garde"
        );
        assert_eq!(
            format_xmp_value("title", "Bay Bridge At Night"),
            "Title: Bay Bridge At Night"
        );
        assert_eq!(
            format_xmp_value("description", "Embarcadero in the evening on Delta 3200"),
            "Description: Embarcadero in the evening on Delta 3200"
        );
    }

    #[test]
    fn test_format_metadata_field_prefixes_exif_text() {
        assert_eq!(
            format_metadata_field("Author", "Chinmay Garde"),
            "Author: Chinmay Garde"
        );
        assert_eq!(
            format_metadata_field("Description", "Bay Bridge At Night"),
            "Description: Bay Bridge At Night"
        );
    }

    #[test]
    fn test_extract_text_for_detection_keeps_image_author_separate_from_title_and_description() {
        let xmp = r#"<x:xmpmeta xmlns:x="adobe:ns:meta/"><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"><rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:creator>Chinmay Garde</dc:creator><dc:title>Bay Bridge At Night</dc:title><dc:description>Embarcadero in the evening on Delta 3200</dc:description></rdf:Description></rdf:RDF></x:xmpmeta>"#;
        let bytes = build_png_with_xmp(xmp);

        let (text, kind) = extract_text_for_detection(Path::new("fixture.png"), &bytes);

        assert_eq!(kind, ExtractedTextKind::ImageMetadata);
        assert!(text.contains("Author: Chinmay Garde"), "text: {text:?}");
        assert!(
            text.contains("Title: Bay Bridge At Night"),
            "text: {text:?}"
        );
        assert!(
            text.contains("Description: Embarcadero in the evening on Delta 3200"),
            "text: {text:?}"
        );

        let (_copyrights, _holders, authors) = detect_copyrights(&text, None);
        assert_eq!(
            authors
                .iter()
                .map(|a| a.author.as_str())
                .collect::<Vec<_>>(),
            vec!["Chinmay Garde"],
            "authors: {authors:?}; text: {text:?}"
        );
    }

    #[test]
    fn test_values_to_text_suppresses_bare_copyright_duplicate_of_author() {
        let text = values_to_text(vec![
            "Author: Chinmay Garde".to_string(),
            "Copyright: Chinmay Garde".to_string(),
            "Title: Bay Bridge At Night".to_string(),
        ]);

        assert!(text.contains("Author: Chinmay Garde"), "text: {text:?}");
        assert!(
            text.contains("Title: Bay Bridge At Night"),
            "text: {text:?}"
        );
        assert!(!text.contains("Copyright: Chinmay Garde"), "text: {text:?}");
    }

    #[test]
    fn test_extract_text_for_detection_skips_large_binary_with_single_isolated_string_run() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
        let text = b"Copyright 2026 Example Project!!!";
        bytes[..text.len()].copy_from_slice(text);

        let (text, kind) = extract_text_for_detection(Path::new("opaque.bin"), &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }

    #[test]
    fn test_extract_text_for_detection_keeps_large_binary_with_single_contact_rich_window() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES + 8];
        let text = b"Andreas Schneider <asn@redhat.com> Rob Crittenden (rcritten@redhat.com) Mr. Sam <sam@email-scan.com> https://publicsuffix.org/ http://tukaani.org/xz/";
        bytes[..text.len()].copy_from_slice(text);

        let (text, kind) = extract_text_for_detection(Path::new("rootfs.bin"), &bytes);

        assert_ne!(kind, ExtractedTextKind::None);
        assert!(text.contains("asn@redhat.com"));
        assert!(text.contains("https://publicsuffix.org/"));
    }

    #[test]
    fn test_extract_text_for_detection_keeps_large_macho_with_off_window_legal_markers() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES * 2];
        bytes[..4].copy_from_slice(&[0xCF, 0xFA, 0xED, 0xFE]);
        let apache_notice = b"// Licensed under the Apache License, Version 2.0 (the \"License\");\n// http://www.apache.org/licenses/LICENSE-2.0\n// SPDX-License-Identifier: Apache-2.0\n";
        let insert_offset = 200 * 1024;
        bytes[insert_offset..insert_offset + apache_notice.len()].copy_from_slice(apache_notice);

        let (text, kind) = extract_text_for_detection(Path::new("node"), &bytes);

        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
        assert!(text.contains("Apache License, Version 2.0"), "{text}");
        assert!(
            text.contains("SPDX-License-Identifier: Apache-2.0"),
            "{text}"
        );
    }

    #[test]
    fn test_extract_text_for_detection_keeps_large_macho_with_unicode_notice_markers() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES * 2];
        bytes[..4].copy_from_slice(&[0xCF, 0xFA, 0xED, 0xFE]);
        let unicode_notice = b"Copyright (c) 1991-2024 Unicode, Inc.\nFor terms of use, see http://www.unicode.org/copyright.html\n";
        let insert_offset = 700 * 1024;
        bytes[insert_offset..insert_offset + unicode_notice.len()].copy_from_slice(unicode_notice);

        let (text, kind) = extract_text_for_detection(Path::new("node"), &bytes);

        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
        assert!(text.contains("Unicode, Inc."), "{text}");
        assert!(text.contains("unicode.org/copyright.html"), "{text}");
    }

    #[test]
    fn test_extract_text_for_detection_does_not_reopen_single_window_legal_noise_for_non_macho() {
        let mut bytes = vec![0_u8; LARGE_OPAQUE_BINARY_SKIP_BYTES * 2];
        let apache_notice = b"// Licensed under the Apache License, Version 2.0 (the \"License\");\n// http://www.apache.org/licenses/LICENSE-2.0\n// SPDX-License-Identifier: Apache-2.0\n";
        let insert_offset = 200 * 1024;
        bytes[insert_offset..insert_offset + apache_notice.len()].copy_from_slice(apache_notice);

        let (text, kind) = extract_text_for_detection(Path::new("model.bin"), &bytes);

        assert!(text.is_empty());
        assert_eq!(kind, ExtractedTextKind::None);
    }

    #[test]
    fn test_extract_text_for_detection_avoids_latin1_decode_for_binary_blob_noise() {
        let bytes = vec![
            0x28, 0x63, 0x29, 0x20, 0x4b, 0x30, 0x0e, 0x71, 0x86, 0x20, 0x62, 0x24, 0x4c,
        ];

        let (text, kind) = extract_text_for_detection(Path::new("fixture.blb"), &bytes);

        assert_eq!(kind, ExtractedTextKind::BinaryStrings);
        assert_eq!(text, "(c) K0\n b$L");
    }

    #[test]
    fn test_extract_raw_xmp_packet_rejects_oversized_png_itxt_payload() {
        let xmp = "A".repeat(MAX_XMP_PACKET_BYTES + 1);
        let bytes = build_png_with_xmp(&xmp);

        assert!(extract_raw_xmp_packet(&bytes, ImageFormat::Png).is_none());
    }

    #[test]
    fn test_non_actionable_pdf_failures_are_suppressed() {
        assert!(is_non_actionable_pdf_failure(&[
            "from-bytes first-page: PDF is encrypted and requires a password".to_string(),
            "open full-document: PDF is encrypted and requires a password".to_string(),
        ]));
        assert!(is_non_actionable_pdf_failure(&[
            "from-bytes first-page: Invalid cross-reference table".to_string(),
            "open full-document: Invalid cross-reference table".to_string(),
        ]));
        assert!(is_non_actionable_pdf_failure(&[
            "from-bytes first-page: Invalid PDF: Encrypt dictionary missing /O".to_string(),
            "open full-document: Invalid PDF: security handler cannot be found".to_string(),
        ]));
        assert!(!is_non_actionable_pdf_failure(&[
            "from-bytes first-page: some other parser failure".to_string(),
        ]));
    }

    #[test]
    fn test_extract_text_for_detection_skips_zip_like_archives() {
        let zip_bytes = b"PK\x03\x04\x14\x00\x00\x00\x08\x00artifact";

        let (whl_text, whl_kind) = extract_text_for_detection(Path::new("demo.whl"), zip_bytes);
        let (crate_text, crate_kind) =
            extract_text_for_detection(Path::new("demo.crate"), zip_bytes);

        assert!(whl_text.is_empty());
        assert_eq!(whl_kind, ExtractedTextKind::None);
        assert!(crate_text.is_empty());
        assert_eq!(crate_kind, ExtractedTextKind::None);
    }

    #[test]
    fn test_extract_text_for_detection_keeps_binary_strings_for_lib_fixtures() {
        let path =
            Path::new("testdata/copyright-golden/copyrights/copyright_php_lib-php_embed_lib.lib");
        let bytes = std::fs::read(path).expect("failed to read lib fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_ne!(kind, ExtractedTextKind::None);
        assert!(text.contains("Copyright nexB and others (c) 2012"));
    }

    #[test]
    fn test_extract_text_for_detection_reads_font_metadata() {
        let path = Path::new("testdata/font-fixtures/Lato-Bold.ttf");
        let bytes = std::fs::read(path).expect("failed to read font fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::FontMetadata);
        assert!(text.contains("License Description:"), "{text}");
        assert!(
            text.contains("Open Font License") || text.contains("OFL"),
            "{text}"
        );
        assert!(text.contains("Lato"), "{text}");
    }

    #[test]
    fn test_extract_printable_strings_scales_cap_for_medium_binary_files() {
        let bytes = b"abcd\0".repeat(525_000);

        let text = extract_printable_strings(&bytes);

        assert!(
            text.len() > 2_000_000,
            "unexpected truncation at {}",
            text.len()
        );
        assert!(text.ends_with("abcd"));
    }

    #[test]
    fn test_extract_text_for_detection_decodes_svg_fixture_text() {
        let path = Path::new(
            "testdata/license-golden/datadriven/external/fossology-tests/Public-domain/biohazard.svg",
        );
        let bytes = std::fs::read(path).expect("failed to read svg fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Decoded);
        assert!(text.contains("creativecommons.org/licenses/publicdomain"));
    }

    #[test]
    fn test_extract_text_for_detection_preserves_blank_comment_lines_in_utf8_source() {
        let path = Path::new("testdata/plugin_email_url/files/IMarkerActionFilter.java");
        let bytes = std::fs::read(path).expect("failed to read java fixture");

        let (text, kind) = extract_text_for_detection(path, &bytes);

        assert_eq!(kind, ExtractedTextKind::Decoded);
        let lines: Vec<_> = text.lines().collect();
        assert_eq!(lines.get(2).copied(), Some(" *"));
        assert_eq!(
            lines.get(3).copied(),
            Some(" *https://github.com/rpm-software-management")
        );
        assert_eq!(lines.get(5).copied(), Some("https://gitlab.com/Conan_Kudo"));
    }

    /// Reads the LGPL `License.rtf` fixture and checks that extraction reports
    /// `Decoded` and that the text contains each expected licence phrase.
    #[test]
    fn test_extract_text_for_detection_decodes_rtf_fixture_text() {
        let fixture = Path::new(
            "testdata/license-golden/datadriven/external/fossology-tests/LGPL/License.rtf",
        );
        let contents = std::fs::read(fixture).expect("failed to read rtf fixture");

        let (decoded, extraction_kind) = extract_text_for_detection(fixture, &contents);

        assert_eq!(extraction_kind, ExtractedTextKind::Decoded);
        for needle in ["GNU Lesser General Public", "version", "2.1 of the License"] {
            assert!(decoded.contains(needle), "{needle}");
        }
    }

    /// A `.ts` path with ASCII TypeScript content and a `video/mp2t` guess must
    /// be normalized to `text/plain`.
    #[test]
    fn test_normalize_mime_type_prefers_text_for_textual_video_guess() {
        let path = Path::new("main.ts");
        let content = b"export const answer = 42;\n";

        let normalized = normalize_mime_type(path, content, Some("TypeScript"), "video/mp2t");

        assert_eq!(normalized, "text/plain");
    }

    /// A `.js` path with ASCII JavaScript content and an
    /// `application/octet-stream` guess must be normalized to `text/plain`.
    #[test]
    fn test_normalize_mime_type_prefers_text_for_octet_stream_source_guess() {
        let path = Path::new("main.js");
        let content = b"console.log('hello');\n";

        let normalized =
            normalize_mime_type(path, content, Some("JavaScript"), "application/octet-stream");

        assert_eq!(normalized, "text/plain");
    }

    /// When the content of a `.ts` path contains NUL and other non-text bytes,
    /// the `video/mp2t` guess must be returned unchanged.
    #[test]
    fn test_normalize_mime_type_preserves_binary_video_guess() {
        let path = Path::new("main.ts");
        let content: &[u8] = &[0, 159, 146, 150, 0, 1, 2, 3];

        let normalized = normalize_mime_type(path, content, Some("TypeScript"), "video/mp2t");

        assert_eq!(normalized, "video/mp2t");
    }

    /// A four-byte non-text payload on a `.ts` path must keep its
    /// `application/octet-stream` guess.
    #[test]
    fn test_normalize_mime_type_preserves_short_binary_octet_stream_guess() {
        let path = Path::new("main.ts");
        let content: &[u8] = &[0, 159, 146, 150];

        let normalized =
            normalize_mime_type(path, content, Some("TypeScript"), "application/octet-stream");

        assert_eq!(normalized, "application/octet-stream");
    }

    /// A zero-byte `test.txt` is reported as `inode/x-empty` / `empty`, counts as
    /// text rather than binary, and has neither a language nor the source flag.
    #[test]
    fn test_classify_file_info_marks_empty_files_as_text_not_source() {
        let empty = classify_file_info(Path::new("test.txt"), b"");

        // Type labels.
        assert_eq!(empty.mime_type, "inode/x-empty");
        assert_eq!(empty.file_type, "empty");

        // Content flags.
        assert!(!empty.is_binary);
        assert!(empty.is_text);
        assert!(!empty.is_source);
        assert_eq!(empty.programming_language, None);
    }

    /// Valid JSON in `package.json` is labelled as JSON text, but is not source
    /// code and gets no programming language.
    #[test]
    fn test_classify_file_info_keeps_json_out_of_programming_language() {
        let manifest = br#"{"name":"demo"}"#;

        let info = classify_file_info(Path::new("package.json"), manifest);

        assert_eq!(info.mime_type, "application/json");
        assert_eq!(info.file_type, "JSON text data");
        assert!(info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.programming_language, None);
    }

    /// Text that does not parse as JSON falls back to the plain-text labels even
    /// though the file name ends in `.json`.
    #[test]
    fn test_classify_file_info_does_not_label_invalid_json_text_as_json() {
        let malformed = b"{ definitely not json\n";

        let info = classify_file_info(Path::new("broken.json"), malformed);

        assert_eq!(info.mime_type, "text/plain");
        assert_eq!(info.file_type, "UTF-8 Unicode text");
        assert!(info.is_text);
        assert!(!info.is_binary);
    }

    /// Four non-text bytes in a `.json` file are classified as generic binary
    /// data rather than JSON.
    #[test]
    fn test_classify_file_info_does_not_label_binary_json_garbage_as_json() {
        let garbage: &[u8] = &[0xff, 0x00, 0x01, 0x02];

        let info = classify_file_info(Path::new("broken.json"), garbage);

        assert_eq!(info.mime_type, "application/octet-stream");
        assert_eq!(info.file_type, "data");
        assert!(info.is_binary);
        assert!(!info.is_text);
    }

    /// The payload is the `FF FE` byte-order mark followed by `["é"]` as
    /// little-endian 16-bit code units; it must be classified as JSON text.
    #[test]
    fn test_classify_file_info_treats_valid_utf16_json_with_bom_as_text() {
        let utf16le_with_bom: &[u8] = &[
            0xFF, 0xFE, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D, 0x00,
        ];

        let info = classify_file_info(Path::new("utf16.json"), utf16le_with_bom);

        assert!(!info.is_binary);
        assert!(info.is_text);
        assert_eq!(info.mime_type, "application/json");
        assert_eq!(info.file_type, "JSON text data");
    }

    /// The payload is `["é"]` as big-endian 16-bit code units with no byte-order
    /// mark; it must still be classified as JSON text.
    #[test]
    fn test_classify_file_info_treats_valid_utf16be_json_without_bom_as_text() {
        let utf16be_no_bom: &[u8] = &[0x00, 0x5B, 0x00, 0x22, 0x00, 0xE9, 0x00, 0x22, 0x00, 0x5D];

        let info = classify_file_info(Path::new("utf16be.json"), utf16be_no_bom);

        assert!(!info.is_binary);
        assert!(info.is_text);
        assert_eq!(info.mime_type, "application/json");
        assert_eq!(info.file_type, "JSON text data");
    }

    /// The four bytes `00 5B 00 5D` spell `[]` as big-endian 16-bit code units;
    /// even this tiny payload must be classified as JSON text.
    #[test]
    fn test_classify_file_info_treats_small_valid_utf16be_json_literal_as_text() {
        let empty_array_utf16be: &[u8] = &[0x00, 0x5B, 0x00, 0x5D];

        let info = classify_file_info(Path::new("utf16be-literal.json"), empty_array_utf16be);

        assert!(!info.is_binary);
        assert!(info.is_text);
        assert_eq!(info.mime_type, "application/json");
        assert_eq!(info.file_type, "JSON text data");
    }

    /// Builds a payload from `CORRUPTED_UTF16_BOM_PREFIX` followed by an Apache
    /// notice encoded as big-endian UTF-16 code units, and checks that both
    /// lines of the notice come back as `Decoded` text.
    #[test]
    fn test_extract_text_for_detection_decodes_utf16be_text_with_corrupted_bom_prefix() {
        let notice = "Licensed to the Apache Software Foundation\nApache License, Version 2.0";

        let mut payload = super::CORRUPTED_UTF16_BOM_PREFIX.to_vec();
        // Append every UTF-16 code unit with its most significant byte first.
        payload.extend(notice.encode_utf16().flat_map(u16::to_be_bytes));

        let (decoded, extraction_kind) =
            extract_text_for_detection(Path::new("notice.ftl"), &payload);

        assert_eq!(extraction_kind, ExtractedTextKind::Decoded);
        assert!(decoded.contains("Apache Software Foundation"), "{decoded}");
        assert!(decoded.contains("Apache License, Version 2.0"), "{decoded}");
    }

    /// The bare JSON literal `true` in a `.json` file is classified as JSON text.
    #[test]
    fn test_classify_file_info_treats_small_valid_json_literals_as_text() {
        let literal = b"true";

        let info = classify_file_info(Path::new("true.json"), literal);

        assert!(!info.is_binary);
        assert!(info.is_text);
        assert_eq!(info.mime_type, "application/json");
        assert_eq!(info.file_type, "JSON text data");
    }

    /// The payload is `["` … `"]` wrapped around multi-byte sequences that end in
    /// a stray `0xFA` byte. It must be classified as text, but with the
    /// plain-text labels rather than the JSON ones.
    #[test]
    fn test_classify_file_info_treats_json_wrapped_invalid_utf8_sequences_as_text() {
        let wrapped: &[u8] = &[0x5B, 0x22, 0xE6, 0x97, 0xA5, 0xD1, 0x88, 0xFA, 0x22, 0x5D];

        let info = classify_file_info(Path::new("wrapped.json"), wrapped);

        assert!(!info.is_binary);
        assert!(info.is_text);
        assert_eq!(info.mime_type, "text/plain");
        assert_eq!(info.file_type, "text, with no line terminators");
    }

    /// The payload is `["` + a single `0xFF` byte + `"]`; it must be classified
    /// as generic binary data.
    #[test]
    fn test_classify_file_info_keeps_lone_ff_json_byte_binary() {
        let lone_ff: &[u8] = &[0x5B, 0x22, 0xFF, 0x22, 0x5D];

        let info = classify_file_info(Path::new("lone-ff.json"), lone_ff);

        assert!(info.is_binary);
        assert!(!info.is_text);
        assert_eq!(info.mime_type, "application/octet-stream");
        assert_eq!(info.file_type, "data");
    }

    /// A payload that starts with `FE 90`, three NUL bytes and `93`, then ends in
    /// the ASCII text `[[286`, must be classified as binary.
    #[test]
    fn test_classify_file_info_keeps_nul_heavy_crash_json_binary() {
        let crash_input: &[u8] = &[
            0xFE, 0x90, 0x00, 0x00, 0x00, 0x93, 0x5B, 0x5B, 0x32, 0x38, 0x36,
        ];

        let info = classify_file_info(Path::new("crash.json"), crash_input);

        assert!(info.is_binary);
        assert!(!info.is_text);
        assert_eq!(info.mime_type, "application/octet-stream");
    }

    /// A file named `Dockerfile` gets the `Dockerfile` language, is flagged as
    /// source but not as a script, and carries the matching file-type label.
    #[test]
    fn test_classify_file_info_treats_dockerfile_as_source() {
        let info = classify_file_info(Path::new("Dockerfile"), b"FROM scratch\n");

        assert_eq!(info.programming_language.as_deref(), Some("Dockerfile"));
        assert!(info.is_source);
        assert!(!info.is_script);
        assert_eq!(info.file_type, "Dockerfile source, UTF-8 Unicode text");
    }

    /// A `Makefile` is plain text: no language, and neither the source nor the
    /// script flag is set.
    #[test]
    fn test_classify_file_info_treats_makefile_as_text_not_source() {
        let recipe = b"all:\n\techo hi\n";

        let info = classify_file_info(Path::new("Makefile"), recipe);

        assert_eq!(info.programming_language, None);
        assert!(info.is_text);
        assert!(!info.is_source);
        assert!(!info.is_script);
        assert_eq!(info.file_type, "UTF-8 Unicode text");
    }

    /// `.egg` and `.nupkg` files that start with the `PK\x03\x04` signature are
    /// both reported as Zip archives.
    #[test]
    fn test_classify_file_info_marks_supported_package_archives() {
        let zip_header = b"PK\x03\x04\x14\x00\x00\x00";

        // Same expectations for every package flavour.
        for name in ["demo.egg", "demo.nupkg"] {
            let info = classify_file_info(Path::new(name), zip_header);

            assert!(info.is_archive, "{name}");
            assert_eq!(info.mime_type, "application/zip", "{name}");
            assert_eq!(info.file_type, "Zip archive data", "{name}");
        }
    }

    /// A PNG signature followed by the start of an `IHDR` chunk is classified as
    /// binary media, and as nothing else.
    #[test]
    fn test_classify_file_info_marks_png_as_binary_media() {
        let png_header = b"\x89PNG\r\n\x1a\n\x00\x00\x00\x0dIHDR";

        let info = classify_file_info(Path::new("logo.png"), png_header);

        assert_eq!(info.mime_type, "image/png");
        assert_eq!(info.file_type, "PNG image data");

        // Flags that must be set ...
        assert!(info.is_binary);
        assert!(!info.is_text);
        assert!(info.is_media);
        // ... and flags that must stay clear.
        assert!(!info.is_archive);
        assert!(!info.is_source);
    }

    /// Content beginning with `%PDF-1.7` is classified as a binary PDF document
    /// that is neither an archive nor media.
    #[test]
    fn test_classify_file_info_marks_pdf_as_binary_document() {
        let pdf_header = b"%PDF-1.7\n1 0 obj\n<< /Type /Catalog >>\n";

        let info = classify_file_info(Path::new("report.pdf"), pdf_header);

        assert_eq!(info.mime_type, "application/pdf");
        assert_eq!(info.file_type, "PDF document");
        assert!(info.is_binary);
        assert!(!info.is_text);
        assert!(!info.is_archive);
        assert!(!info.is_media);
    }

    /// An arbitrary non-text byte blob is binary, not text, not source, and has
    /// no programming language.
    #[test]
    fn test_classify_file_info_marks_binary_blobs_as_binary() {
        let blob: &[u8] = &[0, 159, 146, 150, 0, 1, 2, 3, 4, 5];

        let info = classify_file_info(Path::new("blob.bin"), blob);

        assert!(info.is_binary);
        assert!(!info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.programming_language, None);
    }

    /// A `.yaml` file is labelled `YAML text data`; it is text, not source, and
    /// has no programming language.
    #[test]
    fn test_classify_file_info_treats_yaml_as_text_not_source() {
        let document = b"key: value\n";

        let info = classify_file_info(Path::new("config.yaml"), document);

        assert_eq!(info.programming_language, None);
        assert!(info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.file_type, "YAML text data");
    }

    /// Covers four build-related files: Gradle, Nix and CMake inputs are source
    /// with a language-specific file type, while `.gitmodules` is configuration
    /// text with no language.
    #[test]
    fn test_classify_file_info_classifies_common_build_manifests() {
        // Gradle build script -> Groovy.
        let gradle_info = classify_file_info(Path::new("build.gradle"), b"plugins { id 'java' }\n");
        assert_eq!(gradle_info.programming_language.as_deref(), Some("Groovy"));
        assert!(gradle_info.is_source);
        assert_eq!(gradle_info.mime_type, "text/plain");
        assert_eq!(gradle_info.file_type, "Groovy source, UTF-8 Unicode text");

        // Nix flake -> Nix.
        let flake_info = classify_file_info(Path::new("flake.nix"), b"{ inputs, ... }: {}\n");
        assert_eq!(flake_info.programming_language.as_deref(), Some("Nix"));
        assert!(flake_info.is_source);
        assert_eq!(flake_info.mime_type, "text/plain");
        assert_eq!(flake_info.file_type, "Nix source, UTF-8 Unicode text");

        // CMake toolchain file -> CMake.
        let cmake_info = classify_file_info(
            Path::new("toolchain.cmake"),
            b"set(CMAKE_CXX_STANDARD 20)\n",
        );
        assert_eq!(cmake_info.programming_language.as_deref(), Some("CMake"));
        assert!(cmake_info.is_source);
        assert_eq!(cmake_info.file_type, "CMake source, UTF-8 Unicode text");

        // .gitmodules -> configuration text, not source.
        let gitmodules_info = classify_file_info(
            Path::new(".gitmodules"),
            b"[submodule \"demo\"]\n\tpath = vendor/demo\n",
        );
        assert_eq!(gitmodules_info.programming_language, None);
        assert!(gitmodules_info.is_text);
        assert!(!gitmodules_info.is_source);
        assert_eq!(gitmodules_info.file_type, "Git configuration text");
    }

    /// A `.hpp` header is classified as C++ source, whereas an `.ipp` file with
    /// template code stays plain text with no language.
    #[test]
    fn test_classify_file_info_labels_cpp_headers_and_ipp_separately() {
        // `.hpp`: recognised as C++ source, not a script.
        let hpp_info = classify_file_info(
            Path::new("include/demo.hpp"),
            b"#pragma once\nclass Demo {};\n",
        );
        assert_eq!(hpp_info.programming_language.as_deref(), Some("C++"));
        assert!(hpp_info.is_source);
        assert!(!hpp_info.is_script);
        assert_eq!(hpp_info.file_type, "C++ source, UTF-8 Unicode text");

        // `.ipp`: no language, neither source nor script.
        let ipp_info = classify_file_info(
            Path::new("include/detail/demo.ipp"),
            b"template <class T> void parse() {}\n",
        );
        assert_eq!(ipp_info.programming_language, None);
        assert!(!ipp_info.is_source);
        assert!(!ipp_info.is_script);
        assert_eq!(ipp_info.file_type, "UTF-8 Unicode text");
    }

    /// An extensionless file with an `env bash` shebang is labelled specifically
    /// as Bash, both in the language and in the file type.
    #[test]
    fn test_classify_file_info_preserves_specific_shell_family_labels() {
        let script = b"#!/usr/bin/env bash\necho hi\n";

        let info = classify_file_info(Path::new("bin/run"), script);

        assert_eq!(info.programming_language.as_deref(), Some("Bash"));
        assert!(info.is_script);
        assert_eq!(info.file_type, "bash script, UTF-8 Unicode text executable");
    }

    /// A file named `Jamfile` gets the `Jamfile` language and is flagged as
    /// source, not as a script.
    #[test]
    fn test_classify_file_info_marks_jamfile_as_source() {
        let rules = b"lib boost_json ;\n";

        let info = classify_file_info(Path::new("Jamfile"), rules);

        assert_eq!(info.programming_language.as_deref(), Some("Jamfile"));
        assert!(info.is_source);
        assert!(!info.is_script);
        assert_eq!(info.file_type, "Jamfile source, UTF-8 Unicode text");
    }

    /// An extensionless file with an `env node` shebang is classified as a
    /// JavaScript script.
    #[test]
    fn test_classify_file_info_labels_javascript_shebang_scripts() {
        let script = b"#!/usr/bin/env node\nconsole.log('hello');\n";

        let info = classify_file_info(Path::new("bin/run"), script);

        assert_eq!(info.programming_language.as_deref(), Some("JavaScript"));
        assert!(info.is_script);
        assert_eq!(
            info.file_type,
            "javascript script, UTF-8 Unicode text executable"
        );
    }

    /// A Python script containing the single byte `0xE9` is still a Python
    /// script, but its file type drops the `UTF-8 Unicode` wording.
    #[test]
    fn test_classify_file_info_uses_non_utf8_text_labels_for_latin1_scripts() {
        let latin1_script = b"# coding: latin-1\nprint(\"caf\xe9\")\n";

        let info = classify_file_info(Path::new("script.py"), latin1_script);

        assert_eq!(info.programming_language.as_deref(), Some("Python"));
        assert!(info.is_script);
        assert_eq!(info.file_type, "python script, text executable");
    }

    /// A `.tga` file whose content is plain ASCII text is still flagged as
    /// media, and is reported as text rather than binary.
    #[test]
    fn test_classify_file_info_treats_textual_tga_as_media() {
        let fake_image = b"not really a tga\n";

        let info = classify_file_info(Path::new("texture.tga"), fake_image);

        assert!(info.is_media);
        assert!(info.is_text);
        assert!(!info.is_binary);
    }

    /// High-bit bytes (`0x80..=0x85`) under a `.ts` name are binary: the source
    /// extension alone must not make the file text, source, or TypeScript.
    #[test]
    fn test_classify_file_info_keeps_binaryish_source_extension_out_of_text_path() {
        let high_bytes: &[u8] = &[0x80, 0x81, 0x82, 0x83, 0x84, 0x85];

        let info = classify_file_info(Path::new("main.ts"), high_bytes);

        assert!(info.is_binary);
        assert!(!info.is_text);
        assert!(!info.is_source);
        assert_eq!(info.programming_language, None);
    }

    /// A small `GIF89a` payload yields no detection text and the `None`
    /// extraction kind.
    #[test]
    fn test_extract_text_for_detection_skips_unsupported_image_formats() {
        let gif = b"GIF89a\x01\x00\x01\x00\x80\x00\x00\x00\x00\x00\xff\xff\xff,\x00\x00\x00\x00\x01\x00\x01\x00\x00\x02\x02D\x01\x00;";

        let (extracted, extraction_kind) = extract_text_for_detection(Path::new("tiny.gif"), gif);

        assert!(extracted.is_empty());
        assert_eq!(extraction_kind, ExtractedTextKind::None);
    }

    /// Table-driven check of `classify_file_info`: for each sample file it pins
    /// the detected language together with the `is_source` and `is_script`
    /// flags.
    #[test]
    fn test_classify_file_info_preserves_language_detection_precedence_matrix() {
        /// One row of the matrix: the input file and the expected outcome.
        struct Case {
            path: &'static str,
            bytes: &'static [u8],
            language: Option<&'static str>,
            is_source: bool,
            is_script: bool,
        }

        let matrix = [
            Case {
                path: "bin/run",
                bytes: b"#!/usr/bin/env node\nconsole.log('hello');\n",
                language: Some("JavaScript"),
                is_source: true,
                is_script: true,
            },
            Case {
                path: "Dockerfile",
                bytes: b"FROM scratch\n",
                language: Some("Dockerfile"),
                is_source: true,
                is_script: false,
            },
            Case {
                path: "package.json",
                bytes: br#"{"name":"demo"}"#,
                language: None,
                is_source: false,
                is_script: false,
            },
            Case {
                path: "config.yaml",
                bytes: b"key: value\n",
                language: None,
                is_source: false,
                is_script: false,
            },
            Case {
                path: "Makefile",
                bytes: b"all:\n\techo hi\n",
                language: None,
                is_source: false,
                is_script: false,
            },
        ];

        for case in matrix {
            let path = Path::new(case.path);
            let info = classify_file_info(path, case.bytes);

            // Every failure message names the offending path.
            assert_eq!(
                info.programming_language.as_deref(),
                case.language,
                "unexpected language for {}",
                path.display()
            );
            assert_eq!(
                info.is_source,
                case.is_source,
                "unexpected is_source for {}",
                path.display()
            );
            assert_eq!(
                info.is_script,
                case.is_script,
                "unexpected is_script for {}",
                path.display()
            );
        }
    }
}