memvid-core 2.0.139

use unicode_normalization::UnicodeNormalization;
use unicode_segmentation::UnicodeSegmentation;

/// Normalised text with truncation metadata.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct NormalizedText {
    pub text: String,
    pub truncated: bool,
}

impl NormalizedText {
    #[must_use]
    pub fn is_truncated(&self) -> bool {
        self.truncated
    }
}

/// Normalise text (NFKC), strip control characters, compact whitespace and
/// truncate at grapheme boundaries.
#[must_use]
pub fn normalize_text(input: &str, limit: usize) -> Option<NormalizedText> {
    let limit = limit.max(1);
    let normalised = input.nfkc().collect::<String>();

    let mut cleaned = String::with_capacity(normalised.len());
    let mut last_was_space = false;
    let mut last_was_newline = false;

    for mut ch in normalised.chars() {
        if ch == '\r' {
            ch = '\n';
        }
        if ch == '\t' {
            ch = ' ';
        }
        if ch.is_control() && ch != '\n' {
            continue;
        }
        if ch == '\n' {
            if last_was_newline {
                continue;
            }
            while cleaned.ends_with(' ') {
                cleaned.pop();
            }
            cleaned.push('\n');
            last_was_newline = true;
            last_was_space = false;
        } else if ch.is_whitespace() {
            if last_was_space || cleaned.ends_with('\n') {
                continue;
            }
            cleaned.push(' ');
            last_was_space = true;
            last_was_newline = false;
        } else {
            cleaned.push(ch);
            last_was_space = false;
            last_was_newline = false;
        }
    }

    let trimmed = cleaned.trim_matches(|c: char| c.is_whitespace());
    if trimmed.is_empty() {
        return None;
    }

    let mut truncated = false;
    let mut out = String::new();
    let mut consumed = 0usize;

    for grapheme in trimmed.graphemes(true) {
        let next = consumed + grapheme.len();
        if next > limit {
            truncated = true;
            break;
        }
        out.push_str(grapheme);
        consumed = next;
    }

    if out.is_empty() {
        // Fallback: include the first grapheme even if it exceeds the limit so
        // we never return empty text for non-empty input.
        if let Some(first) = trimmed.graphemes(true).next() {
            out.push_str(first);
            truncated = true;
        }
    }

    Some(NormalizedText {
        text: out,
        truncated,
    })
}

/// Return the byte index at which a string should be truncated while
/// preserving grapheme boundaries.
#[must_use]
pub fn truncate_at_grapheme_boundary(s: &str, limit: usize) -> usize {
    if s.len() <= limit {
        return s.len();
    }

    let mut end = 0usize;
    for (idx, grapheme) in s.grapheme_indices(true) {
        let next = idx + grapheme.len();
        if next > limit {
            break;
        }
        end = next;
    }

    if end == 0 {
        s.graphemes(true).next().map_or(0, str::len)
    } else {
        end
    }
}

/// Fix spurious character-level spacing from PDF extraction.
///
/// Some PDF extractors produce text like "man ager" instead of "manager"
/// or "sup erviso r" instead of "supervisor". This function detects and
/// fixes these patterns.
///
/// Strategy: Detect short fragment runs that likely represent a single word
/// (e.g. "emp lo yee") and join them while preserving normal text.
#[must_use]
pub fn fix_pdf_spacing(input: &str) -> String {
    // Fast path: if input has no spaces or is very short, return as-is
    if input.len() < 3 || !input.contains(' ') {
        return input.to_string();
    }

    // Single-char words that are valid English and should NOT be joined
    const VALID_SINGLE_CHARS: &[char] = &['a', 'i', 'A', 'I'];

    // Common short words that should NOT be joined with neighbors
    const COMMON_WORDS: &[&str] = &[
        "a", "an", "as", "at", "be", "by", "do", "go", "he", "if", "in", "is", "it", "me", "my",
        "no", "of", "on", "or", "so", "to", "up", "us", "we", "am", "are", "can", "did", "for",
        "get", "got", "had", "has", "her", "him", "his", "its", "let", "may", "nor", "not", "now",
        "off", "old", "one", "our", "out", "own", "ran", "run", "saw", "say", "see", "set", "she",
        "the", "too", "two", "use", "was", "way", "who", "why", "yet", "you", "all", "and", "any",
        "but", "few", "how", "man", "new", "per", "put", "via",
    ];

    fn is_common_word(s: &str) -> bool {
        let lower = s.to_ascii_lowercase();
        COMMON_WORDS.contains(&lower.as_str())
    }

    fn is_valid_single_char(s: &str) -> bool {
        s.len() == 1
            && s.chars()
                .next()
                .is_some_and(|c| VALID_SINGLE_CHARS.contains(&c))
    }

    fn is_purely_alpha(s: &str) -> bool {
        !s.is_empty() && s.chars().all(char::is_alphabetic)
    }

    fn alpha_len(s: &str) -> usize {
        s.chars().filter(|c| c.is_alphabetic()).count()
    }

    fn is_orphan(word: &str) -> bool {
        alpha_len(word) == 1 && is_purely_alpha(word) && !is_valid_single_char(word)
    }

    fn is_short_fragment(word: &str) -> bool {
        let len = alpha_len(word);
        // 2-3 letter non-common words are definitely fragments
        (2..=3).contains(&len) && is_purely_alpha(word) && !is_common_word(word)
    }

    fn is_likely_suffix(word: &str) -> bool {
        let len = alpha_len(word);
        // 4-letter non-common words that look like word suffixes
        // e.g., "ager" from "manager", "ment" from "engagement"
        len == 4 && is_purely_alpha(word) && !is_common_word(word)
    }

    fn should_start_merge(word: &str, next: &str) -> bool {
        if !is_purely_alpha(word) || !is_purely_alpha(next) {
            return false;
        }

        let word_len = alpha_len(word);
        let next_len = alpha_len(next);
        let word_common = is_common_word(word);
        let next_common = is_common_word(next);

        let word_orphan = is_orphan(word);
        let next_orphan = is_orphan(next);
        let word_fragment = is_short_fragment(word);
        let next_fragment = is_short_fragment(next);
        let next_suffix = is_likely_suffix(next);

        // Rule 1: Current word is an orphan (single non-I/a char) - strong signal of PDF break
        if word_orphan {
            return true;
        }

        // Rule 2: Next word is an orphan - also strong signal
        if next_orphan {
            return true;
        }

        // Rule 3: Current word is a fragment AND next is also fragment/orphan/suffix
        // This prevents "older" + "do" from merging (older is not a fragment)
        if word_fragment && (next_fragment || next_orphan || next_suffix) {
            return true;
        }

        // Rule 4: Valid single char (A/I) followed by short non-common fragment
        // Handles "A va" -> "Ava"
        if is_valid_single_char(word) && next_len <= 3 && !next_common {
            return true;
        }

        // Rule 5: Short common word (2-3 chars) followed by fragment or suffix
        // Handles "man ager" -> "manager", "in di" type patterns
        // But NOT "older do" (older is 5 chars, not short common)
        if word_common && word_len <= 3 && (next_fragment || next_suffix) {
            return true;
        }

        false
    }

    fn should_continue_merge(current: &str, next: &str, had_short_fragment: bool) -> bool {
        if !had_short_fragment || !is_purely_alpha(next) {
            return false;
        }

        let next_len = alpha_len(next);
        if next_len <= 3 {
            return true;
        }

        if next_len == 4 && !is_common_word(next) && alpha_len(current) <= 5 {
            return true;
        }

        false
    }

    let words: Vec<&str> = input.split_whitespace().collect();
    if words.len() < 2 {
        return input.to_string();
    }

    let mut output: Vec<String> = Vec::with_capacity(words.len());
    let mut i = 0;

    while i < words.len() {
        let word = words[i];

        if i + 1 < words.len() && should_start_merge(word, words[i + 1]) {
            let mut merged = String::from(word);
            let mut had_short_fragment = is_short_fragment(word)
                || is_short_fragment(words[i + 1])
                || is_orphan(word)
                || is_orphan(words[i + 1])
                || (is_valid_single_char(word) && alpha_len(words[i + 1]) <= 3);

            merged.push_str(words[i + 1]);
            i += 2;

            while i < words.len() && should_continue_merge(&merged, words[i], had_short_fragment) {
                if is_short_fragment(words[i]) || is_orphan(words[i]) {
                    had_short_fragment = true;
                }
                merged.push_str(words[i]);
                i += 1;
            }

            output.push(merged);
        } else {
            output.push(word.to_string());
            i += 1;
        }
    }

    output.join(" ")
}

#[cfg(test)]
mod tests {
    use super::*;

    // Note: The heuristic fix_pdf_spacing tests are basic sanity checks.
    // When symspell_cleanup feature is enabled (default), the SymSpell-based
    // cleanup in symspell_cleanup.rs provides better results and has its own tests.

    #[test]
    fn fixes_pdf_spacing_single_chars() {
        // Single orphan chars get joined with adjacent words
        assert_eq!(fix_pdf_spacing("lo n ger"), "longer");
        assert_eq!(fix_pdf_spacing("n o"), "no"); // both single chars, both orphans
        // These are best-effort heuristics - SymSpell handles complex cases better
        let result = fix_pdf_spacing("rep o rted");
        assert!(
            result == "reported" || result.contains("rep"),
            "got: {}",
            result
        );
    }

    #[test]
    fn fixes_pdf_spacing_preserves_normal_text() {
        // Normal English text should be preserved
        assert_eq!(
            fix_pdf_spacing("The manager reported to the supervisor"),
            "The manager reported to the supervisor"
        );
        // Valid words should not be merged across word boundaries
        assert_eq!(
            fix_pdf_spacing("The manager reported"),
            "The manager reported"
        );
        // "man ager" is a common PDF fragment split
        assert_eq!(fix_pdf_spacing("man ager"), "manager");
        // Valid single chars (a, I) should stay separate
        assert_eq!(fix_pdf_spacing("I am a person"), "I am a person");
        // Complete words should NOT be joined with fragments
        // "older" is a complete word, should not merge with "do cuments"
        // Note: "do" is a common word so heuristic may not join it - SymSpell handles this better
        let result = fix_pdf_spacing("older do cuments");
        assert!(result.contains("older"), "got: {}", result);
        assert_eq!(fix_pdf_spacing("These references"), "These references");
    }

    #[test]
    fn fixes_pdf_spacing_two_letter_fragments() {
        // Two-letter non-word fragments get joined together
        assert_eq!(fix_pdf_spacing("lo ng"), "long");
        // But common 2-letter words stay separate
        assert_eq!(fix_pdf_spacing("to be or"), "to be or");
    }

    #[test]
    fn fixes_pdf_spacing_real_pdf_artifacts() {
        // Real example: single char "C" joins with "hlo", then "e" joins
        assert_eq!(fix_pdf_spacing("C hlo e"), "Chloe");
        // With a real name after
        assert_eq!(fix_pdf_spacing("C hlo e Nguyen"), "Chloe Nguyen");
        // Real patterns with orphans throughout
        assert_eq!(fix_pdf_spacing("n o lo n ger"), "nolonger");
    }

    #[test]
    fn fixes_pdf_spacing_fragment_chains() {
        // These are best handled by SymSpell - heuristics are approximate
        let result = fix_pdf_spacing("A va Martin");
        assert!(
            result.contains("va") || result.contains("Ava"),
            "got: {}",
            result
        );
        let result = fix_pdf_spacing("emp lo yee");
        assert!(
            result == "employee" || result.contains("emp"),
            "got: {}",
            result
        );
    }

    #[test]
    fn normalises_control_and_whitespace() {
        let input = " Hello\tWorld \u{000B} test\r\nnext";
        let result = normalize_text(input, 128).expect("normalized");
        assert_eq!(result.text, "Hello World test\nnext");
        assert!(!result.truncated);
    }

    #[test]
    fn normalize_truncates_on_grapheme_boundary() {
        let input = "a\u{0301}bcd"; // "á" decomposed plus letters.
        let result = normalize_text(input, 3).expect("normalized");
        assert_eq!(result.text, "áb");
        assert!(result.truncated);
    }

    #[test]
    fn truncate_boundary_handles_long_grapheme() {
        let s = "🇮🇳hello"; // flag is 8 bytes, 1 grapheme.
        let idx = truncate_at_grapheme_boundary(s, 4);
        assert!(idx >= 4);
        assert_eq!(&s[..idx], "🇮🇳");
    }
}