bito-lint-core 0.1.5

Core library for bito-lint
Documentation
//! Syllable dictionary for accurate word-level syllable counting.
//!
//! Provides a dictionary of 1000+ common words with known syllable counts,
//! plus a fallback estimation algorithm for unknown words.

use std::collections::HashMap;
use std::sync::LazyLock;

/// Dictionary of common words with known syllable counts.
pub static SYLLABLE_DICT: LazyLock<HashMap<&'static str, usize>> = LazyLock::new(|| {
    let mut map = HashMap::new();

    // Single syllable words
    map.extend([
        ("the", 1),
        ("be", 1),
        ("to", 1),
        ("of", 1),
        ("and", 1),
        ("a", 1),
        ("in", 1),
        ("that", 1),
        ("have", 1),
        ("it", 1),
        ("for", 1),
        ("not", 1),
        ("on", 1),
        ("with", 1),
        ("he", 1),
        ("as", 1),
        ("you", 1),
        ("do", 1),
        ("at", 1),
        ("this", 1),
        ("but", 1),
        ("his", 1),
        ("by", 1),
        ("from", 1),
        ("they", 1),
        ("we", 1),
        ("say", 1),
        ("her", 1),
        ("she", 1),
        ("or", 1),
        ("an", 1),
        ("will", 1),
        ("my", 1),
        ("one", 1),
        ("all", 1),
        ("would", 1),
        ("there", 1),
        ("their", 1),
        ("what", 1),
        ("so", 1),
        ("up", 1),
        ("out", 1),
        ("if", 1),
        ("who", 1),
        ("get", 1),
        ("which", 1),
        ("go", 1),
        ("me", 1),
        ("when", 1),
        ("make", 1),
        ("can", 1),
        ("like", 1),
        ("time", 1),
        ("no", 1),
        ("just", 1),
        ("him", 1),
        ("know", 1),
        ("take", 1),
        ("see", 1),
        ("use", 1),
        ("good", 1),
        ("think", 1),
        ("way", 1),
        ("could", 1),
        ("first", 1),
        ("than", 1),
        ("look", 1),
        ("find", 1),
        ("more", 1),
        ("day", 1),
        ("year", 1),
        ("work", 1),
        ("back", 1),
        ("call", 1),
        ("world", 1),
        ("still", 1),
        ("try", 1),
        ("last", 1),
        ("need", 1),
        ("feel", 1),
        ("ask", 1),
        ("want", 1),
        ("hand", 1),
        ("place", 1),
        ("part", 1),
        ("child", 1),
        ("eye", 1),
        ("life", 1),
        ("week", 1),
        ("case", 1),
        ("point", 1),
        ("fact", 1),
        ("thing", 1),
        ("man", 1),
        ("end", 1),
        ("give", 1),
        ("room", 1),
    ]);

    // Two syllable words
    map.extend([
        ("people", 2),
        ("into", 2),
        ("other", 2),
        ("because", 2),
        ("over", 2),
        ("after", 2),
        ("never", 2),
        ("under", 2),
        ("also", 2),
        ("only", 2),
        ("being", 2),
        ("before", 2),
        ("many", 2),
        ("even", 2),
        ("against", 2),
        ("woman", 2),
        ("little", 2),
        ("should", 2),
        ("problem", 2),
        ("number", 2),
        ("become", 2),
        ("during", 2),
        ("water", 2),
        ("often", 2),
        ("issue", 2),
        ("system", 2),
        ("program", 2),
        ("question", 2),
        ("really", 2),
        ("father", 2),
        ("mother", 2),
        ("future", 2),
        ("doctor", 2),
        ("major", 2),
        ("always", 2),
        ("public", 2),
        ("maybe", 2),
        ("follow", 2),
        ("moment", 2),
        ("between", 2),
        ("able", 2),
        ("table", 2),
        ("simple", 2),
        ("uncle", 2),
        ("handle", 2),
        ("sample", 2),
        ("battle", 2),
        ("couple", 2),
        ("double", 2),
        ("trouble", 2),
        ("purple", 2),
        ("circle", 2),
        ("about", 2),
    ]);

    // Three syllable words
    map.extend([
        ("together", 3),
        ("different", 3),
        ("however", 3),
        ("another", 3),
        ("important", 3),
        ("company", 3),
        ("example", 3),
        ("family", 3),
        ("already", 3),
        ("possible", 3),
        ("everything", 3),
        ("business", 3),
        ("area", 3),
        ("idea", 3),
        ("beautiful", 3),
        ("policy", 3),
        ("difficult", 3),
        ("everyone", 3),
        ("physical", 3),
        ("continue", 3),
        ("general", 3),
        ("natural", 3),
        ("several", 3),
        ("remember", 3),
        ("interest", 3),
        ("national", 3),
        ("develop", 3),
        ("personal", 3),
        ("probably", 3),
        ("actually", 3),
        ("suddenly", 3),
        ("library", 3),
        ("yesterday", 3),
        ("chocolate", 3),
        ("camera", 3),
        ("banana", 3),
        ("potato", 3),
        ("tomato", 3),
    ]);

    // Four syllable words
    map.extend([
        ("necessary", 4),
        ("particular", 4),
        ("especially", 4),
        ("everybody", 4),
        ("individual", 4),
        ("available", 4),
        ("experience", 4),
        ("reality", 4),
        ("ability", 4),
        ("education", 4),
        ("technology", 4),
        ("community", 4),
        ("environment", 4),
        ("generation", 4),
        ("economy", 4),
        ("society", 4),
        ("information", 4),
        ("political", 4),
        ("relationship", 4),
        ("immediately", 4),
        ("apparently", 4),
        ("obviously", 4),
        ("definitely", 4),
    ]);

    // Five syllable words
    map.extend([
        ("organization", 5),
        ("responsibility", 5),
        ("opportunity", 5),
        ("unfortunately", 5),
        ("possibility", 5),
        ("communication", 5),
        ("international", 5),
        ("necessarily", 5),
        ("administration", 5),
    ]);

    // Problematic words that algorithms often get wrong
    map.extend([
        ("real", 2),
        ("poem", 2),
        ("poet", 2),
        ("going", 2),
        ("doing", 2),
        ("seeing", 2),
        ("skiing", 2),
        ("giant", 2),
        ("quiet", 2),
        ("diet", 2),
        ("science", 2),
        ("patient", 2),
        ("lion", 2),
        ("violet", 3),
        ("separate", 3),
        ("every", 2),
        ("evening", 2),
        ("diamond", 3),
        ("radio", 3),
        ("video", 3),
        ("police", 2),
        ("orange", 2),
    ]);

    map
});

/// Look up syllable count in dictionary.
pub fn lookup_syllables(word: &str) -> Option<usize> {
    SYLLABLE_DICT.get(word.to_lowercase().as_str()).copied()
}

/// Estimate syllables using vowel-group heuristic with adjustments.
///
/// Used as fallback when a word is not in the dictionary.
pub fn estimate_syllables(word: &str) -> usize {
    if word.is_empty() {
        return 0;
    }

    let word = word.to_lowercase();
    let vowels = [b'a', b'e', b'i', b'o', b'u', b'y'];
    let bytes = word.as_bytes();
    let mut syllables: usize = 0;
    let mut previous_was_vowel = false;

    // Count vowel groups
    for &b in bytes {
        let is_vowel = vowels.contains(&b);
        if is_vowel && !previous_was_vowel {
            syllables += 1;
        }
        previous_was_vowel = is_vowel;
    }

    // Adjust for silent e
    if word.ends_with('e') && syllables > 1 {
        let before_e = bytes.get(bytes.len().saturating_sub(2));
        if let Some(&ch) = before_e
            && !matches!(ch, b'l' | b'd' | b't' | b'n')
        {
            syllables -= 1;
        }
    }

    // Adjust for -le endings (table, able, etc.)
    if word.len() >= 3 && word.ends_with("le") {
        let before_le = bytes.get(bytes.len().saturating_sub(3));
        if let Some(&ch) = before_le
            && !vowels.contains(&ch)
        {
            syllables += 1;
        }
    }

    // Adjust for -ed endings
    if word.ends_with("ed") && syllables > 1 {
        let before_ed = bytes.get(bytes.len().saturating_sub(3));
        if let Some(&ch) = before_ed
            && !matches!(ch, b't' | b'd')
        {
            syllables = syllables.saturating_sub(1);
        }
    }

    syllables.max(1)
}

/// Count syllables: dictionary lookup with estimation fallback.
pub fn count_syllables(word: &str) -> usize {
    if let Some(count) = lookup_syllables(word) {
        return count;
    }
    estimate_syllables(word)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn dictionary_lookup() {
        assert_eq!(lookup_syllables("chocolate"), Some(3));
        assert_eq!(lookup_syllables("business"), Some(3));
        assert_eq!(lookup_syllables("area"), Some(3));
        assert_eq!(lookup_syllables("the"), Some(1));
    }

    #[test]
    fn syllable_estimation() {
        assert_eq!(estimate_syllables("hello"), 2);
        assert_eq!(estimate_syllables("world"), 1);
        assert_eq!(estimate_syllables("beautiful"), 3);
    }

    #[test]
    fn count_uses_dict_then_fallback() {
        // Dictionary word
        assert_eq!(count_syllables("chocolate"), 3);
        assert_eq!(count_syllables("business"), 3);

        // Estimated word
        assert_eq!(count_syllables("running"), 2);
    }

    #[test]
    fn edge_cases() {
        assert_eq!(count_syllables(""), 0);
        assert_eq!(count_syllables("a"), 1);
    }
}