lling-llang 0.1.0

//! Subword lexicon builder for ASR with BPE tokenization.
//!
//! This module extends the standard lexicon with subword tokenization support,
//! enabling handling of out-of-vocabulary words and morphologically rich languages.
//!
//! # Subword Boundary Markers
//!
//! Three marking styles are supported (based on Smit et al. 2017):
//!
//! - **LeftMarked**: `+word` - subword continues from the left (word fragment)
//! - **RightMarked**: `word+` - subword continues to the right
//! - **BoundaryTag**: `<w>word` - explicit word boundary before word
//!
//! # Example
//!
//! ```ignore
//! use lling_llang::asr::SubwordLexiconBuilder;
//! use lling_llang::semiring::LogWeight;
//!
//! let mut builder = SubwordLexiconBuilder::<LogWeight>::new(MarkingStyle::LeftMarked);
//!
//! // Add whole word entry
//! builder.add_word("hello", &["HH", "AH", "L", "OW"], LogWeight::one());
//!
//! // Add subword entries (e.g., from BPE)
//! builder.add_subword("hel", &["HH", "AH", "L"], SubwordPosition::Initial);
//! builder.add_subword("lo", &["L", "OW"], SubwordPosition::Final);
//!
//! // Build into cascade
//! let cascade = builder.build_cascade(&ngram)?;
//! ```
//!
//! # References
//!
//! - Smit, P., Virpioja, S., & Kurimo, M. (2017). Improved Subword Modeling
//!   for WFST-Based Speech Recognition. Interspeech.

use std::collections::HashMap;
use std::marker::PhantomData;

use crate::semiring::Semiring;
use crate::wfst::{MutableWfst, VectorWfst};

use super::cascade::LexiconEntry;
use super::context::PhoneId;
use super::ngram::WordId;

/// Subword boundary marking style.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum MarkingStyle {
    /// Left-marked: `+word` means continuation from left (word fragment).
    /// Used for morpheme-initial subwords.
    LeftMarked,
    /// Right-marked: `word+` means continuation to right.
    /// Used for morpheme-final subwords.
    RightMarked,
    /// Boundary tag: `<w>word` marks word boundaries explicitly.
    /// Most explicit but adds more symbols.
    BoundaryTag,
}

/// Position of subword within a word.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SubwordPosition {
    /// Complete word (not a subword).
    WholeWord,
    /// Initial subword (start of word).
    Initial,
    /// Medial subword (middle of word).
    Medial,
    /// Final subword (end of word).
    Final,
}

/// A subword entry in the lexicon.
#[derive(Clone, Debug)]
pub struct SubwordEntry<W: Semiring> {
    /// The subword text (with boundary markers applied).
    pub subword: String,
    /// Subword ID for FST labels.
    pub subword_id: u32,
    /// Pronunciation as sequence of phones.
    pub phones: Vec<PhoneId>,
    /// Position within word.
    pub position: SubwordPosition,
    /// Weight (log probability or cost).
    pub weight: W,
    /// Original unmarked subword text.
    pub raw_subword: String,
}

/// Builder for subword lexicons.
///
/// Creates FST-based lexicons that map subword sequences to phone sequences,
/// with proper boundary marking for word reconstruction.
pub struct SubwordLexiconBuilder<W: Semiring> {
    /// Boundary marking style.
    marking_style: MarkingStyle,

    /// Subword vocabulary: raw text -> ID.
    subword_vocab: HashMap<String, u32>,

    /// Reverse vocabulary: ID -> marked text.
    reverse_vocab: Vec<String>,

    /// Subword entries.
    entries: Vec<SubwordEntry<W>>,

    /// Word-to-subword mappings for decomposition.
    word_decompositions: HashMap<WordId, Vec<u32>>,

    /// Phone vocabulary: phone string -> ID.
    phone_vocab: HashMap<String, PhoneId>,

    /// Reverse phone vocab: ID -> string.
    reverse_phone_vocab: Vec<String>,

    /// Next available subword ID.
    next_subword_id: u32,

    /// Weight marker.
    _weight: PhantomData<W>,
}

impl<W: Semiring + Clone> SubwordLexiconBuilder<W> {
    /// Create a new subword lexicon builder.
    pub fn new(marking_style: MarkingStyle) -> Self {
        Self {
            marking_style,
            subword_vocab: HashMap::new(),
            reverse_vocab: Vec::new(),
            entries: Vec::new(),
            word_decompositions: HashMap::new(),
            phone_vocab: HashMap::new(),
            reverse_phone_vocab: Vec::new(),
            next_subword_id: 0,
            _weight: PhantomData,
        }
    }

    /// Get the marking style.
    pub fn marking_style(&self) -> MarkingStyle {
        self.marking_style
    }

    /// Get subword vocabulary size.
    pub fn vocab_size(&self) -> usize {
        self.subword_vocab.len()
    }

    /// Get phone vocabulary size.
    pub fn phone_vocab_size(&self) -> usize {
        self.phone_vocab.len()
    }

    /// Add or get phone ID.
    fn intern_phone(&mut self, phone: &str) -> PhoneId {
        if let Some(&id) = self.phone_vocab.get(phone) {
            id
        } else {
            let id = self.reverse_phone_vocab.len() as PhoneId;
            self.phone_vocab.insert(phone.to_string(), id);
            self.reverse_phone_vocab.push(phone.to_string());
            id
        }
    }

    /// Apply boundary marking to a subword.
    fn apply_marking(&self, subword: &str, position: SubwordPosition) -> String {
        match (self.marking_style, position) {
            // Whole words get no marking
            (_, SubwordPosition::WholeWord) => subword.to_string(),

            // Left-marked: prefix with + for continuations
            (MarkingStyle::LeftMarked, SubwordPosition::Initial) => subword.to_string(),
            (MarkingStyle::LeftMarked, SubwordPosition::Medial) => format!("+{}", subword),
            (MarkingStyle::LeftMarked, SubwordPosition::Final) => format!("+{}", subword),

            // Right-marked: suffix with + for continuations
            (MarkingStyle::RightMarked, SubwordPosition::Initial) => format!("{}+", subword),
            (MarkingStyle::RightMarked, SubwordPosition::Medial) => format!("{}+", subword),
            (MarkingStyle::RightMarked, SubwordPosition::Final) => subword.to_string(),

            // Boundary tag: add <w> before word-initial subwords
            (MarkingStyle::BoundaryTag, SubwordPosition::Initial) => format!("<w>{}", subword),
            (MarkingStyle::BoundaryTag, SubwordPosition::Medial) => subword.to_string(),
            (MarkingStyle::BoundaryTag, SubwordPosition::Final) => subword.to_string(),
        }
    }

    /// Check if a marked subword indicates word boundary.
    pub fn is_word_boundary(&self, marked_subword: &str) -> bool {
        match self.marking_style {
            MarkingStyle::LeftMarked => !marked_subword.starts_with('+'),
            MarkingStyle::RightMarked => !marked_subword.ends_with('+'),
            MarkingStyle::BoundaryTag => marked_subword.starts_with("<w>"),
        }
    }

    /// Add a complete word entry.
    pub fn add_word(&mut self, word: &str, phones: &[&str], weight: W) -> u32 {
        self.add_subword(word, phones, SubwordPosition::WholeWord, weight)
    }

    /// Add a subword entry with position marking.
    pub fn add_subword(
        &mut self,
        subword: &str,
        phones: &[&str],
        position: SubwordPosition,
        weight: W,
    ) -> u32 {
        let marked = self.apply_marking(subword, position);

        // Check if already exists
        if let Some(&id) = self.subword_vocab.get(&marked) {
            return id;
        }

        // Assign new ID
        let id = self.next_subword_id;
        self.next_subword_id += 1;

        // Intern phones
        let phone_ids: Vec<PhoneId> = phones.iter().map(|p| self.intern_phone(p)).collect();

        // Store in vocabulary
        self.subword_vocab.insert(marked.clone(), id);
        self.reverse_vocab.push(marked.clone());

        // Create entry
        let entry = SubwordEntry {
            subword: marked,
            subword_id: id,
            phones: phone_ids,
            position,
            weight,
            raw_subword: subword.to_string(),
        };
        self.entries.push(entry);

        id
    }

    /// Register a word decomposition into subwords.
    ///
    /// This is used when the language model operates on subwords but you want
    /// to maintain word-level alignment for evaluation.
    pub fn register_decomposition(&mut self, word_id: WordId, subword_ids: Vec<u32>) {
        self.word_decompositions.insert(word_id, subword_ids);
    }

    /// Get subword ID by marked text.
    pub fn get_subword_id(&self, marked_subword: &str) -> Option<u32> {
        self.subword_vocab.get(marked_subword).copied()
    }

    /// Get marked text by subword ID.
    pub fn get_subword_text(&self, id: u32) -> Option<&str> {
        self.reverse_vocab.get(id as usize).map(|s| s.as_str())
    }

    /// Get phone name by ID.
    pub fn get_phone_name(&self, id: PhoneId) -> Option<&str> {
        self.reverse_phone_vocab
            .get(id as usize)
            .map(|s| s.as_str())
    }

    /// Build the subword lexicon transducer (L).
    ///
    /// Creates an FST mapping subword sequences to phone sequences.
    ///
    /// Input labels: phones
    /// Output labels: subword IDs
    pub fn build_lexicon_fst(&self) -> VectorWfst<PhoneId, W> {
        let mut fst: VectorWfst<PhoneId, W> = VectorWfst::new();

        // Create initial state (also accepting - allows empty input)
        let start = fst.add_state();
        fst.set_start(start);
        fst.set_final(start, W::one());

        // Add each subword entry
        for entry in &self.entries {
            if entry.phones.is_empty() {
                continue;
            }

            let mut current = start;

            // First phone: output the subword label
            // (In a proper L transducer, output would be on first arc)
            let next = fst.add_state();
            fst.add_arc(
                current,
                Some(entry.phones[0]),
                Some(entry.phones[0]),
                next,
                entry.weight.clone(),
            );
            current = next;

            // Middle phones (if any)
            for &phone in entry
                .phones
                .iter()
                .skip(1)
                .take(entry.phones.len().saturating_sub(2))
            {
                let next = fst.add_state();
                fst.add_arc(current, Some(phone), Some(phone), next, W::one());
                current = next;
            }

            // Last phone (if more than one phone): return to start
            if entry.phones.len() > 1 {
                let last_phone = entry.phones[entry.phones.len() - 1];
                fst.add_arc(current, Some(last_phone), Some(last_phone), start, W::one());
            } else {
                // Single-phone subword: create self-loop back to start
                // The arc we added already goes from start to next
                // Add epsilon transition from next back to start
                fst.add_arc(current, None, None, start, W::one());
            }
        }

        fst
    }

    /// Convert entries to standard LexiconEntry format for CascadeBuilder.
    pub fn to_lexicon_entries(&self) -> Vec<LexiconEntry<W>> {
        self.entries
            .iter()
            .map(|e| LexiconEntry {
                word: e.subword_id as WordId,
                phones: e.phones.clone(),
                weight: e.weight.clone(),
                auxiliaries: Vec::new(),
            })
            .collect()
    }

    /// Get all entries.
    pub fn entries(&self) -> &[SubwordEntry<W>] {
        &self.entries
    }

    /// Get word decomposition by word ID.
    pub fn get_decomposition(&self, word_id: WordId) -> Option<&[u32]> {
        self.word_decompositions.get(&word_id).map(|v| v.as_slice())
    }

    /// Reconstruct word from subword sequence.
    ///
    /// Uses boundary markers to determine word boundaries.
    pub fn reconstruct_words(&self, subword_ids: &[u32]) -> Vec<String> {
        let mut words = Vec::new();
        let mut current_word = String::new();

        for &id in subword_ids {
            let Some(marked) = self.get_subword_text(id) else {
                continue;
            };

            // Get the raw subword without markers
            let raw = match self.marking_style {
                MarkingStyle::LeftMarked => {
                    if marked.starts_with('+') {
                        &marked[1..]
                    } else {
                        marked
                    }
                }
                MarkingStyle::RightMarked => {
                    if marked.ends_with('+') {
                        &marked[..marked.len() - 1]
                    } else {
                        marked
                    }
                }
                MarkingStyle::BoundaryTag => {
                    if marked.starts_with("<w>") {
                        &marked[3..]
                    } else {
                        marked
                    }
                }
            };

            // Check for word boundary
            let is_boundary = self.is_word_boundary(marked);

            // For left-marked and boundary-tag: boundary is at the START
            // (finalize previous word before starting new one)
            // For right-marked: boundary is at the END
            // (finalize current word after appending)
            match self.marking_style {
                MarkingStyle::LeftMarked | MarkingStyle::BoundaryTag => {
                    // Boundary at start: finalize before appending
                    if is_boundary && !current_word.is_empty() {
                        words.push(current_word);
                        current_word = String::new();
                    }
                    current_word.push_str(raw);
                }
                MarkingStyle::RightMarked => {
                    // Boundary at end: append then finalize
                    current_word.push_str(raw);
                    if is_boundary && !current_word.is_empty() {
                        words.push(current_word);
                        current_word = String::new();
                    }
                }
            }
        }

        // Push final word if non-empty
        if !current_word.is_empty() {
            words.push(current_word);
        }

        words
    }
}

impl<W: Semiring + Clone> Default for SubwordLexiconBuilder<W> {
    fn default() -> Self {
        Self::new(MarkingStyle::LeftMarked)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::semiring::TropicalWeight;
    use crate::wfst::Wfst;

    #[test]
    fn test_marking_style_left() {
        let builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::LeftMarked);

        assert_eq!(
            builder.apply_marking("word", SubwordPosition::WholeWord),
            "word"
        );
        assert_eq!(
            builder.apply_marking("hel", SubwordPosition::Initial),
            "hel"
        );
        assert_eq!(builder.apply_marking("lo", SubwordPosition::Medial), "+lo");
        assert_eq!(builder.apply_marking("ing", SubwordPosition::Final), "+ing");
    }

    #[test]
    fn test_marking_style_right() {
        let builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::RightMarked);

        assert_eq!(
            builder.apply_marking("word", SubwordPosition::WholeWord),
            "word"
        );
        assert_eq!(
            builder.apply_marking("hel", SubwordPosition::Initial),
            "hel+"
        );
        assert_eq!(builder.apply_marking("lo", SubwordPosition::Medial), "lo+");
        assert_eq!(builder.apply_marking("ing", SubwordPosition::Final), "ing");
    }

    #[test]
    fn test_marking_style_boundary() {
        let builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::BoundaryTag);

        assert_eq!(
            builder.apply_marking("word", SubwordPosition::WholeWord),
            "word"
        );
        assert_eq!(
            builder.apply_marking("hel", SubwordPosition::Initial),
            "<w>hel"
        );
        assert_eq!(builder.apply_marking("lo", SubwordPosition::Medial), "lo");
        assert_eq!(builder.apply_marking("ing", SubwordPosition::Final), "ing");
    }

    #[test]
    fn test_add_word() {
        let mut builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::LeftMarked);

        let id = builder.add_word("hello", &["HH", "AH", "L", "OW"], TropicalWeight::one());
        assert_eq!(id, 0);
        assert_eq!(builder.vocab_size(), 1);
        assert_eq!(builder.phone_vocab_size(), 4);

        // Adding same word should return same ID
        let id2 = builder.add_word("hello", &["HH", "AH", "L", "OW"], TropicalWeight::one());
        assert_eq!(id, id2);
    }

    #[test]
    fn test_add_subwords() {
        let mut builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::LeftMarked);

        let id1 = builder.add_subword(
            "hel",
            &["HH", "AH", "L"],
            SubwordPosition::Initial,
            TropicalWeight::one(),
        );
        let id2 = builder.add_subword(
            "lo",
            &["L", "OW"],
            SubwordPosition::Final,
            TropicalWeight::one(),
        );

        assert_eq!(id1, 0);
        assert_eq!(id2, 1);
        assert_eq!(builder.vocab_size(), 2);

        // Check marked forms
        assert_eq!(builder.get_subword_text(id1), Some("hel"));
        assert_eq!(builder.get_subword_text(id2), Some("+lo"));
    }

    #[test]
    fn test_is_word_boundary() {
        let left_builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::LeftMarked);
        assert!(left_builder.is_word_boundary("hello"));
        assert!(!left_builder.is_word_boundary("+ing"));

        let right_builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::RightMarked);
        assert!(right_builder.is_word_boundary("hello"));
        assert!(!right_builder.is_word_boundary("hel+"));

        let boundary_builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::BoundaryTag);
        assert!(boundary_builder.is_word_boundary("<w>hello"));
        assert!(!boundary_builder.is_word_boundary("ing"));
    }

    #[test]
    fn test_reconstruct_words_left_marked() {
        let mut builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::LeftMarked);

        // Add subwords: "hel", "+lo", "world"
        let id1 = builder.add_subword(
            "hel",
            &["HH", "AH", "L"],
            SubwordPosition::Initial,
            TropicalWeight::one(),
        );
        let id2 = builder.add_subword(
            "lo",
            &["L", "OW"],
            SubwordPosition::Final,
            TropicalWeight::one(),
        );
        let id3 = builder.add_word("world", &["W", "ER", "L", "D"], TropicalWeight::one());

        let words = builder.reconstruct_words(&[id1, id2, id3]);
        assert_eq!(words, vec!["hello", "world"]);
    }

    #[test]
    fn test_reconstruct_words_right_marked() {
        let mut builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::RightMarked);

        // Add subwords: "hel+", "lo", "world"
        let id1 = builder.add_subword(
            "hel",
            &["HH", "AH", "L"],
            SubwordPosition::Initial,
            TropicalWeight::one(),
        );
        let id2 = builder.add_subword(
            "lo",
            &["L", "OW"],
            SubwordPosition::Final,
            TropicalWeight::one(),
        );
        let id3 = builder.add_word("world", &["W", "ER", "L", "D"], TropicalWeight::one());

        let words = builder.reconstruct_words(&[id1, id2, id3]);
        assert_eq!(words, vec!["hello", "world"]);
    }

    #[test]
    fn test_build_lexicon_fst() {
        let mut builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::LeftMarked);

        builder.add_word("hi", &["HH", "AY"], TropicalWeight::one());
        builder.add_word("bye", &["B", "AY"], TropicalWeight::one());

        let fst = builder.build_lexicon_fst();

        // Should have states: start + 2 states per word = 5 states
        assert!(fst.num_states() >= 3);
        // Verify start state is valid
        assert!(fst.is_valid_state(fst.start()));
    }

    #[test]
    fn test_to_lexicon_entries() {
        let mut builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::LeftMarked);

        builder.add_word("hello", &["HH", "AH", "L", "OW"], TropicalWeight::new(1.5));

        let entries = builder.to_lexicon_entries();
        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].word, 0);
        assert_eq!(entries[0].phones.len(), 4);
        assert!((entries[0].weight.value() - 1.5).abs() < 0.001);
    }

    #[test]
    fn test_register_decomposition() {
        let mut builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::LeftMarked);

        let id1 = builder.add_subword(
            "un",
            &["AH", "N"],
            SubwordPosition::Initial,
            TropicalWeight::one(),
        );
        let id2 = builder.add_subword(
            "break",
            &["B", "R", "EY", "K"],
            SubwordPosition::Medial,
            TropicalWeight::one(),
        );
        let id3 = builder.add_subword(
            "able",
            &["AH", "B", "AH", "L"],
            SubwordPosition::Final,
            TropicalWeight::one(),
        );

        // Register word decomposition
        let word_id: WordId = 42;
        builder.register_decomposition(word_id, vec![id1, id2, id3]);

        let decomp = builder.get_decomposition(word_id);
        assert_eq!(decomp, Some(&[id1, id2, id3][..]));
    }

    #[test]
    fn test_phone_interning() {
        let mut builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::LeftMarked);

        builder.add_word("aaa", &["AH", "AH", "AH"], TropicalWeight::one());
        builder.add_word("bbb", &["B", "B", "B"], TropicalWeight::one());

        // Should only have 2 unique phones
        assert_eq!(builder.phone_vocab_size(), 2);
        assert_eq!(builder.get_phone_name(0), Some("AH"));
        assert_eq!(builder.get_phone_name(1), Some("B"));
    }

    #[test]
    fn test_empty_builder() {
        let builder: SubwordLexiconBuilder<TropicalWeight> =
            SubwordLexiconBuilder::new(MarkingStyle::LeftMarked);

        assert_eq!(builder.vocab_size(), 0);
        assert_eq!(builder.phone_vocab_size(), 0);
        assert!(builder.entries().is_empty());

        let fst = builder.build_lexicon_fst();
        assert_eq!(fst.num_states(), 1); // Just start state
    }
}