// anno/preprocess/morphology.rs
1//! Morphological preprocessing for polysynthetic and agglutinative languages.
2//!
3//! # Overview
4//!
5//! This module provides preprocessing support for morphologically complex languages
6//! where standard tokenization fails. Polysynthetic languages (Cherokee, Navajo, Mohawk)
7//! encode entire sentences in single words; agglutinative languages (Quechua, Turkish)
8//! have productive morpheme concatenation.
9//!
10//! # Problem Statement
11//!
12//! Standard NER assumes word-level spans work well for entity boundaries. For polysynthetic
13//! languages, a single word may contain:
14//! - Subject, object, and verb
15//! - Tense, aspect, mood markers
16//! - Evidentiality markers
17//! - Named entity references
18//!
19//! Example (Mohawk): "wahshakotahráhkwen" = "he told someone something about him"
20//!
21//! # Approach
22//!
23//! 1. **Morpheme segmentation**: Split words into morphemes before NER
24//! 2. **Entity span mapping**: Map morpheme spans back to character offsets
25//! 3. **Pro-drop handling**: Insert placeholder nodes for null arguments
26//!
27//! # Example
28//!
29//! ```rust,ignore
//! use anno::preprocess::morphology::{MorphologicalPreprocessor, ProdropConfig, SegmentationStrategy};
//!
//! let preprocessor = MorphologicalPreprocessor::new()
//!     .with_strategy(SegmentationStrategy::BPE { vocab_size: 5000 })
//!     .with_prodrop_expansion(ProdropConfig::default());
35//!
36//! let segmented = preprocessor.segment("wahshakotahráhkwen")?;
37//! // Returns morpheme sequence with offset mapping
38//! ```
39//!
40//! # References
41//!
42//! - qxoRef (Quechua): 3,137 morphemes across 1,413 words
43//! - Cherokee syllabary: 85 characters representing CV syllables
44//! - Navajo: Complex verbal morphology with prefix templates
45
46use crate::offset::TextSpan;
47use crate::{Error, Result};
48use serde::{Deserialize, Serialize};
49use std::collections::HashMap;
50
/// Strategy for morphological segmentation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum SegmentationStrategy {
    /// Byte-Pair Encoding (BPE) based segmentation.
    ///
    /// NOTE(review): `vocab_size` is currently ignored by the simplified BPE
    /// implementation (which falls back to character segmentation) — confirm
    /// before relying on it.
    BPE {
        /// Target vocabulary size
        vocab_size: usize,
    },
    /// Character-level segmentation (fallback)
    Character,
    /// Syllable-based segmentation (for syllabic scripts like Cherokee).
    /// Requires a syllable inventory to have been loaded; segmentation
    /// fails with an error otherwise.
    Syllable,
    /// Rule-based segmentation using morpheme boundaries.
    /// The boundary characters themselves are consumed and never emitted
    /// as morphemes.
    RuleBased {
        /// Boundary markers (e.g., "-" for hyphenated morphemes)
        boundary_chars: Vec<char>,
    },
    /// External morphological analyzer (FST-based).
    /// Not yet implemented: `segment` returns `Error::FeatureNotAvailable`.
    External {
        /// Path to analyzer model
        model_path: String,
    },
}
74
75impl Default for SegmentationStrategy {
76    fn default() -> Self {
77        SegmentationStrategy::RuleBased {
78            boundary_chars: vec!['-'],
79        }
80    }
81}
82
/// A morpheme with its position in the original text.
///
/// `start..end` is a half-open range of **character** offsets (not byte
/// offsets) into the original text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Morpheme {
    /// The morpheme text
    pub text: String,
    /// Start offset in original text (character)
    pub start: usize,
    /// End offset in original text (character, exclusive)
    pub end: usize,
    /// Morpheme type (if known)
    pub morph_type: Option<MorphemeType>,
    /// Gloss (if available)
    pub gloss: Option<String>,
}
97
/// Types of morphemes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum MorphemeType {
    /// Root/stem morpheme
    Root,
    /// Prefix (attaches before the root)
    Prefix,
    /// Suffix (attaches after the root)
    Suffix,
    /// Infix (inserted within the root)
    Infix,
    /// Circumfix (surrounds the root)
    Circumfix,
    /// Clitic
    Clitic,
    /// Unknown type (the built-in segmenters in this module always emit this,
    /// since they do not classify morphemes)
    Unknown,
}
116
/// Result of morphological segmentation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SegmentationResult {
    /// Original text
    pub original: String,
    /// Sequence of morphemes, in order of appearance
    pub morphemes: Vec<Morpheme>,
    /// Whether pro-drop placeholders were inserted
    /// (currently always `false`; pro-drop expansion is not yet applied)
    pub has_prodrop_placeholders: bool,
    /// Mapping from morpheme indices to character spans; entry `i` is
    /// `(morphemes[i].start, morphemes[i].end)`
    pub span_map: Vec<(usize, usize)>,
}
129
130impl SegmentationResult {
131    /// Get morpheme text joined with separator.
132    pub fn joined(&self, separator: &str) -> String {
133        self.morphemes
134            .iter()
135            .map(|m| m.text.as_str())
136            .collect::<Vec<_>>()
137            .join(separator)
138    }
139
140    /// Map a morpheme span back to character offsets.
141    pub fn morpheme_to_char_span(
142        &self,
143        morph_start: usize,
144        morph_end: usize,
145    ) -> Option<(usize, usize)> {
146        if morph_start >= self.morphemes.len() || morph_end > self.morphemes.len() {
147            return None;
148        }
149        let char_start = self.morphemes[morph_start].start;
150        let char_end = self.morphemes[morph_end - 1].end;
151        Some((char_start, char_end))
152    }
153}
154
/// Configuration for pro-drop handling.
///
/// NOTE(review): this configuration is stored by `MorphologicalPreprocessor`
/// but is not yet consumed during segmentation — placeholders are never
/// inserted. Confirm intended behavior before depending on these flags.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProdropConfig {
    /// Insert placeholder for null subjects
    pub expand_null_subjects: bool,
    /// Insert placeholder for null objects
    pub expand_null_objects: bool,
    /// Placeholder token for null arguments (default: "[NULL]")
    pub placeholder_token: String,
}
165
166impl Default for ProdropConfig {
167    fn default() -> Self {
168        Self {
169            expand_null_subjects: true,
170            expand_null_objects: false,
171            placeholder_token: "[NULL]".to_string(),
172        }
173    }
174}
175
/// Preprocessor for morphologically complex languages.
///
/// Configure via the builder-style `with_*` methods, then call `segment`.
pub struct MorphologicalPreprocessor {
    /// Active segmentation strategy (default: rule-based on '-')
    strategy: SegmentationStrategy,
    /// Pro-drop configuration; `None` disables expansion
    prodrop_config: Option<ProdropConfig>,
    /// BPE vocabulary (if using BPE strategy)
    bpe_vocab: Option<HashMap<String, usize>>,
    /// Syllable inventory (if using syllable strategy)
    syllable_inventory: Option<Vec<String>>,
}
185
186impl MorphologicalPreprocessor {
187    /// Create a new preprocessor with default settings.
188    pub fn new() -> Self {
189        Self {
190            strategy: SegmentationStrategy::default(),
191            prodrop_config: None,
192            bpe_vocab: None,
193            syllable_inventory: None,
194        }
195    }
196
197    /// Set the segmentation strategy.
198    pub fn with_strategy(mut self, strategy: SegmentationStrategy) -> Self {
199        self.strategy = strategy;
200        self
201    }
202
203    /// Enable pro-drop expansion.
204    pub fn with_prodrop_expansion(mut self, config: ProdropConfig) -> Self {
205        self.prodrop_config = Some(config);
206        self
207    }
208
209    /// Load BPE vocabulary from file.
210    pub fn load_bpe_vocab(&mut self, vocab: HashMap<String, usize>) {
211        self.bpe_vocab = Some(vocab);
212    }
213
214    /// Load syllable inventory for syllabic scripts.
215    pub fn load_syllable_inventory(&mut self, inventory: Vec<String>) {
216        self.syllable_inventory = Some(inventory);
217    }
218
219    /// Segment text into morphemes.
220    pub fn segment(&self, text: &str) -> Result<SegmentationResult> {
221        let morphemes = match &self.strategy {
222            SegmentationStrategy::BPE { vocab_size: _ } => self.segment_bpe(text)?,
223            SegmentationStrategy::Character => self.segment_character(text),
224            SegmentationStrategy::Syllable => self.segment_syllable(text)?,
225            SegmentationStrategy::RuleBased { boundary_chars } => {
226                self.segment_rule_based(text, boundary_chars)
227            }
228            SegmentationStrategy::External { model_path: _ } => {
229                // External analyzers would be called here
230                return Err(Error::FeatureNotAvailable(
231                    "External morphological analyzer not yet implemented".to_string(),
232                ));
233            }
234        };
235
236        let span_map: Vec<(usize, usize)> = morphemes.iter().map(|m| (m.start, m.end)).collect();
237
238        Ok(SegmentationResult {
239            original: text.to_string(),
240            morphemes,
241            has_prodrop_placeholders: false,
242            span_map,
243        })
244    }
245
246    /// Character-level segmentation (baseline).
247    fn segment_character(&self, text: &str) -> Vec<Morpheme> {
248        text.char_indices()
249            .map(|(i, c)| Morpheme {
250                text: c.to_string(),
251                start: i,
252                end: i + c.len_utf8(),
253                morph_type: Some(MorphemeType::Unknown),
254                gloss: None,
255            })
256            .collect()
257    }
258
259    /// Rule-based segmentation using boundary characters.
260    fn segment_rule_based(&self, text: &str, boundary_chars: &[char]) -> Vec<Morpheme> {
261        let mut morphemes = Vec::new();
262        let mut current_start = 0;
263        let mut current_text = String::new();
264
265        for (i, c) in text.char_indices() {
266            if boundary_chars.contains(&c) {
267                // Save current morpheme if non-empty
268                if !current_text.is_empty() {
269                    let span = TextSpan::from_bytes(text, current_start, i);
270                    morphemes.push(Morpheme {
271                        text: current_text.clone(),
272                        start: span.char_start,
273                        end: span.char_end,
274                        morph_type: Some(MorphemeType::Unknown),
275                        gloss: None,
276                    });
277                    current_text.clear();
278                }
279                current_start = i + c.len_utf8();
280            } else {
281                if current_text.is_empty() {
282                    current_start = i;
283                }
284                current_text.push(c);
285            }
286        }
287
288        // Don't forget the last morpheme
289        if !current_text.is_empty() {
290            let span = TextSpan::from_bytes(text, current_start, text.len());
291            morphemes.push(Morpheme {
292                text: current_text,
293                start: span.char_start,
294                end: span.char_end,
295                morph_type: Some(MorphemeType::Unknown),
296                gloss: None,
297            });
298        }
299
300        morphemes
301    }
302
303    /// Syllable-based segmentation (for Cherokee, etc.).
304    fn segment_syllable(&self, text: &str) -> Result<Vec<Morpheme>> {
305        let inventory = self
306            .syllable_inventory
307            .as_ref()
308            .ok_or_else(|| Error::InvalidInput("Syllable inventory not loaded".to_string()))?;
309
310        let mut morphemes = Vec::new();
311        let mut pos = 0; // byte offset
312
313        // Greedy matching from syllable inventory
314        while pos < text.len() {
315            let mut matched = false;
316            let remaining = &text[pos..];
317
318            // Try to match longest syllable first
319            for syllable in inventory.iter().rev() {
320                // Assumes sorted by length
321                if remaining.starts_with(syllable) {
322                    let span = TextSpan::from_bytes(text, pos, pos + syllable.len());
323                    morphemes.push(Morpheme {
324                        text: syllable.clone(),
325                        start: span.char_start,
326                        end: span.char_end,
327                        morph_type: Some(MorphemeType::Unknown),
328                        gloss: None,
329                    });
330                    pos += syllable.len();
331                    matched = true;
332                    break;
333                }
334            }
335
336            // Fallback to single character if no syllable matches
337            if !matched {
338                let c = text[pos..]
339                    .chars()
340                    .next()
341                    .expect("pos should be within text bounds");
342                let span = TextSpan::from_bytes(text, pos, pos + c.len_utf8());
343                morphemes.push(Morpheme {
344                    text: c.to_string(),
345                    start: span.char_start,
346                    end: span.char_end,
347                    morph_type: Some(MorphemeType::Unknown),
348                    gloss: None,
349                });
350                pos += c.len_utf8();
351            }
352        }
353
354        Ok(morphemes)
355    }
356
357    /// BPE-based segmentation.
358    fn segment_bpe(&self, text: &str) -> Result<Vec<Morpheme>> {
359        let _vocab = self
360            .bpe_vocab
361            .as_ref()
362            .ok_or_else(|| Error::InvalidInput("BPE vocabulary not loaded".to_string()))?;
363
364        // Simplified BPE: character-level with merge rules
365        // Real implementation would use proper BPE algorithm
366        Ok(self.segment_character(text))
367    }
368}
369
370impl Default for MorphologicalPreprocessor {
371    fn default() -> Self {
372        Self::new()
373    }
374}
375
/// Cherokee syllabary inventory (85 syllables, U+13A0..=U+13F4).
///
/// Each entry is a single syllabary character (one `char`), so all entries
/// have the same length; the list is in code-point order. (The previous doc
/// claimed the list was sorted longest-first, which was inaccurate — no
/// sorting happens and none is needed for equal-length entries.)
pub fn cherokee_syllable_inventory() -> Vec<String> {
    (0x13A0..=0x13F4)
        .filter_map(char::from_u32)
        .map(String::from)
        .collect()
}
387
/// Common Quechua morpheme boundaries.
///
/// `'-'` separates morphemes; `'='` marks clitic boundaries.
pub fn quechua_boundary_chars() -> Vec<char> {
    let mut boundaries = Vec::with_capacity(2);
    boundaries.push('-'); // morpheme boundary
    boundaries.push('='); // clitic boundary
    boundaries
}
392
/// Common Navajo prefix templates.
///
/// Navajo verbs have a complex template of prefix positions.
/// This returns common prefix morphemes.
pub fn navajo_prefix_inventory() -> Vec<String> {
    // Object markers (1sg, 2sg, 3rd person), subject markers (1sg, 2sg),
    // and aspect markers (perfective, iterative), in that order.
    const PREFIXES: [&str; 7] = ["shi-", "ni-", "bi-", "-ish", "-í", "yi-", "na-"];
    PREFIXES.iter().map(|p| p.to_string()).collect()
}
411
/// Trait for morphological analysis.
///
/// Implement this trait to integrate external morphological analyzers
/// (e.g., FST-based analyzers like HFST, Foma, or language-specific tools).
/// `Send + Sync` is required so analyzers can be shared across threads.
pub trait MorphologicalAnalyzer: Send + Sync {
    /// Analyze a word and return its morphemes.
    ///
    /// # Errors
    ///
    /// Implementations return an error when the word cannot be analyzed.
    fn analyze(&self, word: &str) -> Result<Vec<Morpheme>>;

    /// Get the language code this analyzer supports.
    fn language_code(&self) -> &str;

    /// Whether this analyzer supports glossing (defaults to `false`).
    fn supports_glossing(&self) -> bool {
        false
    }
}
428
#[cfg(test)]
mod tests {
    use super::*;

    // Offsets reported by rule-based segmentation must be character offsets,
    // not byte offsets, even for multi-byte UTF-8 input.
    #[test]
    fn test_rule_based_segmentation_offsets_are_character_offsets_on_unicode() {
        let preprocessor =
            MorphologicalPreprocessor::new().with_strategy(SegmentationStrategy::RuleBased {
                boundary_chars: vec!['-'],
            });

        // ü is multi-byte; offsets must still be character offsets.
        let text = "über-alles";
        let result = preprocessor.segment(text).expect("segment");
        assert_eq!(result.morphemes.len(), 2);

        assert_eq!(result.morphemes[0].text, "über");
        assert_eq!(result.morphemes[0].start, 0);
        assert_eq!(result.morphemes[0].end, 4);

        assert_eq!(result.morphemes[1].text, "alles");
        assert_eq!(result.morphemes[1].start, 5);
        assert_eq!(result.morphemes[1].end, 10);
    }

    // Basic hyphen splitting on a four-morpheme Quechua word.
    #[test]
    fn test_rule_based_segmentation() {
        let preprocessor =
            MorphologicalPreprocessor::new().with_strategy(SegmentationStrategy::RuleBased {
                boundary_chars: vec!['-'],
            });

        let result = preprocessor
            .segment("wasi-kuna-y-ki")
            .expect("valid Quechua word should segment");
        assert_eq!(result.morphemes.len(), 4);
        assert_eq!(result.morphemes[0].text, "wasi");
        assert_eq!(result.morphemes[1].text, "kuna");
        assert_eq!(result.morphemes[2].text, "y");
        assert_eq!(result.morphemes[3].text, "ki");
    }

    // Character strategy: one morpheme per char.
    #[test]
    fn test_character_segmentation() {
        let preprocessor =
            MorphologicalPreprocessor::new().with_strategy(SegmentationStrategy::Character);

        let result = preprocessor.segment("hello").unwrap();
        assert_eq!(result.morphemes.len(), 5);
    }

    // morpheme_to_char_span maps a half-open morpheme range back to char offsets.
    #[test]
    fn test_span_mapping() {
        let preprocessor =
            MorphologicalPreprocessor::new().with_strategy(SegmentationStrategy::RuleBased {
                boundary_chars: vec!['-'],
            });

        let result = preprocessor
            .segment("wasi-kuna")
            .expect("Quechua compound should segment");

        // Map morphemes 0-2 (both morphemes) back to character span
        let span = result
            .morpheme_to_char_span(0, 2)
            .expect("valid morpheme indices should map to span");
        assert_eq!(span, (0, 9)); // "wasi-kuna".len() == 9
    }

    #[test]
    fn test_cherokee_inventory() {
        let inventory = cherokee_syllable_inventory();
        assert!(!inventory.is_empty());
        // Cherokee syllabary has 85+ characters
        assert!(inventory.len() >= 85);
    }

    // Empty input must produce an empty (but valid) result, not an error.
    #[test]
    fn test_empty_string_handling() {
        let preprocessor =
            MorphologicalPreprocessor::new().with_strategy(SegmentationStrategy::Character);
        let result = preprocessor.segment("").unwrap();
        assert!(result.morphemes.is_empty());
        assert_eq!(result.original, "");
    }

    #[test]
    fn test_unicode_handling() {
        let preprocessor =
            MorphologicalPreprocessor::new().with_strategy(SegmentationStrategy::Character);

        // Cherokee syllabary
        let result = preprocessor
            .segment("ᏣᎳᎩ")
            .expect("Cherokee word should segment");
        assert_eq!(result.morphemes.len(), 3);

        // Nahuatl with diacritics
        let result = preprocessor.segment("Nāhuatl").unwrap();
        assert_eq!(result.morphemes.len(), 7);
    }

    // Boundary characters are consumed; boundary-only input yields nothing.
    #[test]
    fn test_rule_based_boundary_only() {
        let preprocessor =
            MorphologicalPreprocessor::new().with_strategy(SegmentationStrategy::RuleBased {
                boundary_chars: vec!['-'],
            });

        // Input with only boundary chars
        let result = preprocessor
            .segment("---")
            .expect("punctuation should segment");
        assert!(result.morphemes.is_empty());
    }

    // No boundaries at all: whole input is a single morpheme.
    #[test]
    fn test_rule_based_no_boundaries() {
        let preprocessor =
            MorphologicalPreprocessor::new().with_strategy(SegmentationStrategy::RuleBased {
                boundary_chars: vec!['-'],
            });

        // Input with no boundary chars
        let result = preprocessor.segment("word").unwrap();
        assert_eq!(result.morphemes.len(), 1);
        assert_eq!(result.morphemes[0].text, "word");
    }

    #[test]
    fn test_quechua_segmentation() {
        let preprocessor =
            MorphologicalPreprocessor::new().with_strategy(SegmentationStrategy::RuleBased {
                boundary_chars: quechua_boundary_chars(),
            });

        // Quechua word with hyphens
        let result = preprocessor
            .segment("wasi-kuna-y-ki")
            .expect("valid Quechua word should segment");
        assert_eq!(result.morphemes.len(), 4);

        // Verify span mapping works
        assert_eq!(
            result
                .morpheme_to_char_span(0, 1)
                .expect("valid morpheme indices should map to span"),
            (0, 4)
        ); // "wasi"
    }

    #[test]
    fn test_navajo_inventory() {
        let inventory = navajo_prefix_inventory();
        assert!(!inventory.is_empty());
        // Should have basic Navajo prefixes (note: stored with hyphens)
        assert!(inventory
            .iter()
            .any(|p| p.contains("na") || p.contains("ni") || p.contains("bi")));
    }
}