1use crate::offset::TextSpan;
47use crate::{Error, Result};
48use serde::{Deserialize, Serialize};
49use std::collections::HashMap;
50
/// Selects how `MorphologicalPreprocessor::segment` splits text into morphemes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum SegmentationStrategy {
    /// Byte-pair-encoding segmentation.
    ///
    /// In the code visible in this file the preprocessor requires a BPE
    /// vocabulary to be loaded for this strategy and then falls back to
    /// per-character segmentation.
    BPE {
        /// Vocabulary size; the segmenter in this file does not read it.
        vocab_size: usize,
    },
    /// One morpheme per `char` of the input.
    Character,
    /// Prefix matching against a syllable inventory loaded into the preprocessor.
    Syllable,
    /// Splits on explicit boundary characters; the boundary characters
    /// themselves are not part of any morpheme.
    RuleBased {
        /// Characters treated as morpheme boundaries.
        boundary_chars: Vec<char>,
    },
    /// External analyzer; `segment` currently returns
    /// `Error::FeatureNotAvailable` for this variant.
    External {
        /// Path of the external model; not read by the code in this file.
        model_path: String,
    },
}
74
75impl Default for SegmentationStrategy {
76 fn default() -> Self {
77 SegmentationStrategy::RuleBased {
78 boundary_chars: vec!['-'],
79 }
80 }
81}
82
/// A single segment produced by morphological segmentation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Morpheme {
    /// Surface text of the segment.
    pub text: String,
    /// Start offset of the segment in the original text (inclusive).
    ///
    /// The rule-based and syllable segmenters fill this with a character
    /// offset.
    pub start: usize,
    /// End offset of the segment (exclusive), in the same units as `start`.
    pub end: usize,
    /// Morphological category, if any. The segmenters in this file always
    /// store `Some(MorphemeType::Unknown)`.
    pub morph_type: Option<MorphemeType>,
    /// Optional gloss. The segmenters in this file leave it as `None`.
    pub gloss: Option<String>,
}
97
/// Coarse morphological category of a [`Morpheme`].
///
/// The segmenters in this file only ever assign [`MorphemeType::Unknown`];
/// the other variants are available to callers and external analyzers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum MorphemeType {
    /// A root morpheme.
    Root,
    /// A prefix.
    Prefix,
    /// A suffix.
    Suffix,
    /// An infix.
    Infix,
    /// A circumfix.
    Circumfix,
    /// A clitic.
    Clitic,
    /// Category not determined.
    Unknown,
}
116
/// Output of `MorphologicalPreprocessor::segment`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SegmentationResult {
    /// Copy of the text that was segmented.
    pub original: String,
    /// Morphemes in the order they occur in `original`.
    pub morphemes: Vec<Morpheme>,
    /// Whether pro-drop placeholders were inserted. `segment` in this file
    /// always sets this to `false`.
    pub has_prodrop_placeholders: bool,
    /// `(start, end)` of every morpheme, in the same order as `morphemes`.
    pub span_map: Vec<(usize, usize)>,
}
129
130impl SegmentationResult {
131 pub fn joined(&self, separator: &str) -> String {
133 self.morphemes
134 .iter()
135 .map(|m| m.text.as_str())
136 .collect::<Vec<_>>()
137 .join(separator)
138 }
139
140 pub fn morpheme_to_char_span(
142 &self,
143 morph_start: usize,
144 morph_end: usize,
145 ) -> Option<(usize, usize)> {
146 if morph_start >= self.morphemes.len() || morph_end > self.morphemes.len() {
147 return None;
148 }
149 let char_start = self.morphemes[morph_start].start;
150 let char_end = self.morphemes[morph_end - 1].end;
151 Some((char_start, char_end))
152 }
153}
154
/// Settings for pro-drop (null argument) expansion.
///
/// NOTE(review): the preprocessor in this file stores this configuration but
/// no visible code reads its fields, so the field descriptions below are taken
/// from the field names; confirm against the consumer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProdropConfig {
    /// Presumably enables expansion of null subjects (default `true`).
    pub expand_null_subjects: bool,
    /// Presumably enables expansion of null objects (default `false`).
    pub expand_null_objects: bool,
    /// Token used as the placeholder (default `"[NULL]"`).
    pub placeholder_token: String,
}
165
166impl Default for ProdropConfig {
167 fn default() -> Self {
168 Self {
169 expand_null_subjects: true,
170 expand_null_objects: false,
171 placeholder_token: "[NULL]".to_string(),
172 }
173 }
174}
175
/// Splits text into morphemes according to a configurable
/// [`SegmentationStrategy`].
pub struct MorphologicalPreprocessor {
    /// Strategy applied by `segment`.
    strategy: SegmentationStrategy,
    /// Optional pro-drop settings; stored by `with_prodrop_expansion` and not
    /// read by the code visible in this file.
    prodrop_config: Option<ProdropConfig>,
    /// BPE vocabulary; must be loaded before the `BPE` strategy is used.
    bpe_vocab: Option<HashMap<String, usize>>,
    /// Syllable inventory; must be loaded before the `Syllable` strategy is
    /// used.
    syllable_inventory: Option<Vec<String>>,
}
185
186impl MorphologicalPreprocessor {
187 pub fn new() -> Self {
189 Self {
190 strategy: SegmentationStrategy::default(),
191 prodrop_config: None,
192 bpe_vocab: None,
193 syllable_inventory: None,
194 }
195 }
196
197 pub fn with_strategy(mut self, strategy: SegmentationStrategy) -> Self {
199 self.strategy = strategy;
200 self
201 }
202
203 pub fn with_prodrop_expansion(mut self, config: ProdropConfig) -> Self {
205 self.prodrop_config = Some(config);
206 self
207 }
208
209 pub fn load_bpe_vocab(&mut self, vocab: HashMap<String, usize>) {
211 self.bpe_vocab = Some(vocab);
212 }
213
214 pub fn load_syllable_inventory(&mut self, inventory: Vec<String>) {
216 self.syllable_inventory = Some(inventory);
217 }
218
219 pub fn segment(&self, text: &str) -> Result<SegmentationResult> {
221 let morphemes = match &self.strategy {
222 SegmentationStrategy::BPE { vocab_size: _ } => self.segment_bpe(text)?,
223 SegmentationStrategy::Character => self.segment_character(text),
224 SegmentationStrategy::Syllable => self.segment_syllable(text)?,
225 SegmentationStrategy::RuleBased { boundary_chars } => {
226 self.segment_rule_based(text, boundary_chars)
227 }
228 SegmentationStrategy::External { model_path: _ } => {
229 return Err(Error::FeatureNotAvailable(
231 "External morphological analyzer not yet implemented".to_string(),
232 ));
233 }
234 };
235
236 let span_map: Vec<(usize, usize)> = morphemes.iter().map(|m| (m.start, m.end)).collect();
237
238 Ok(SegmentationResult {
239 original: text.to_string(),
240 morphemes,
241 has_prodrop_placeholders: false,
242 span_map,
243 })
244 }
245
246 fn segment_character(&self, text: &str) -> Vec<Morpheme> {
248 text.char_indices()
249 .map(|(i, c)| Morpheme {
250 text: c.to_string(),
251 start: i,
252 end: i + c.len_utf8(),
253 morph_type: Some(MorphemeType::Unknown),
254 gloss: None,
255 })
256 .collect()
257 }
258
259 fn segment_rule_based(&self, text: &str, boundary_chars: &[char]) -> Vec<Morpheme> {
261 let mut morphemes = Vec::new();
262 let mut current_start = 0;
263 let mut current_text = String::new();
264
265 for (i, c) in text.char_indices() {
266 if boundary_chars.contains(&c) {
267 if !current_text.is_empty() {
269 let span = TextSpan::from_bytes(text, current_start, i);
270 morphemes.push(Morpheme {
271 text: current_text.clone(),
272 start: span.char_start,
273 end: span.char_end,
274 morph_type: Some(MorphemeType::Unknown),
275 gloss: None,
276 });
277 current_text.clear();
278 }
279 current_start = i + c.len_utf8();
280 } else {
281 if current_text.is_empty() {
282 current_start = i;
283 }
284 current_text.push(c);
285 }
286 }
287
288 if !current_text.is_empty() {
290 let span = TextSpan::from_bytes(text, current_start, text.len());
291 morphemes.push(Morpheme {
292 text: current_text,
293 start: span.char_start,
294 end: span.char_end,
295 morph_type: Some(MorphemeType::Unknown),
296 gloss: None,
297 });
298 }
299
300 morphemes
301 }
302
303 fn segment_syllable(&self, text: &str) -> Result<Vec<Morpheme>> {
305 let inventory = self
306 .syllable_inventory
307 .as_ref()
308 .ok_or_else(|| Error::InvalidInput("Syllable inventory not loaded".to_string()))?;
309
310 let mut morphemes = Vec::new();
311 let mut pos = 0; while pos < text.len() {
315 let mut matched = false;
316 let remaining = &text[pos..];
317
318 for syllable in inventory.iter().rev() {
320 if remaining.starts_with(syllable) {
322 let span = TextSpan::from_bytes(text, pos, pos + syllable.len());
323 morphemes.push(Morpheme {
324 text: syllable.clone(),
325 start: span.char_start,
326 end: span.char_end,
327 morph_type: Some(MorphemeType::Unknown),
328 gloss: None,
329 });
330 pos += syllable.len();
331 matched = true;
332 break;
333 }
334 }
335
336 if !matched {
338 let c = text[pos..]
339 .chars()
340 .next()
341 .expect("pos should be within text bounds");
342 let span = TextSpan::from_bytes(text, pos, pos + c.len_utf8());
343 morphemes.push(Morpheme {
344 text: c.to_string(),
345 start: span.char_start,
346 end: span.char_end,
347 morph_type: Some(MorphemeType::Unknown),
348 gloss: None,
349 });
350 pos += c.len_utf8();
351 }
352 }
353
354 Ok(morphemes)
355 }
356
357 fn segment_bpe(&self, text: &str) -> Result<Vec<Morpheme>> {
359 let _vocab = self
360 .bpe_vocab
361 .as_ref()
362 .ok_or_else(|| Error::InvalidInput("BPE vocabulary not loaded".to_string()))?;
363
364 Ok(self.segment_character(text))
367 }
368}
369
370impl Default for MorphologicalPreprocessor {
371 fn default() -> Self {
372 Self::new()
373 }
374}
375
/// Returns a syllable inventory built from the Cherokee code points U+13A0
/// through U+13F4, one single-character string per code point, in code-point
/// order (85 entries).
pub fn cherokee_syllable_inventory() -> Vec<String> {
    ('\u{13A0}'..='\u{13F4}').map(String::from).collect()
}
387
/// Returns the boundary characters `-` and `=`, suitable for
/// `SegmentationStrategy::RuleBased`.
pub fn quechua_boundary_chars() -> Vec<char> {
    ['-', '='].to_vec()
}
392
/// Returns a fixed list of Navajo affix strings.
///
/// A hyphen marks the side on which each affix attaches.
/// NOTE(review): two entries (`-ish`, `-í`) carry a leading hyphen even
/// though the function name says "prefix"; confirm that is intended.
pub fn navajo_prefix_inventory() -> Vec<String> {
    const AFFIXES: [&str; 7] = ["shi-", "ni-", "bi-", "-ish", "-í", "yi-", "na-"];
    AFFIXES.iter().map(|affix| affix.to_string()).collect()
}
411
/// Interface for pluggable morphological analyzers.
///
/// Implementors must be `Send + Sync`.
pub trait MorphologicalAnalyzer: Send + Sync {
    /// Analyzes `word` and returns its morphemes, or an error if the analysis
    /// fails.
    fn analyze(&self, word: &str) -> Result<Vec<Morpheme>>;

    /// Returns the code of the language this analyzer handles.
    ///
    /// NOTE(review): the expected code format (e.g. ISO 639) is not shown in
    /// this file; confirm with implementors.
    fn language_code(&self) -> &str;

    /// Whether the analyzer supports glossing; defaults to `false`.
    fn supports_glossing(&self) -> bool {
        false
    }
}
428
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a rule-based preprocessor that splits on `boundary_chars`.
    fn rule_based(boundary_chars: Vec<char>) -> MorphologicalPreprocessor {
        MorphologicalPreprocessor::new()
            .with_strategy(SegmentationStrategy::RuleBased { boundary_chars })
    }

    /// Rule-based preprocessor that splits on hyphens only.
    fn hyphen_splitter() -> MorphologicalPreprocessor {
        rule_based(vec!['-'])
    }

    /// Preprocessor that emits one morpheme per character.
    fn per_character() -> MorphologicalPreprocessor {
        MorphologicalPreprocessor::new().with_strategy(SegmentationStrategy::Character)
    }

    /// Collects the surface text of every morpheme, in order.
    fn texts(result: &SegmentationResult) -> Vec<&str> {
        result.morphemes.iter().map(|m| m.text.as_str()).collect()
    }

    #[test]
    fn test_rule_based_segmentation_offsets_are_character_offsets_on_unicode() {
        // "ü" is two bytes in UTF-8, so byte and character offsets diverge here.
        let segmented = hyphen_splitter().segment("über-alles").expect("segment");

        let observed: Vec<(&str, usize, usize)> = segmented
            .morphemes
            .iter()
            .map(|m| (m.text.as_str(), m.start, m.end))
            .collect();
        assert_eq!(observed, vec![("über", 0, 4), ("alles", 5, 10)]);
    }

    #[test]
    fn test_rule_based_segmentation() {
        let segmented = hyphen_splitter()
            .segment("wasi-kuna-y-ki")
            .expect("valid Quechua word should segment");

        assert_eq!(texts(&segmented), vec!["wasi", "kuna", "y", "ki"]);
    }

    #[test]
    fn test_character_segmentation() {
        let segmented = per_character().segment("hello").unwrap();
        assert_eq!(segmented.morphemes.len(), 5);
    }

    #[test]
    fn test_span_mapping() {
        let segmented = hyphen_splitter()
            .segment("wasi-kuna")
            .expect("Quechua compound should segment");

        // Both morphemes together cover the whole input, hyphen included.
        let whole = segmented
            .morpheme_to_char_span(0, 2)
            .expect("valid morpheme indices should map to span");
        assert_eq!(whole, (0, 9));
    }

    #[test]
    fn test_cherokee_inventory() {
        let count = cherokee_syllable_inventory().len();
        assert!(count != 0);
        assert!(count >= 85);
    }

    #[test]
    fn test_empty_string_handling() {
        let segmented = per_character().segment("").unwrap();
        assert!(segmented.morphemes.is_empty());
        assert_eq!(segmented.original, "");
    }

    #[test]
    fn test_unicode_handling() {
        let segmenter = per_character();

        let cherokee = segmenter
            .segment("ᏣᎳᎩ")
            .expect("Cherokee word should segment");
        assert_eq!(cherokee.morphemes.len(), 3);

        let nahuatl = segmenter.segment("Nāhuatl").unwrap();
        assert_eq!(nahuatl.morphemes.len(), 7);
    }

    #[test]
    fn test_rule_based_boundary_only() {
        let segmented = hyphen_splitter()
            .segment("---")
            .expect("punctuation should segment");
        assert!(segmented.morphemes.is_empty());
    }

    #[test]
    fn test_rule_based_no_boundaries() {
        let segmented = hyphen_splitter().segment("word").unwrap();
        assert_eq!(segmented.morphemes.len(), 1);
        assert_eq!(segmented.morphemes[0].text, "word");
    }

    #[test]
    fn test_quechua_segmentation() {
        let segmented = rule_based(quechua_boundary_chars())
            .segment("wasi-kuna-y-ki")
            .expect("valid Quechua word should segment");
        assert_eq!(segmented.morphemes.len(), 4);

        // The first morpheme alone spans "wasi".
        let first = segmented
            .morpheme_to_char_span(0, 1)
            .expect("valid morpheme indices should map to span");
        assert_eq!(first, (0, 4));
    }

    #[test]
    fn test_navajo_inventory() {
        let inventory = navajo_prefix_inventory();
        assert!(!inventory.is_empty());

        let has_expected_entry = inventory
            .iter()
            .any(|p| ["na", "ni", "bi"].iter().any(|needle| p.contains(needle)));
        assert!(has_expected_entry);
    }
}