use std::fmt::Debug;
use std::hash::Hash;
use serde::{Deserialize, Serialize};
use crate::aggregable::{Aggregable, AggregableFields, FieldDescriptor, FieldKind};
use crate::domain::ExtractedFeature;
use crate::traits::{LinguisticDefinition, MorphologyInfo};
/// Inventory entry describing a single morpheme, built entirely from `'static` data.
///
/// `F` is the grammatical-function type and `P` the part-of-speech tag type;
/// `Agglutinative::morpheme_inventory` instantiates them with the language's
/// `GrammaticalFunction` and `PosTag` types respectively.
pub struct MorphemeDefinition<F: 'static, P: 'static> {
/// Form used as the lookup key when extracted morphemes are validated.
pub base_form: &'static str,
/// Functions an extracted morpheme with this base form is allowed to carry.
pub functions: &'static [F],
/// NOTE(review): presumably the part-of-speech tags this morpheme can attach
/// to; the field is not read anywhere in this file — confirm against callers.
pub applies_to: &'static [P],
}
// One morpheme extracted from a word, tagged with a function of type `F`.
//
// Plain `//` comments are used on purpose: `///` doc comments would be picked
// up by the `schemars::JsonSchema` derive and emitted as schema descriptions,
// changing the generated schema.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
// Only `F` needs to implement `JsonSchema` for the derived impl.
#[schemars(bound = "F: schemars::JsonSchema")]
pub struct ExtractedMorpheme<F> {
// NOTE(review): presumably the morpheme as it literally appears in the word;
// not read in this file — confirm.
pub surface: String,
// Matched against `MorphemeDefinition::base_form` during validation and
// reported as the "base_form" field when aggregating.
pub base_form: String,
// Function assigned to this morpheme; validation may overwrite it when the
// matching definition allows exactly one function.
pub function: F,
}
// A word together with the morphemes extracted from it.
//
// Plain `//` comments are used on purpose: `///` doc comments would be picked
// up by the `schemars::JsonSchema` derive and emitted as schema descriptions,
// changing the generated schema.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
// Only `F` needs to implement `JsonSchema` for the derived impl.
#[schemars(bound = "F: schemars::JsonSchema")]
pub struct WordSegmentation<F> {
// The word being segmented; quoted in validation error messages.
pub word: String,
// Morphemes extracted from `word`.
pub morphemes: Vec<ExtractedMorpheme<F>>,
}
/// Aggregation support for extracted morphemes: every morpheme lands in the
/// single `"morpheme"` group and contributes one observation row made of its
/// `base_form` followed by the fields of its function value.
impl<F: AggregableFields> Aggregable for ExtractedMorpheme<F> {
    /// All morphemes share one constant group key.
    fn group_key(&self) -> String {
        String::from("morpheme")
    }

    /// Describes the open-valued `base_form` field first, then whatever
    /// fields `F` declares.
    fn instance_descriptors(&self) -> Vec<FieldDescriptor> {
        let base_form = FieldDescriptor {
            name: "base_form".into(),
            kind: FieldKind::Open,
        };

        let mut descriptors = Vec::new();
        descriptors.push(base_form);
        descriptors.extend(F::descriptors());
        descriptors
    }

    /// Produces exactly one observation row: the `base_form` pair followed by
    /// the function's field values, in that order.
    fn observations(&self) -> Vec<Vec<(String, String)>> {
        let base_form = ("base_form".to_owned(), self.base_form.clone());
        let row: Vec<(String, String)> = std::iter::once(base_form)
            .chain(self.function.field_values())
            .collect();
        vec![row]
    }
}
/// An extracted feature aggregates exactly like its `morphology` value:
/// every method forwards to it unchanged.
impl<M: Aggregable> Aggregable for ExtractedFeature<M> {
    /// Group key of the wrapped morphology.
    fn group_key(&self) -> String {
        let inner = &self.morphology;
        inner.group_key()
    }

    /// Field descriptors of the wrapped morphology.
    fn instance_descriptors(&self) -> Vec<FieldDescriptor> {
        let inner = &self.morphology;
        inner.instance_descriptors()
    }

    /// Observation rows of the wrapped morphology.
    fn observations(&self) -> Vec<Vec<(String, String)>> {
        let inner = &self.morphology;
        inner.observations()
    }
}
pub trait Agglutinative: LinguisticDefinition
where
<Self::Morphology as MorphologyInfo>::PosTag:
Debug + Clone + Copy + PartialEq + Eq + Hash + 'static,
Self::GrammaticalFunction: Debug
+ Clone
+ PartialEq
+ Serialize
+ for<'de> Deserialize<'de>
+ schemars::JsonSchema
+ Send
+ Sync
+ 'static,
{
fn morpheme_inventory() -> &'static [MorphemeDefinition<
Self::GrammaticalFunction,
<Self::Morphology as MorphologyInfo>::PosTag,
>];
fn morpheme_directives(&self) -> String;
fn validate_and_enrich(
&self,
segmentation: &mut Option<Vec<WordSegmentation<Self::GrammaticalFunction>>>,
) -> Result<(), String> {
let Some(segs) = segmentation.as_mut() else {
return Ok(());
};
let mut errors = Vec::new();
let inventory = Self::morpheme_inventory();
for seg in segs.iter_mut() {
let word = &seg.word;
for morpheme in &mut seg.morphemes {
let definition = inventory.iter().find(|d| d.base_form == morpheme.base_form);
let Some(def) = definition else {
errors.push(format!(
"Unknown morpheme base_form '{}' for word '{}'. Use only base_forms from the inventory.",
morpheme.base_form, word
));
continue;
};
if !def.functions.contains(&morpheme.function) {
if def.functions.len() == 1 {
morpheme.function = def.functions[0].clone();
} else {
errors.push(format!(
"Invalid function {:?} for morpheme '{}' in word '{}'. Valid functions: {:?}",
morpheme.function, morpheme.base_form, word, def.functions
));
}
}
}
}
if errors.is_empty() {
Ok(())
} else {
Err(errors.join("\n"))
}
}
}