panini-lang-core 0.3.0

Core traits and types for the Panini linguistic feature extraction framework
Documentation
use std::fmt::Debug;
use std::hash::Hash;

use serde::{Deserialize, Serialize};

use crate::aggregable::{Aggregable, AggregableFields, FieldDescriptor, FieldKind};
use crate::domain::ExtractedFeature;
use crate::traits::{LinguisticDefinition, MorphologyInfo};

// ─── Morpheme types ───────────────────────────────────────────────────────────

/// Static description of one grammatical morpheme in a language's inventory.
///
/// Every field is a `'static` borrow, so a value of this type owns no data of
/// its own; it only points at the inventory tables it was built from.
///
/// # Type parameters
/// * `F` — the language's `GrammaticalFunction` wrapper enum.
/// * `P` — the language's `PosTag` type (generated by `#[derive(MorphologyInfo)]`).
///
/// `Debug` is derived so that inventory entries can be logged or shown in
/// assertion failures, like the other public types in this module. The impl is
/// available whenever `F` and `P` are themselves `Debug`.
#[derive(Debug)]
pub struct MorphemeDefinition<F: 'static, P: 'static> {
    /// Archiphonemic notation identifying the morpheme (e.g., `"DA"`, `"mA"`, `"(y)AcAk"`).
    pub base_form: &'static str,
    /// All grammatical functions this morpheme can serve.
    pub functions: &'static [F],
    /// POS categories this morpheme attaches to. Same type as `MorphologyInfo::PosTag`.
    pub applies_to: &'static [P],
}

/// A single morpheme instance extracted by the LLM from a word.
///
/// `F` is the type describing the morpheme's grammatical function. The derived
/// `JsonSchema` impl is only available when `F: schemars::JsonSchema`, as
/// spelled out by the `schemars(bound = ...)` attribute.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
#[schemars(bound = "F: schemars::JsonSchema")]
pub struct ExtractedMorpheme<F> {
    /// The allomorph as it actually appears in the word (e.g., `"me"` for the morpheme `"mA"`).
    pub surface: String,
    /// Archiphonemic base form identifying which morpheme this is.
    ///
    /// `Agglutinative::validate_and_enrich` compares this string against the
    /// `base_form` of the entries in the language's morpheme inventory.
    pub base_form: String,
    /// The grammatical function this morpheme serves in context.
    ///
    /// `Agglutinative::validate_and_enrich` may overwrite this value when it is
    /// not listed for the morpheme and the inventory entry allows exactly one
    /// function.
    pub function: F,
}

/// All morphemes extracted from a single word.
///
/// `F` is the grammatical-function type carried by each [`ExtractedMorpheme`].
/// The derived `JsonSchema` impl is only available when
/// `F: schemars::JsonSchema`, as spelled out by the `schemars(bound = ...)`
/// attribute.
#[derive(Debug, Clone, Serialize, Deserialize, schemars::JsonSchema)]
#[schemars(bound = "F: schemars::JsonSchema")]
pub struct WordSegmentation<F> {
    /// Surface form of the whole word.
    pub word: String,
    /// Ordered list of extracted morphemes (excluding the root/stem, which is in `lemma`).
    ///
    /// NOTE(review): `lemma` is not a field of this struct — presumably it
    /// lives on the enclosing extracted feature; confirm against the caller.
    pub morphemes: Vec<ExtractedMorpheme<F>>,
}

// ─── Aggregable impls ─────────────────────────────────────────────────────────

/// `Aggregable` for a single extracted morpheme.
///
/// Every `ExtractedMorpheme` is its own aggregable unit in the `"morpheme"`
/// group and reports exactly one observation row. According to the original
/// design note, the typed shim on `AggregationSink` applies
/// `total_increment = 1` per value, so `GroupResult.total` for the
/// `"morpheme"` group counts individual morphemes rather than segmented words
/// ("Option A" semantics).
///
/// Reported fields: `base_form` (open) followed by whatever
/// `F: AggregableFields` contributes.
impl<F> Aggregable for ExtractedMorpheme<F>
where
    F: AggregableFields,
{
    /// All morphemes share the fixed group key `"morpheme"`.
    fn group_key(&self) -> String {
        String::from("morpheme")
    }

    /// The `base_form` descriptor first, then the descriptors declared by `F`.
    fn instance_descriptors(&self) -> Vec<FieldDescriptor> {
        let base_form_field = FieldDescriptor {
            name: "base_form".into(),
            kind: FieldKind::Open,
        };

        let mut descriptors = vec![base_form_field];
        descriptors.extend(F::descriptors());
        descriptors
    }

    /// A single row: the `base_form` pair first, then the values of `function`.
    fn observations(&self) -> Vec<Vec<(String, String)>> {
        let base_form_pair = (String::from("base_form"), self.base_form.clone());

        let mut row = vec![base_form_pair];
        row.extend(self.function.field_values());
        vec![row]
    }
}

/// `Aggregable` for `ExtractedFeature<M>`: every method forwards unchanged to
/// the feature's inner `morphology` value.
impl<M> Aggregable for ExtractedFeature<M>
where
    M: Aggregable,
{
    /// Group key of the wrapped morphology.
    fn group_key(&self) -> String {
        let inner = &self.morphology;
        inner.group_key()
    }

    /// Field descriptors of the wrapped morphology.
    fn instance_descriptors(&self) -> Vec<FieldDescriptor> {
        let inner = &self.morphology;
        inner.instance_descriptors()
    }

    /// Observation rows of the wrapped morphology.
    fn observations(&self) -> Vec<Vec<(String, String)>> {
        let inner = &self.morphology;
        inner.observations()
    }
}

// ─── Agglutinative trait ──────────────────────────────────────────────────────

/// Opt-in trait for agglutinative languages.
///
/// Provides the morpheme-specific methods: static inventory, LLM directives, and
/// typed validation. The `LinguisticDefinition` extension points
/// (`extra_extraction_directives`, `post_process_extraction`) should delegate here.
pub trait Agglutinative: LinguisticDefinition
where
    <Self::Morphology as MorphologyInfo>::PosTag:
        Debug + Clone + Copy + PartialEq + Eq + Hash + 'static,
    Self::GrammaticalFunction: Debug
        + Clone
        + PartialEq
        + Serialize
        + for<'de> Deserialize<'de>
        + schemars::JsonSchema
        + Send
        + Sync
        + 'static,
{
    /// The full static inventory of morphemes for this language.
    fn morpheme_inventory() -> &'static [MorphemeDefinition<
        Self::GrammaticalFunction,
        <Self::Morphology as MorphologyInfo>::PosTag,
    >];

    /// Directives for the LLM explaining how to fill `morpheme_segmentation`.
    fn morpheme_directives(&self) -> String;

    /// Parse and validate morpheme segmentation from the LLM response.
    ///
    /// # Errors
    /// Returns a string containing all validation errors if any morphemes
    /// have an unknown `base_form` or an invalid `function` according to the inventory.
    fn validate_and_enrich(
        &self,
        segmentation: &mut Option<Vec<WordSegmentation<Self::GrammaticalFunction>>>,
    ) -> Result<(), String> {
        let Some(segs) = segmentation.as_mut() else {
            return Ok(());
        };

        let mut errors = Vec::new();
        let inventory = Self::morpheme_inventory();

        for seg in segs.iter_mut() {
            let word = &seg.word;
            for morpheme in &mut seg.morphemes {
                let definition = inventory.iter().find(|d| d.base_form == morpheme.base_form);

                let Some(def) = definition else {
                    errors.push(format!(
                        "Unknown morpheme base_form '{}' for word '{}'. Use only base_forms from the inventory.",
                        morpheme.base_form, word
                    ));
                    continue;
                };

                if !def.functions.contains(&morpheme.function) {
                    if def.functions.len() == 1 {
                        morpheme.function = def.functions[0].clone();
                    } else {
                        errors.push(format!(
                            "Invalid function {:?} for morpheme '{}' in word '{}'. Valid functions: {:?}",
                            morpheme.function, morpheme.base_form, word, def.functions
                        ));
                    }
                }
            }
        }

        if errors.is_empty() {
            Ok(())
        } else {
            Err(errors.join("\n"))
        }
    }
}