//! panini-lang-core 0.3.0
//!
//! Core traits and types for the Panini linguistic feature extraction framework.
use std::fmt::Debug;
use std::hash::Hash;

use crate::aggregable::FieldDescriptor;
use serde::{Deserialize, Serialize};

pub use crate::morphology_enums::*;

/// Shared accessor interface implemented by language-specific morphology enums.
///
/// Generic code relies on it to read the lemma and the `PoS` information of a value
/// without knowing which concrete morphology variant it is looking at.
pub trait MorphologyInfo {
    /// Typed POS tag, meant for the `applies_to` fields of morpheme inventories.
    ///
    /// `#[derive(MorphologyInfo)]` generates this type automatically as `<Name>PosTag`.
    type PosTag: Copy + Clone + Eq + PartialEq + Hash + Debug;

    /// Returns the lemma, i.e. the dictionary form of the word.
    fn lemma(&self) -> &str;

    /// Returns the typed POS tag (a variant of the auto-generated `<Name>PosTag` enum).
    fn pos_tag(&self) -> Self::PosTag;

    /// Returns the part-of-speech label as text (e.g. "Noun", "Verb").
    fn pos_label(&self) -> &'static str;
}

/// Static description of a single morphology group (for example `noun` or `verb`).
#[derive(Clone, Debug)]
pub struct MorphologyGroupSchema {
    /// Key of the group.
    pub key: String,
    /// Label of the group.
    pub label: String,
    /// Field descriptors for the dimensions of the group.
    pub dimensions: Vec<FieldDescriptor>,
}

/// Exposes compile-time morphology descriptors without requiring runtime samples.
pub trait MorphologyCatalog {
    /// Returns the morphology group schemas as a list of [`MorphologyGroupSchema`] values.
    ///
    /// This is an associated function (it takes no `self`), so it is called on the
    /// implementing type itself rather than on a value.
    fn group_descriptors() -> Vec<MorphologyGroupSchema>;
}

/// Static description of a single grammatical-function variant (for example `case`
/// or `agreement`).
#[derive(Clone, Debug)]
pub struct FunctionVariantSchema {
    /// Key of the variant.
    pub key: String,
    /// Label of the variant.
    pub label: String,
    /// Field descriptors for the dimensions of the variant.
    pub dimensions: Vec<FieldDescriptor>,
}

/// Exposes compile-time grammatical-function descriptors.
pub trait GrammaticalFunctionCatalog {
    /// Returns the grammatical-function schemas as a list of [`FunctionVariantSchema`] values.
    ///
    /// This is an associated function (it takes no `self`), so it is called on the
    /// implementing type itself rather than on a value.
    fn function_descriptors() -> Vec<FunctionVariantSchema>;
}

/// Catalog for the unit type `()`: it declares no grammatical-function variants.
impl GrammaticalFunctionCatalog for () {
    fn function_descriptors() -> Vec<FunctionVariantSchema> {
        Vec::new()
    }
}

/// Re-export `isolang::Language` as `IsoLang` so downstream crates don't need `isolang` directly.
pub use isolang::Language as IsoLang;

/// ISO 15924 script code, such as "Latn", "Hira" or "Kana".
///
/// A thin, copyable wrapper over the `&'static str` holding the code; call `.resolve()`
/// to obtain the full ISO data.
#[derive(Copy, Clone)]
pub struct Script(&'static str);

impl Script {
    pub const LATN: Self = Self("Latn");
    pub const CYRL: Self = Self("Cyrl");
    pub const HIRA: Self = Self("Hira");
    pub const KANA: Self = Self("Kana");
    pub const HANI: Self = Self("Hani");
    pub const ARAB: Self = Self("Arab");
    pub const HANG: Self = Self("Hang");

    /// Returns the 4-character ISO 15924 code.
    #[must_use]
    pub const fn code(&self) -> &'static str {
        self.0
    }

    /// Constructs a `Script` from any valid ISO 15924 code string.
    /// Returns `None` if the code is not in the standard.
    #[must_use]
    pub fn new(code: &str) -> Option<Self> {
        let entry = iso15924::ScriptCode::by_code(code)?;
        Some(Self(entry.code.as_ref()))
    }

    /// Looks up the full ISO 15924 data for this script.
    ///
    /// # Panics
    /// Panics if the internal script code is invalid (should never happen if constructed safely).
    #[must_use]
    pub fn resolve(&self) -> &'static iso15924::ScriptCode<'static> {
        iso15924::ScriptCode::by_code(self.0)
            .unwrap_or_else(|| panic!("Invalid ISO 15924 script code: {}", self.0))
    }
}

impl Debug for Script {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Script(\"{}\")", self.0)
    }
}

/// Displays the script as its bare ISO 15924 code (e.g. `Latn`).
///
/// The code is emitted through [`std::fmt::Formatter::pad`], so width, fill, alignment
/// and precision given in the format string (e.g. `{:<6}`) are honoured. With a plain
/// `{}` the output is exactly the code.
impl std::fmt::Display for Script {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `write_str` would bypass the formatter's flags entirely; `pad` applies them.
        f.pad(self.0)
    }
}

impl PartialEq for Script {
    fn eq(&self, other: &Self) -> bool {
        self.0 == other.0
    }
}

// Marker impl: `Script` equality is plain string equality on the wrapped code, which is
// reflexive, symmetric and transitive, so the type can promise full `Eq`.
impl Eq for Script {}

/// Hashes exactly like the wrapped code string, in line with the `PartialEq` impl.
impl Hash for Script {
    fn hash<H>(&self, state: &mut H)
    where
        H: std::hash::Hasher,
    {
        Hash::hash(self.0, state);
    }
}

/// Serializes the script as a plain string holding its code.
impl Serialize for Script {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let Self(code) = self;
        serializer.serialize_str(code)
    }
}

impl<'de> Deserialize<'de> for Script {
    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
        let code = String::deserialize(deserializer)?;
        Self::new(&code).ok_or_else(|| {
            serde::de::Error::custom(format!("Unknown ISO 15924 script code: {code}"))
        })
    }
}

/// Typological traits of a language that affect its behavior or the card models
/// available for it.
#[derive(Clone, Debug, Eq, PartialEq, Hash, Deserialize, Serialize)]
pub enum TypologicalFeature {
    /// Verbs are conjugated (e.g. Polish, French, Spanish).
    Conjugation,
    /// Words are declined (e.g. Polish, Arabic, Turkish...).
    Declension,
    /// The language is agglutinative (e.g. Turkish, Finnish, Korean).
    Agglutination,
}

/// Describes the linguistic properties of a language for morphological feature extraction.
///
/// This is the central trait of Panini: it gathers everything required to extract
/// morphological features from text, with no dependency on any particular application.
pub trait LinguisticDefinition {
    /// Language-specific morphology enum; every variant stands for a `PoS` category
    /// together with its morphological fields (lemma, case, gender, aspect, etc.).
    ///
    /// The `Aggregable + DeserializeOwned` requirement lets `MorphologyAnalysis` implement
    /// `Aggregating<L>` for every `L: LinguisticDefinition` without conditional impls.
    /// All current language morphology enums meet it through `#[derive(MorphologyInfo)]`.
    type Morphology: MorphologyInfo
        + crate::aggregable::Aggregable
        + Serialize
        + for<'de> Deserialize<'de>
        + schemars::JsonSchema
        + Clone
        + Debug
        + Send
        + Sync;

    /// Grammatical function type used for morpheme segmentation.
    ///
    /// Non-agglutinative languages set this to `()`; agglutinative languages set it to
    /// their wrapper enum (e.g., `TurkishGrammaticalFunction`).
    ///
    /// The `AggregableFields + DeserializeOwned` requirement lets `MorphemeSegmentation`
    /// implement `Aggregating<L>` for every `L: LinguisticDefinition` without conditional
    /// impls.
    type GrammaticalFunction: crate::aggregable::AggregableFields
        + Serialize
        + for<'de> Deserialize<'de>
        + schemars::JsonSchema
        + Clone
        + PartialEq
        + Debug
        + Send
        + Sync;

    /// ISO 639-3 language, expressed as a typed enum variant so that invalid codes
    /// cannot be written at all.
    const ISO_LANG: IsoLang;

    /// Returns the three-letter ISO 639-3 code string (e.g. `"pol"`, `"tur"`, `"fra"`),
    /// taken from `ISO_LANG`.
    fn iso_code(&self) -> &'static str {
        Self::ISO_LANG.to_639_3()
    }

    /// Returns the English name of the language, derived from `ISO_LANG`.
    fn name(&self) -> &str {
        Self::ISO_LANG.to_name()
    }

    /// Returns the scripts this language supports.
    fn supported_scripts(&self) -> &[Script];

    /// Returns the script used by default for this language.
    fn default_script(&self) -> Script;

    /// Returns the language-specific extraction directives for the feature extractor.
    fn extraction_directives(&self) -> &str;

    /// Returns the typological features specific to this language.
    ///
    /// The default implementation reports none (an empty slice).
    fn typological_features(&self) -> &[TypologicalFeature] {
        &[]
    }

    /// Returns extra directives to append to the extraction prompt.
    ///
    /// The default implementation has nothing to add and returns `None`.
    fn extra_extraction_directives(&self) -> Option<String> {
        Option::None
    }

    /// Post-processes the morpheme segmentation returned by the LLM.
    ///
    /// The default implementation leaves the segmentation untouched and succeeds.
    ///
    /// # Errors
    /// Returns a string describing any validation errors encountered during post-processing.
    fn post_process_extraction(
        &self,
        _segmentation: &mut Option<
            Vec<crate::morpheme::WordSegmentation<Self::GrammaticalFunction>>,
        >,
    ) -> Result<(), String> {
        Result::Ok(())
    }
}