panini-lang-core 0.3.0

Core traits and types for the Panini linguistic feature extraction framework
Documentation
use std::fmt::Debug;

use serde::de::DeserializeOwned;

use crate::aggregable::digest::{record_aggregable, AggregationSink};
use crate::aggregable::Aggregable;
use crate::component::{AggregationError, Aggregating, AnalysisComponent, ComponentContext};
use crate::domain::ExtractedFeature;
use crate::traits::LinguisticDefinition;

// ─── MorphSection ─────────────────────────────────────────────────────────────

/// Deserializable shape of the `"morphology"` section produced by `MorphologyAnalysis`.
///
/// The composable pipeline nests `target_features` and `context_features` under
/// the `"morphology"` key. Either list may be missing from the JSON; a missing
/// list deserializes as an empty `Vec` (see the `serde(default)` attributes).
///
/// Deserialization only requires `M: DeserializeOwned`, as declared by the
/// explicit `serde(bound)` attribute.
#[derive(serde::Deserialize)]
#[serde(bound(deserialize = "M: serde::de::DeserializeOwned"))]
pub struct MorphSection<M> {
    /// Features extracted for the target words.
    #[serde(default = "Vec::new")]
    pub target_features: Vec<ExtractedFeature<M>>,
    /// Features extracted for the remaining (context) words.
    #[serde(default = "Vec::new")]
    pub context_features: Vec<ExtractedFeature<M>>,
}

// Written by hand rather than derived: `#[derive(Default)]` would add an
// `M: Default` bound, which is unnecessary because an empty `Vec` can be built
// for any element type.
impl<M> Default for MorphSection<M> {
    /// Returns a section with both feature lists empty.
    fn default() -> Self {
        Self {
            target_features: Vec::new(),
            context_features: Vec::new(),
        }
    }
}

// ─── MorphologyAnalysis ───────────────────────────────────────────────────────

/// Pipeline component that extracts morphological features for the target
/// words and for the surrounding context words.
///
/// It is a unit struct and carries no state of its own.
#[derive(Clone, Debug, Default)]
pub struct MorphologyAnalysis;

// Empty impl: nothing is overridden, so this component takes whatever defaults
// `ComponentRequires` provides (presumably "no prerequisites" — confirm against
// the trait definition).
impl<L> crate::component::ComponentRequires<L> for MorphologyAnalysis where L: LinguisticDefinition {}

/// Registers [`MorphologyAnalysis`] as the component responsible for the
/// `"morphology"` section: it supplies the schema fragment, the prompt text and
/// the output rules, and exposes itself for aggregation.
impl<L: LinguisticDefinition> AnalysisComponent<L> for MorphologyAnalysis {
    /// Human-readable name of the component.
    fn name(&self) -> &'static str {
        "Morphology Analysis"
    }

    /// Key identifying this component's section (`"morphology"`).
    fn schema_key(&self) -> &'static str {
        "morphology"
    }

    /// Builds the JSON-schema fragment for the morphology section.
    ///
    /// The fragment is an object with two required properties,
    /// `target_features` and `context_features`, both described by the schema
    /// generated for `Vec<ExtractedFeature<L::Morphology>>`. If that generated
    /// schema carries a `$defs` table, the table is placed once at the top of
    /// the fragment and omitted from the two embedded copies.
    ///
    /// # Panics
    ///
    /// Panics if the generated schema cannot be converted to a
    /// `serde_json::Value`.
    fn schema_fragment(&self, _lang: &L) -> serde_json::Value {
        let feature_schema = schemars::SchemaGenerator::default()
            .into_root_schema_for::<Vec<ExtractedFeature<L::Morphology>>>();
        let mut feature_value = serde_json::to_value(&feature_schema)
            .expect("a generated JSON schema must be representable as a serde_json::Value");

        // Detach the shared definitions *before* embedding the schema, so both
        // property copies are created without `$defs` and the table is moved
        // (not cloned) to the fragment root.
        let shared_defs = feature_value
            .as_object_mut()
            .and_then(|schema| schema.remove("$defs"));

        // NOTE(review): the embedded copies come from a root schema and may
        // still carry root-level keywords other than `$defs` — confirm whether
        // the code composing fragments strips them.
        let mut fragment = serde_json::json!({
            "type": "object",
            "properties": {
                "target_features": feature_value,
                "context_features": feature_value
            },
            "required": ["target_features", "context_features"]
        });

        if let Some(defs) = shared_defs {
            fragment["$defs"] = defs;
        }

        fragment
    }

    /// Returns the prompt text for this component: a fixed extraction
    /// instruction followed by the language's name and its extraction
    /// directives.
    fn prompt_fragment(&self, lang: &L, _ctx: &ComponentContext) -> String {
        format!(
            "Extract morphological features from every word in the sentence, following the JSON schema exactly.\n\
             Language-specific extraction directives for {}:\n{}",
            lang.name(),
            lang.extraction_directives()
        )
    }

    /// Returns the fixed rule block describing how the `morphology` key must
    /// be filled. Always `Some`.
    fn output_instruction(&self) -> Option<&str> {
        Some(concat!(
            "MORPHOLOGY RULES \n",
            "Fill the `morphology` key with exactly two lists:\n",
            "  \"target_features\"  — morphological features for each constituent word of\n",
            "                       the TARGET WORDS supplied in the user message.\n",
            "  \"context_features\" — morphological features for every OTHER word in the\n",
            "                       sentence (for grammatical context).\n\n",
            "1. The `pos` key is an internally-tagged discriminant. Use ONLY the variants\n",
            "   defined in the schema. NEVER invent variants such as \"phrase\", \"punctuation\",\n",
            "   \"clause\", or anything not listed.\n",
            "2. Punctuation tokens (`,` `.` `!` `?` `—` `«» ` etc.) must be OMITTED entirely.\n",
            "3. Every field MUST exactly match the enum variants in the schema (e.g., do not \n",
            "   use \"masculine\" as a value for a \"number\" field; use \"singular\", \"dual\", or \"plural\").\n",
            "4. Optional fields must match the schema's definition (omit if not applicable).\n",
            "5. Lemmas must always be in the dictionary (citation) form of the target language."
        ))
    }

    /// Passes the raw text through `text_processing::normalize_pos_tags` and
    /// returns the result.
    fn pre_process(&self, raw: &str) -> String {
        crate::text_processing::normalize_pos_tags(raw)
    }

    /// This component supports aggregation, so it returns itself as the
    /// `Aggregating` implementation.
    fn as_aggregating(&self) -> Option<&dyn Aggregating<L>> {
        Some(self)
    }
}

impl<L: LinguisticDefinition> Aggregating<L> for MorphologyAnalysis
where
    L::Morphology: Aggregable + DeserializeOwned,
{
    fn aggregate_section(
        &self,
        _lang: &L,
        section: &serde_json::Value,
        sink: &mut dyn AggregationSink,
    ) -> Result<(), AggregationError> {
        let morph: MorphSection<L::Morphology> =
            serde_json::from_value(section.clone()).map_err(|e| AggregationError::Deserialize {
                key: "morphology",
                source: e,
            })?;
        for feature in &morph.target_features {
            record_aggregable(sink, &feature.morphology);
        }
        for feature in &morph.context_features {
            record_aggregable(sink, &feature.morphology);
        }
        Ok(())
    }
}