//! `panini-lang-core` 0.3.0
//!
//! Core traits and types for the Panini linguistic feature extraction framework.
/// Describes whether a field is an open set (String, arbitrary values) or a closed set (enum).
///
/// Two `Closed` kinds compare equal when their variant lists hold the same strings in the
/// same order (slice equality), not when they point at the same static.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FieldKind {
    /// Open set — values are arbitrary (e.g. `lemma`, `base_form`).
    Open,
    /// Closed set — all possible values are known statically (e.g. `case`, `person`).
    Closed(&'static [&'static str]),
}

/// Descriptor for a single aggregable field.
///
/// Descriptors are comparable and hashable so that schemas can be asserted on in tests,
/// deduplicated, or used as map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct FieldDescriptor {
    /// Name under which the field's values are reported.
    pub name: String,
    /// Whether the field's values form an open or a closed set.
    pub kind: FieldKind,
}

/// Implemented by unit-variant enums (case, gender, tense…).
///
/// Provides the exhaustive list of variants and the serialized string for `self`.
/// Derive with `#[derive(ClosedValues)]` (generated by `panini-macro`).
pub trait ClosedValues {
    /// Returns the serialized names of every variant of the implementing type.
    ///
    /// This is an associated function: it describes the type, not a particular value.
    fn all_variants() -> &'static [&'static str];
    /// Returns the serialized name of `self`.
    ///
    /// NOTE(review): presumably always one of the entries of `all_variants()`; nothing in
    /// this trait enforces that — confirm against the derive macro.
    fn variant_str(&self) -> &str;
}

/// Implemented by any type that can contribute named (field, value) pairs to the digest.
///
/// Simple enums implementing `ClosedValues` get a blanket impl.
/// Complex types (e.g. `TurkishGrammaticalFunction`) implement this manually.
pub trait AggregableFields {
    /// Returns the schema of the fields this type can contribute, one descriptor per field.
    ///
    /// This is an associated function: the schema is a property of the type, not of a value.
    fn descriptors() -> Vec<FieldDescriptor>;
    /// Returns the `(field name, value)` pairs contributed by `self`.
    ///
    /// NOTE(review): field names are presumably expected to match the names reported by
    /// `descriptors()`; the trait itself does not check this.
    fn field_values(&self) -> Vec<(String, String)>;
}

/// Blanket impl: every [`ClosedValues`] type is automatically [`AggregableFields`], exposing
/// exactly one closed field named "value".
impl<T> AggregableFields for T
where
    T: ClosedValues,
{
    /// Returns a one-element schema: the field "value", closed over all variants of `T`.
    fn descriptors() -> Vec<FieldDescriptor> {
        let name = String::from("value");
        let kind = FieldKind::Closed(<T as ClosedValues>::all_variants());
        vec![FieldDescriptor { name, kind }]
    }

    /// Returns the single pair `("value", <serialized variant of self>)`.
    fn field_values(&self) -> Vec<(String, String)> {
        let name = String::from("value");
        let value = self.variant_str().to_owned();
        vec![(name, value)]
    }
}

/// The unit type contributes nothing: no field schema and no values
/// (used by non-agglutinative languages).
impl AggregableFields for () {
    /// Returns an empty schema.
    fn descriptors() -> Vec<FieldDescriptor> {
        Vec::new()
    }

    /// Returns no `(field, value)` pairs.
    fn field_values(&self) -> Vec<(String, String)> {
        Vec::new()
    }
}

/// Implemented by a top-level aggregation target (Morphology enum, `WordSegmentation`).
///
/// An implementor answers three questions about each instance:
///
/// - `group_key()` — which bucket it belongs to (e.g. "Noun", "Verb", "morpheme").
/// - `instance_descriptors()` — the field schema for that group (may vary per-variant for enums).
/// - `observations()` — one or more sets of (field, value) pairs.
///   A `WordSegmentation` returns one observation per morpheme;
///   a morphology variant returns a single observation (its own fields).
pub trait Aggregable {
    /// Returns the name of the bucket this instance is aggregated into.
    fn group_key(&self) -> String;

    /// Returns the field schema that applies to this instance's group.
    fn instance_descriptors(&self) -> Vec<FieldDescriptor>;

    /// Returns the observations of this instance, each a list of `(field, value)` pairs.
    fn observations(&self) -> Vec<Vec<(String, String)>>;

    /// Borrows `self` into a [`Pivoted`] wrapper whose group key is computed by `f`.
    ///
    /// The closure is stored, not called: it runs each time the wrapper's `group_key()` is
    /// requested.
    fn pivoted<F>(&self, f: F) -> Pivoted<'_, Self, F>
    where
        Self: Sized,
        F: Fn(&Self) -> String,
    {
        let inner = self;
        let key_extractor = f;
        Pivoted {
            inner,
            key_extractor,
        }
    }
}

/// Wrapper for an [`Aggregable`] item that overrides its group key.
///
/// Built by [`Aggregable::pivoted`]; it borrows the wrapped item rather than owning it.
pub struct Pivoted<'a, A: Aggregable, F: Fn(&A) -> String> {
    /// The wrapped item, borrowed for the wrapper's lifetime.
    pub inner: &'a A,
    /// Closure that computes the replacement group key from the wrapped item.
    pub key_extractor: F,
}

/// A pivoted item is itself [`Aggregable`]: only the group key changes; the schema and the
/// observations are forwarded to the wrapped item unchanged.
impl<A, F> Aggregable for Pivoted<'_, A, F>
where
    A: Aggregable,
    F: Fn(&A) -> String,
{
    /// Computes the group key by calling the stored closure on the wrapped item.
    fn group_key(&self) -> String {
        let extract = &self.key_extractor;
        extract(self.inner)
    }

    /// Forwards to the wrapped item's schema.
    fn instance_descriptors(&self) -> Vec<FieldDescriptor> {
        A::instance_descriptors(self.inner)
    }

    /// Forwards to the wrapped item's observations.
    fn observations(&self) -> Vec<Vec<(String, String)>> {
        A::observations(self.inner)
    }
}

pub mod digest;