// varna 1.0.0
//
// Varna — multilingual language engine: phoneme inventories, G2P rules, scripts, grammar, and lexicon for 50+ languages
// Documentation
//! Hoosh LLM query interface — structured language queries for AI inference.
//!
//! Defines query and response types for LLM-powered language operations.
//! The consuming application routes these through hoosh's inference gateway;
//! this module defines the data contract.

use std::borrow::Cow;

use serde::{Deserialize, Serialize};

/// A structured language query to be routed through hoosh.
///
/// Each variant describes one kind of request. [`answer_from_data`] can resolve
/// some of them directly from varna's registry; for everything else it returns
/// `None` and the query is expected to go to LLM inference.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[non_exhaustive]
pub enum LanguageQuery {
    /// Identify the language of a text sample (`text` is the sample itself).
    IdentifyLanguage { text: String },
    /// Explain a phonological rule or feature: `topic` is the rule or feature,
    /// `language` is the language it should be explained for.
    ExplainPhonology { language: String, topic: String },
    /// Generate example words containing a specific phoneme.
    ///
    /// When resolved by [`answer_from_data`], the answer only reports whether
    /// the phoneme is in the language's inventory; no example words are produced.
    ExamplesForPhoneme {
        /// Language identifier; handed to `crate::registry::phonemes` on the
        /// data-only path (the tests in this file use codes such as `"en"`).
        language: String,
        /// The phoneme as an IPA string, without surrounding slashes
        /// (the data-only answer adds the `/…/` itself).
        ipa: String,
        /// Requested number of examples. Not consulted by [`answer_from_data`].
        count: u8,
    },
    /// Compare linguistic features between languages.
    ComparativeAnalysis {
        /// Languages to compare. The data-only path handles exactly two entries.
        languages: Vec<String>,
        /// Which aspect of the languages to compare.
        aspect: ComparisonAspect,
    },
    /// Translate/gloss a word with etymological context.
    EtymologyLookup {
        /// The word to look up.
        word: String,
        /// Language the word belongs to.
        source_language: String,
    },
}

/// What aspect of language to compare.
///
/// Used as the `aspect` of [`LanguageQuery::ComparativeAnalysis`]. Only
/// [`ComparisonAspect::Phonology`] is handled by [`answer_from_data`]; the other
/// aspects always fall through to `None` there.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum ComparisonAspect {
    /// Phoneme inventory differences.
    Phonology,
    /// Morphological and syntactic typology.
    Grammar,
    /// Writing system comparison.
    Script,
    /// Vocabulary overlap and cognates.
    Lexicon,
}

/// Response from a hoosh language query.
///
/// Derives `PartialEq` but not `Eq`, because `confidence` is an `f64`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct QueryResponse {
    /// Optional identifier for correlating this response with the query that
    /// produced it. [`answer_from_data`] always leaves it `None`.
    pub query_id: Option<Cow<'static, str>>,
    /// Whether the response was generated by an LLM or from varna's data.
    pub source: ResponseSource,
    /// The response content, as human-readable text.
    pub content: String,
    /// Confidence score (0.0-1.0) if applicable.
    /// Responses built by [`answer_from_data`] always carry `Some(1.0)`.
    pub confidence: Option<f64>,
    /// Structured data accompanying the response, as arbitrary JSON, if any.
    pub structured_data: Option<serde_json::Value>,
}

/// Where the response data came from.
///
/// Within this module only [`ResponseSource::VarnaData`] is ever produced
/// (by [`answer_from_data`]); the other variants are part of the data contract
/// for whoever performs the LLM inference.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must keep a
/// wildcard arm when matching on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum ResponseSource {
    /// Direct lookup from varna's data (high confidence).
    VarnaData,
    /// Generated by an LLM via hoosh.
    LlmGenerated,
    /// Combination of varna data + LLM elaboration.
    Hybrid,
}

/// Wrap a text sample in a [`LanguageQuery::IdentifyLanguage`] query.
///
/// `text` may be anything convertible into an owned `String`.
#[must_use]
pub fn identify(text: impl Into<String>) -> LanguageQuery {
    let text: String = text.into();
    LanguageQuery::IdentifyLanguage { text }
}

/// Create a [`LanguageQuery::ExplainPhonology`] query.
///
/// `language` is the language the explanation is for and `topic` is the rule
/// or feature to explain; both are converted into owned `String`s.
#[must_use]
pub fn explain_phonology(language: impl Into<String>, topic: impl Into<String>) -> LanguageQuery {
    let language: String = language.into();
    let topic: String = topic.into();
    LanguageQuery::ExplainPhonology { language, topic }
}

/// Create a [`LanguageQuery::ComparativeAnalysis`] query.
///
/// `languages` is stored as given (no validation of its length or contents),
/// and `aspect` selects what is being compared.
#[must_use]
pub fn compare(languages: Vec<String>, aspect: ComparisonAspect) -> LanguageQuery {
    LanguageQuery::ComparativeAnalysis {
        languages,
        aspect,
    }
}

/// Try to resolve a query purely from varna's built-in data, without an LLM.
///
/// Two query shapes are handled:
///
/// * [`LanguageQuery::ExamplesForPhoneme`] — reports whether the phoneme is
///   part of the language's inventory (the requested `count` is ignored).
/// * [`LanguageQuery::ComparativeAnalysis`] with
///   [`ComparisonAspect::Phonology`] and exactly two languages — reports the
///   consonant/vowel counts of both inventories.
///
/// Every response produced here has `source` set to
/// [`ResponseSource::VarnaData`], `confidence` set to `Some(1.0)` and no
/// `query_id`.
///
/// Returns `None` if the query requires LLM inference, which includes the case
/// where `crate::registry::phonemes` yields nothing for a requested language.
#[must_use]
pub fn answer_from_data(query: &LanguageQuery) -> Option<QueryResponse> {
    tracing::trace!("attempting data-only query resolution");

    // All data-backed answers share one envelope; only the text and the
    // optional structured payload differ between cases.
    let data_response = |content: String, structured_data: Option<serde_json::Value>| QueryResponse {
        query_id: None,
        source: ResponseSource::VarnaData,
        content,
        confidence: Some(1.0),
        structured_data,
    };

    match query {
        LanguageQuery::ExamplesForPhoneme { language, ipa, .. } => {
            let inventory = crate::registry::phonemes(language)?;

            // Absent phoneme: still a definitive answer, just without a payload.
            if !inventory.has(ipa) {
                let content = format!(
                    "/{ipa}/ is not in the {lang} phoneme inventory",
                    lang = inventory.language_name
                );
                return Some(data_response(content, None));
            }

            let content = format!(
                "/{ipa}/ is present in {lang} ({c}C + {v}V inventory)",
                lang = inventory.language_name,
                c = inventory.consonant_count(),
                v = inventory.vowel_count(),
            );
            // A serialization failure only drops the payload, not the answer.
            let payload = serde_json::to_value(inventory.find(ipa)).ok();
            Some(data_response(content, payload))
        }
        LanguageQuery::ComparativeAnalysis {
            languages,
            aspect: ComparisonAspect::Phonology,
        } if languages.len() == 2 => {
            // The guard above makes both indices valid.
            let first = crate::registry::phonemes(&languages[0])?;
            let second = crate::registry::phonemes(&languages[1])?;
            let content = format!(
                "{} has {}C+{}V, {} has {}C+{}V",
                first.language_name,
                first.consonant_count(),
                first.vowel_count(),
                second.language_name,
                second.consonant_count(),
                second.vowel_count(),
            );
            Some(data_response(content, None))
        }
        // Anything else needs LLM inference.
        _ => None,
    }
}

/// Unit tests for the query builders, the data-only resolver and the serde
/// round-trips of the contract types.
#[cfg(test)]
mod tests {
    use super::*;

    // Builder tests compare against the fully spelled-out variant so that a
    // dropped or swapped field is caught, not just a wrong variant.

    #[test]
    fn test_identify_query() {
        let q = identify("hello world");
        assert_eq!(
            q,
            LanguageQuery::IdentifyLanguage {
                text: "hello world".to_string(),
            }
        );
    }

    #[test]
    fn test_explain_query() {
        let q = explain_phonology("en", "vowel reduction");
        assert_eq!(
            q,
            LanguageQuery::ExplainPhonology {
                language: "en".to_string(),
                topic: "vowel reduction".to_string(),
            }
        );
    }

    #[test]
    fn test_compare_query() {
        let q = compare(vec!["en".into(), "de".into()], ComparisonAspect::Phonology);
        assert_eq!(
            q,
            LanguageQuery::ComparativeAnalysis {
                languages: vec!["en".to_string(), "de".to_string()],
                aspect: ComparisonAspect::Phonology,
            }
        );
    }

    #[test]
    fn test_answer_from_data_phoneme_exists() {
        let q = LanguageQuery::ExamplesForPhoneme {
            language: "en".into(),
            ipa: "θ".into(),
            count: 3,
        };
        let resp = answer_from_data(&q).unwrap();
        assert_eq!(resp.source, ResponseSource::VarnaData);
        assert!(resp.content.contains("present"));
        assert_eq!(resp.confidence, Some(1.0));
        assert!(resp.query_id.is_none());
    }

    #[test]
    fn test_answer_from_data_phoneme_missing() {
        let q = LanguageQuery::ExamplesForPhoneme {
            language: "en".into(),
            ipa: "ʀ".into(),
            count: 3,
        };
        let resp = answer_from_data(&q).unwrap();
        assert!(resp.content.contains("not in"));
        // A negative answer is still authoritative but carries no payload.
        assert_eq!(resp.source, ResponseSource::VarnaData);
        assert_eq!(resp.confidence, Some(1.0));
        assert!(resp.structured_data.is_none());
    }

    #[test]
    fn test_answer_from_data_comparison() {
        let q = compare(vec!["en".into(), "ja".into()], ComparisonAspect::Phonology);
        let resp = answer_from_data(&q).unwrap();
        assert!(resp.content.contains("English"));
        assert!(resp.content.contains("Japanese"));
        assert_eq!(resp.source, ResponseSource::VarnaData);
        assert!(resp.structured_data.is_none());
    }

    #[test]
    fn test_answer_from_data_comparison_requires_two_languages() {
        // Only pairwise phonology comparisons are answered from data.
        let none = compare(Vec::new(), ComparisonAspect::Phonology);
        let one = compare(vec!["en".into()], ComparisonAspect::Phonology);
        let three = compare(
            vec!["en".into(), "ja".into(), "de".into()],
            ComparisonAspect::Phonology,
        );
        assert!(answer_from_data(&none).is_none());
        assert!(answer_from_data(&one).is_none());
        assert!(answer_from_data(&three).is_none());
    }

    #[test]
    fn test_answer_from_data_non_phonology_comparison() {
        for aspect in [
            ComparisonAspect::Grammar,
            ComparisonAspect::Script,
            ComparisonAspect::Lexicon,
        ] {
            let q = compare(vec!["en".into(), "ja".into()], aspect);
            assert!(answer_from_data(&q).is_none());
        }
    }

    #[test]
    fn test_answer_from_data_needs_llm() {
        let q = identify("some text");
        assert!(answer_from_data(&q).is_none());

        let q = explain_phonology("en", "vowel reduction");
        assert!(answer_from_data(&q).is_none());

        let q = LanguageQuery::EtymologyLookup {
            word: "water".into(),
            source_language: "en".into(),
        };
        assert!(answer_from_data(&q).is_none());
    }

    #[test]
    fn test_query_serde_roundtrip() {
        let q = explain_phonology("ru", "palatalization");
        let json = serde_json::to_string(&q).unwrap();
        let back: LanguageQuery = serde_json::from_str(&json).unwrap();
        assert_eq!(q, back);
    }

    #[test]
    fn test_response_serde_roundtrip() {
        let resp = QueryResponse {
            query_id: Some(Cow::Borrowed("test-1")),
            source: ResponseSource::VarnaData,
            content: "test response".into(),
            confidence: Some(0.95),
            structured_data: None,
        };
        let json = serde_json::to_string(&resp).unwrap();
        let back: QueryResponse = serde_json::from_str(&json).unwrap();
        // Compare the whole value so every field is covered by the round-trip.
        assert_eq!(resp, back);
    }
}