meta-language 0.36.0

A self-describing links-network core for lossless language representation
use std::sync::OnceLock;

use lindera::dictionary::load_dictionary;
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::tokenizer::Tokenizer;
use lingua::Language::{
    Arabic, Bengali, Chinese, English, French, Hindi, Portuguese, Russian, Spanish, Urdu,
};
use lingua::{Language, LanguageDetector, LanguageDetectorBuilder};
use unicode_bidi::BidiInfo;
use unicode_normalization::UnicodeNormalization;
use unicode_segmentation::UnicodeSegmentation;

use crate::configuration::{LanguageIdentificationDetector, ParseConfiguration};
use crate::link_network::{LinkId, LinkMetadata, LinkNetwork, LinkType};
use crate::source::{ByteRange, Point, SourceSpan};

const LINGUA_LANGUAGES: [Language; 10] = [
    English, Chinese, Hindi, Spanish, Arabic, French, Bengali, Portuguese, Russian, Urdu,
];

#[derive(Clone, Debug, PartialEq, Eq)]
struct WordSegment {
    text: String,
    range: ByteRange,
}

pub fn annotate_natural_language(
    network: &mut LinkNetwork,
    document: LinkId,
    text: &str,
    language: &str,
    configuration: ParseConfiguration,
) {
    let Some(declared_language) = canonical_natural_language(language) else {
        return;
    };

    let document_span = span_for_range(text, 0, text.len());
    let detected_language =
        identify_language(text, configuration.language_identification_detector())
            .unwrap_or(declared_language);

    let region = network.insert_link(
        [document],
        LinkMetadata::new()
            .with_link_type(LinkType::Region)
            .with_named(true)
            .with_term("natural-language region")
            .with_language(declared_language)
            .with_span(document_span),
    );
    let language_link = network.insert_link(
        [region],
        LinkMetadata::new()
            .with_link_type(LinkType::Language)
            .with_named(true)
            .with_term(detected_language)
            .with_language(detected_language)
            .with_span(document_span),
    );

    insert_semantic_annotation(
        network,
        region,
        language_link,
        detector_term(configuration.language_identification_detector()),
        detected_language,
        document_span,
    );

    let (segmentation_engine, segments) = if declared_language == "Mandarin Chinese" {
        ("lindera-jieba", lindera_segments(text))
    } else {
        ("unicode-segmentation", unicode_segments(text))
    };

    insert_semantic_annotation(
        network,
        region,
        language_link,
        format!("segmentation:{segmentation_engine}"),
        detected_language,
        document_span,
    );

    for segment in segments {
        network.insert_link(
            [region],
            LinkMetadata::new()
                .with_link_type(LinkType::Token)
                .with_named(true)
                .with_term(segment.text)
                .with_language(detected_language)
                .with_span(span_for_range(
                    text,
                    segment.range.start(),
                    segment.range.end(),
                )),
        );
    }

    insert_unicode_annotations(
        network,
        region,
        language_link,
        text,
        detected_language,
        document_span,
    );
    insert_worked_example_annotations(network, region, text, declared_language, document_span);
}

fn insert_unicode_annotations(
    network: &mut LinkNetwork,
    region: LinkId,
    language_link: LinkId,
    text: &str,
    language: &str,
    span: SourceSpan,
) {
    let nfc = text.nfc().collect::<String>();
    let nfd = text.nfd().collect::<String>();

    insert_semantic_annotation(
        network,
        region,
        language_link,
        format!("normalization:nfc:{}", normalization_status(text, &nfc)),
        language,
        span,
    );
    insert_semantic_annotation(
        network,
        region,
        language_link,
        format!("normalization:nfd:{}", normalization_status(text, &nfd)),
        language,
        span,
    );
    insert_semantic_annotation(
        network,
        region,
        language_link,
        format!("bidi:{}", bidi_direction(text)),
        language,
        span,
    );
}

fn insert_semantic_annotation(
    network: &mut LinkNetwork,
    region: LinkId,
    language_link: LinkId,
    term: impl Into<String>,
    language: &str,
    span: SourceSpan,
) -> LinkId {
    network.insert_link(
        [region, language_link],
        LinkMetadata::new()
            .with_link_type(LinkType::Semantic)
            .with_named(true)
            .with_term(term)
            .with_language(language)
            .with_span(span),
    )
}

fn insert_worked_example_annotations(
    network: &mut LinkNetwork,
    region: LinkId,
    text: &str,
    language: &str,
    span: SourceSpan,
) {
    if !is_statehood_worked_example(text, language) {
        return;
    }

    let concepts = network.seed_statehood_worked_example();
    network.insert_link(
        [
            region,
            concepts.proposition,
            concepts.subject,
            concepts.object,
        ],
        LinkMetadata::new()
            .with_link_type(LinkType::Semantic)
            .with_named(true)
            .with_term("proposition:statehood")
            .with_language(language)
            .with_span(span),
    );
}

fn is_statehood_worked_example(text: &str, language: &str) -> bool {
    match language {
        "English" => text.trim() == "Hawaii is a state.",
        "Russian" => text.trim() == "Гавайи это штат.",
        _ => false,
    }
}

const fn detector_term(detector: LanguageIdentificationDetector) -> &'static str {
    match detector {
        LanguageIdentificationDetector::Lingua => "identifier:lingua",
        LanguageIdentificationDetector::Whatlang => "identifier:whatlang",
    }
}

fn identify_language(text: &str, detector: LanguageIdentificationDetector) -> Option<&'static str> {
    match detector {
        LanguageIdentificationDetector::Lingua => identify_with_lingua(text),
        LanguageIdentificationDetector::Whatlang => identify_with_whatlang(text),
    }
}

fn identify_with_lingua(text: &str) -> Option<&'static str> {
    static DETECTOR: OnceLock<LanguageDetector> = OnceLock::new();
    let detector =
        DETECTOR.get_or_init(|| LanguageDetectorBuilder::from_languages(&LINGUA_LANGUAGES).build());
    detector
        .detect_language_of(text)
        .map(canonical_lingua_language)
}

fn identify_with_whatlang(text: &str) -> Option<&'static str> {
    canonical_whatlang_language(whatlang::detect(text)?.lang())
}

const fn canonical_lingua_language(language: Language) -> &'static str {
    match language {
        English => "English",
        Chinese => "Mandarin Chinese",
        Hindi => "Hindi",
        Spanish => "Spanish",
        Arabic => "Modern Standard Arabic",
        French => "French",
        Bengali => "Bengali",
        Portuguese => "Portuguese",
        Russian => "Russian",
        Urdu => "Urdu",
    }
}

const fn canonical_whatlang_language(language: whatlang::Lang) -> Option<&'static str> {
    match language {
        whatlang::Lang::Eng => Some("English"),
        whatlang::Lang::Cmn => Some("Mandarin Chinese"),
        whatlang::Lang::Hin => Some("Hindi"),
        whatlang::Lang::Spa => Some("Spanish"),
        whatlang::Lang::Ara => Some("Modern Standard Arabic"),
        whatlang::Lang::Fra => Some("French"),
        whatlang::Lang::Ben => Some("Bengali"),
        whatlang::Lang::Por => Some("Portuguese"),
        whatlang::Lang::Rus => Some("Russian"),
        whatlang::Lang::Urd => Some("Urdu"),
        _ => None,
    }
}

fn canonical_natural_language(language: &str) -> Option<&'static str> {
    match language.to_ascii_lowercase().as_str() {
        "english" => Some("English"),
        "mandarin" | "mandarin chinese" | "chinese" => Some("Mandarin Chinese"),
        "hindi" => Some("Hindi"),
        "spanish" => Some("Spanish"),
        "arabic" | "modern standard arabic" => Some("Modern Standard Arabic"),
        "french" => Some("French"),
        "bengali" => Some("Bengali"),
        "portuguese" => Some("Portuguese"),
        "russian" => Some("Russian"),
        "urdu" => Some("Urdu"),
        _ => None,
    }
}

fn lindera_segments(text: &str) -> Vec<WordSegment> {
    let Ok(dictionary) = load_dictionary("embedded://jieba") else {
        return Vec::new();
    };
    let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
    let tokenizer = Tokenizer::new(segmenter);
    let Ok(tokens) = tokenizer.tokenize(text) else {
        return Vec::new();
    };

    tokens
        .into_iter()
        .filter(|token| contains_word_character(token.surface.as_ref()))
        .map(|token| WordSegment {
            text: token.surface.into_owned(),
            range: ByteRange::new(token.byte_start, token.byte_end),
        })
        .collect()
}

fn unicode_segments(text: &str) -> Vec<WordSegment> {
    text.split_word_bound_indices()
        .filter(|(_start, segment)| contains_word_character(segment))
        .map(|(start, segment)| WordSegment {
            text: segment.to_string(),
            range: ByteRange::new(start, start + segment.len()),
        })
        .collect()
}

fn contains_word_character(text: &str) -> bool {
    text.chars().any(char::is_alphanumeric)
}

fn normalization_status(original: &str, normalized: &str) -> &'static str {
    if original == normalized {
        "stable"
    } else {
        "changes"
    }
}

fn bidi_direction(text: &str) -> &'static str {
    let bidi = BidiInfo::new(text, None);
    if bidi
        .paragraphs
        .iter()
        .any(|paragraph| paragraph.level.is_rtl())
    {
        "rtl"
    } else {
        "ltr"
    }
}

fn span_for_range(text: &str, start: usize, end: usize) -> SourceSpan {
    SourceSpan::new(
        ByteRange::new(start, end),
        point_at_byte(text, start),
        point_at_byte(text, end),
    )
}

fn point_at_byte(text: &str, byte: usize) -> Point {
    let mut row = 0;
    let mut column = 0;

    for (index, character) in text.char_indices() {
        if index >= byte {
            break;
        }
        if character == '\n' {
            row += 1;
            column = 0;
        } else {
            column += 1;
        }
    }

    Point::new(row, column)
}