// rust-mando 0.1.2
//
// Convert Chinese characters to pinyin with jieba word segmentation.
// Documentation
//! Chinese → Pīnyīn conversion with jieba word segmentation.
//!
//! # Architecture
//!
//! | Layer        | Crate / module            | Role                              |
//! |--------------|---------------------------|-----------------------------------|
//! | Segmentation | `jieba-rs`                | word boundaries + context         |
//! | Lookup       | `src/pinyin_dict.rs`      | Chinese characters → pinyin_numbers |
//! | Conversion   | `pinyin_dict::numbers_to_marks` | pinyin_numbers → pinyin_marks |
//! | Protocol     | `wasm-minimal-protocol`   | Typst WASM ABI                    |
//!
//! # Build inputs
//!
//! | File                  | Purpose                              |
//! |-----------------------|--------------------------------------|
//! | `dict/dict.txt.big`   | jieba extended segmentation dict     |
//! | `dict/cedict_ts.u8`   | CC-CEDICT source for pinyin lookup   |
//!
//! See `dict/README.md` for download instructions.

mod pinyin_dict;

use jieba_rs::Jieba;
use pinyin_dict::{lookup_numbers, numbers_to_marks};
use std::sync::OnceLock;

// ── wasm-minimal-protocol ─────────────────────────────────────────────────────

use wasm_minimal_protocol::*;
initiate_protocol!();

// ── Jieba instance ────────────────────────────────────────────────────────────

/// Lazily initialised jieba segmenter shared by every conversion call.
static JIEBA: OnceLock<Jieba> = OnceLock::new();

/// Returns the shared [`Jieba`] instance, building it on the first call.
///
/// The dictionary is embedded at compile time from `$OUT_DIR/dict.dat`,
/// decoded as a zstd stream into memory, and then handed to
/// `Jieba::with_dict`.
///
/// # Panics
///
/// Panics if the embedded data is not a valid zstd stream, if decompression
/// fails, or if jieba cannot load the decompressed dictionary.
fn get_jieba() -> &'static Jieba {
    use ruzstd::streaming_decoder::StreamingDecoder;
    use std::io::Read;

    static DICT_ZSTD: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/dict.dat"));

    JIEBA.get_or_init(|| {
        let mut decoder =
            StreamingDecoder::new(DICT_ZSTD).expect("invalid zstd stream in dict.dat");

        let mut raw_dict = Vec::new();
        decoder
            .read_to_end(&mut raw_dict)
            .expect("failed to decompress dict.dat");

        // `with_dict` wants a mutable `BufRead`; a byte slice serves as one.
        let mut reader: &[u8] = &raw_dict;
        Jieba::with_dict(&mut reader).expect("failed to load jieba dictionary")
    })
}

// ── CJK detection ─────────────────────────────────────────────────────────────

/// Reports whether `ch` lies in one of the ideograph code-point ranges that
/// this crate treats as Chinese text.
///
/// Accepted ranges (inclusive):
/// - U+3400–U+4DBF — CJK Unified Ideographs Extension A
/// - U+4E00–U+9FFF — CJK Unified Ideographs (the common block)
/// - U+F900–U+FAFF — CJK Compatibility Ideographs
/// - U+20000–U+3FFFF — all of planes 2 and 3 (Extension B onward, including
///   code points that are currently unassigned)
///
/// CJK punctuation such as `。` or `，` is outside all of these ranges.
fn is_cjk(ch: char) -> bool {
    const IDEOGRAPH_RANGES: [(char, char); 4] = [
        ('\u{3400}', '\u{4DBF}'),
        ('\u{4E00}', '\u{9FFF}'),
        ('\u{F900}', '\u{FAFF}'),
        ('\u{20000}', '\u{3FFFF}'),
    ];

    IDEOGRAPH_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&ch))
}

// ── style dispatch ────────────────────────────────────────────────────────────

/// Renders a tone-number pinyin string in the requested output style.
///
/// When `style` is `"numbers"` or `"pinyin_numbers"` the input is returned
/// unchanged (as an owned `String`). Every other value of `style` is treated
/// as a request for tone marks and the input is passed through
/// `numbers_to_marks`.
fn apply_style(numbers: &str, style: &str) -> String {
    let wants_numbers = matches!(style, "numbers" | "pinyin_numbers");
    if wants_numbers {
        numbers.to_owned()
    } else {
        numbers_to_marks(numbers)
    }
}

// ── word rendering ────────────────────────────────────────────────────────────

/// Converts one jieba token into its pīnyīn syllables, one entry per
/// character of `word`.
///
/// Returns `None` when `word` contains no CJK character at all (punctuation,
/// whitespace, Latin text); callers serialise that as JSON `null`.
///
/// Two strategies are tried in order:
/// 1. Look the whole token up in the dictionary. The result is used only if
///    the number of whitespace-separated syllables equals the number of
///    characters in `word`, which rejects entries that do not line up
///    one-to-one with the characters.
/// 2. Otherwise go character by character. A CJK character is replaced by the
///    first syllable of its own dictionary entry, or kept verbatim when it
///    has no entry. Non-CJK characters inside a mixed token are kept verbatim.
///
/// `style` is forwarded to `apply_style` for every syllable.
fn render_word(word: &str, style: &str) -> Option<Vec<String>> {
    let has_cjk = word.chars().any(is_cjk);
    if !has_cjk {
        return None;
    }

    // Strategy 1: whole-token dictionary entry.
    if let Some(numbers) = lookup_numbers(word) {
        let styled: Vec<String> = numbers
            .split_whitespace()
            .map(|syllable| apply_style(syllable, style))
            .collect();
        if styled.len() == word.chars().count() {
            return Some(styled);
        }
    }

    // Strategy 2: one lookup per character.
    let mut rendered = Vec::new();
    for ch in word.chars() {
        let literal = ch.to_string();
        let entry = if is_cjk(ch) {
            lookup_numbers(&literal)
                .map(|readings| {
                    // An entry may hold several syllables; only the first is used.
                    let first = readings.split_whitespace().next().unwrap_or("");
                    apply_style(first, style)
                })
                .unwrap_or(literal)
        } else {
            literal
        };
        rendered.push(entry);
    }
    Some(rendered)
}

// ── public Rust API ───────────────────────────────────────────────────────────

/// Converts `text` to a single string of pīnyīn syllables separated by one
/// space each.
///
/// The text is segmented with jieba (HMM disabled) and every token is
/// rendered independently. Tokens without any CJK character contribute
/// nothing to the output.
///
/// `style`: `"numbers"` or `"pinyin_numbers"` selects tone numbers; any other
/// value selects tone marks.
pub fn to_pinyin_flat(text: &str, style: &str) -> String {
    let mut syllables: Vec<String> = Vec::new();
    for token in get_jieba().cut(text, false) {
        if let Some(rendered) = render_word(token, style) {
            syllables.extend(rendered);
        }
    }
    syllables.join(" ")
}

/// One jieba token paired with its pīnyīn rendering.
///
/// Serialises as `{"word": "…", "pinyin": ["…", …] | null}`.
///
/// Derives `Clone` and `Eq` in addition to `PartialEq` so values can be
/// duplicated and used wherever full equality is required; every field
/// already supports both.
#[derive(serde::Serialize, Debug, Clone, PartialEq, Eq)]
pub struct Segment {
    /// The token text as returned by jieba segmentation.
    pub word: String,
    /// One entry per character of `word`, or `None` (JSON `null`) when the
    /// token contains no CJK character.
    pub pinyin: Option<Vec<String>>,
}

/// Segments `text` with jieba (HMM disabled) and returns one [`Segment`] per
/// token, in input order.
///
/// Every token is kept, including punctuation, whitespace and Latin text;
/// those carry `pinyin: None`.
///
/// `style`: `"numbers"` or `"pinyin_numbers"` selects tone numbers; any other
/// value selects tone marks.
pub fn to_pinyin_segmented(text: &str, style: &str) -> Vec<Segment> {
    let tokens = get_jieba().cut(text, false);
    let mut segments = Vec::with_capacity(tokens.len());
    for token in tokens {
        let word = token.to_owned();
        let pinyin = render_word(token, style);
        segments.push(Segment { word, pinyin });
    }
    segments
}

// ── Typst / wasm-minimal-protocol exports ─────────────────────────────────────

/// Typst plugin entry point: flat, space-separated pīnyīn as UTF-8 bytes.
///
/// Both arguments are expected to be UTF-8. If `text` is not valid UTF-8 it
/// is treated as empty input; if `style` is not valid UTF-8 it is treated as
/// `"marks"`. The conversion itself is done by [`to_pinyin_flat`].
#[wasm_func]
pub fn pinyin_flat(text: &[u8], style: &[u8]) -> Vec<u8> {
    let input = std::str::from_utf8(text).unwrap_or("");
    let mode = std::str::from_utf8(style).unwrap_or("marks");

    let rendered = to_pinyin_flat(input, mode);
    rendered.into_bytes()
}

/// Typst plugin entry point: per-token pīnyīn as a UTF-8 JSON array of the
/// form `[{"word":"…","pinyin":["…"]|null},…]`.
///
/// Both arguments are expected to be UTF-8. If `text` is not valid UTF-8 it
/// is treated as empty input; if `style` is not valid UTF-8 it is treated as
/// `"marks"`. Should JSON serialisation fail, the literal bytes `[]` are
/// returned instead.
#[wasm_func]
pub fn pinyin_segmented(text: &[u8], style: &[u8]) -> Vec<u8> {
    let input = std::str::from_utf8(text).unwrap_or("");
    let mode = std::str::from_utf8(style).unwrap_or("marks");

    let segments = to_pinyin_segmented(input, mode);
    match serde_json::to_vec(&segments) {
        Ok(json) => json,
        Err(_) => b"[]".to_vec(),
    }
}

// ── tests ─────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    // ── flat output ──────────────────────────────────────────────────────────

    #[test]
    fn flat_marks_basic() {
        assert_eq!(to_pinyin_flat("你好", "marks"), "nǐ hǎo");
    }

    #[test]
    fn flat_numbers_basic() {
        assert_eq!(to_pinyin_flat("你好", "numbers"), "ni3 hao3");
    }

    #[test]
    fn flat_marks_beijing() {
        assert_eq!(to_pinyin_flat("北京", "marks"), "běi jīng");
    }

    #[test]
    fn flat_numbers_beijing() {
        assert_eq!(to_pinyin_flat("北京", "numbers"), "bei3 jing1");
    }

    // ── heteronyms resolved through whole-word lookup ────────────────────────

    #[test]
    fn heteronym_zhong_in_zhongguo() {
        assert_eq!(to_pinyin_flat("中國", "marks"), "Zhōng guó");
    }

    #[test]
    fn heteronym_le_in_kuaile() {
        assert_eq!(to_pinyin_flat("快樂", "marks"), "kuài lè");
    }

    #[test]
    fn heteronym_yue_in_yinyue() {
        assert_eq!(to_pinyin_flat("音樂", "marks"), "yīn yuè");
    }

    // ── segmented output ─────────────────────────────────────────────────────

    #[test]
    fn segmented_ziran_yuyan() {
        // One syllable per character: 自 zì, 然 rán, 語 yǔ, 言 yán.
        assert_eq!(
            to_pinyin_segmented("自然語言", "marks"),
            vec![Segment {
                word: "自然語言".to_string(),
                pinyin: Some(vec![
                    "zì".to_string(),
                    "rán".to_string(),
                    "yǔ".to_string(),
                    "yán".to_string(),
                ]),
            }]
        );
    }

    #[test]
    fn segmented_empty() {
        assert!(to_pinyin_segmented("", "marks").is_empty());
    }

    #[test]
    fn latin_word_pinyin_is_null() {
        let segs = to_pinyin_segmented("world", "marks");
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].pinyin, None);
    }

    #[test]
    fn punctuation_pinyin_is_null() {
        // Full-width CJK punctuation plus ASCII punctuation and whitespace.
        // None of these fall in an ideograph range, so none may get pinyin.
        // (An empty token would yield no segments and assert nothing.)
        for token in ["。", "，", "！", ",", " ", "\n"] {
            let segs = to_pinyin_segmented(token, "marks");
            for seg in &segs {
                assert_eq!(
                    seg.pinyin, None,
                    "expected null pinyin for {:?}, got {:?}",
                    token, seg.pinyin
                );
            }
        }
    }

    #[test]
    fn flat_skips_non_chinese() {
        assert_eq!(to_pinyin_flat("world!", "marks"), "");
        assert_eq!(to_pinyin_flat("北京!world", "marks"), "běi jīng");
    }

    #[test]
    fn unknown_style_falls_back_to_marks() {
        // Use real Chinese input: with empty input both sides are "" no matter
        // how the style is interpreted, so the comparison would prove nothing.
        assert_eq!(
            to_pinyin_flat("你好", "marks"),
            to_pinyin_flat("你好", "whatever")
        );
    }
}