mod pinyin_dict;
use jieba_rs::Jieba;
use pinyin_dict::{lookup_numbers, numbers_to_marks};
use std::sync::OnceLock;
use wasm_minimal_protocol::*;
initiate_protocol!();
/// Process-wide Jieba segmenter. Starts empty and is filled exactly once, on the
/// first call to `get_jieba`, which is the only accessor in this file.
static JIEBA: OnceLock<Jieba> = OnceLock::new();
/// Returns the shared [`Jieba`] segmenter, building it on first use.
///
/// The dictionary is embedded at compile time from `$OUT_DIR/dict.dat` as a
/// zstd stream; it is decompressed into memory and handed to
/// `Jieba::with_dict` the first time this function runs.
///
/// # Panics
///
/// Panics on first use if the embedded bytes are not a valid zstd stream,
/// if decompression fails, or if Jieba rejects the decompressed dictionary.
fn get_jieba() -> &'static Jieba {
    use ruzstd::streaming_decoder::StreamingDecoder;
    use std::io::Read;

    // Compressed dictionary bytes baked into the binary by the build.
    static DICT_ZSTD: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/dict.dat"));

    JIEBA.get_or_init(|| {
        let mut dict_bytes = Vec::new();
        let mut decoder =
            StreamingDecoder::new(DICT_ZSTD).expect("invalid zstd stream in dict.dat");
        decoder
            .read_to_end(&mut dict_bytes)
            .expect("failed to decompress dict.dat");

        // `with_dict` wants a mutable reader; a byte slice serves as one.
        let mut reader = dict_bytes.as_slice();
        Jieba::with_dict(&mut reader).expect("failed to load jieba dictionary")
    })
}
/// Reports whether `ch` lies in one of the code point ranges this crate
/// treats as Chinese ideographs:
///
/// * `U+3400..=U+4DBF` (CJK Unified Ideographs Extension A)
/// * `U+4E00..=U+9FFF` (CJK Unified Ideographs)
/// * `U+F900..=U+FAFF` (CJK Compatibility Ideographs)
/// * `U+20000..=U+3FFFF` (all of planes 2 and 3)
fn is_cjk(ch: char) -> bool {
    matches!(
        ch,
        '\u{3400}'..='\u{4DBF}'
            | '\u{4E00}'..='\u{9FFF}'
            | '\u{F900}'..='\u{FAFF}'
            | '\u{20000}'..='\u{3FFFF}'
    )
}
/// Formats a numbered-pinyin string according to `style`.
///
/// When `style` is `"numbers"` or `"pinyin_numbers"` the input is returned
/// unchanged (as an owned `String`). Every other style value, including
/// unrecognised ones, is passed through `numbers_to_marks`.
fn apply_style(numbers: &str, style: &str) -> String {
    let keep_numbers = matches!(style, "numbers" | "pinyin_numbers");
    if keep_numbers {
        numbers.to_owned()
    } else {
        numbers_to_marks(numbers)
    }
}
/// Produces the per-character reading of one segmented `word`.
///
/// Returns `None` when `word` contains no character accepted by `is_cjk`.
/// Otherwise returns `Some` with one entry per character, chosen as follows:
///
/// 1. If `lookup_numbers` knows the whole word and its reading splits into
///    exactly as many whitespace-separated syllables as the word has
///    characters, those syllables (styled via `apply_style`) are returned.
/// 2. Otherwise each character is handled on its own: a CJK character is
///    replaced by the first syllable of its single-character lookup (or kept
///    verbatim when the lookup misses), and any other character is kept as is.
fn render_word(word: &str, style: &str) -> Option<Vec<String>> {
    let has_cjk = word.chars().any(is_cjk);
    if !has_cjk {
        return None;
    }

    // Whole-word reading: only trusted when it lines up one syllable per character.
    if let Some(reading) = lookup_numbers(word) {
        let mut whole: Vec<String> = Vec::new();
        for syllable in reading.split_whitespace() {
            whole.push(apply_style(syllable, style));
        }
        if whole.len() == word.chars().count() {
            return Some(whole);
        }
    }

    // Per-character fallback.
    let mut pieces: Vec<String> = Vec::new();
    for ch in word.chars() {
        let literal = ch.to_string();
        if !is_cjk(ch) {
            pieces.push(literal);
            continue;
        }
        let styled = lookup_numbers(&literal).map(|reading| {
            // A single character may list several readings; take the first.
            let first = reading.split_whitespace().next().unwrap_or("");
            apply_style(first, style)
        });
        pieces.push(styled.unwrap_or(literal));
    }
    Some(pieces)
}
/// Converts `text` to a single space-separated string of pinyin syllables.
///
/// The text is segmented with the shared Jieba instance (`cut` with its
/// second argument set to `false`). Segments for which `render_word` returns
/// `None` are dropped entirely; the syllables of the remaining segments are
/// concatenated in order and joined with a single space.
///
/// `style` is forwarded to `render_word` unchanged.
pub fn to_pinyin_flat(text: &str, style: &str) -> String {
    let mut syllables: Vec<String> = Vec::new();
    for token in get_jieba().cut(text, false) {
        if let Some(rendered) = render_word(token, style) {
            syllables.extend(rendered);
        }
    }
    syllables.join(" ")
}
/// One segment of the input text as produced by `to_pinyin_segmented`,
/// paired with its reading. Serialisable via `serde`.
#[derive(serde::Serialize, Debug, PartialEq)]
pub struct Segment {
/// The segment's text, copied verbatim from the input.
pub word: String,
/// Result of `render_word` for this segment: `None` when the segment contains
/// no CJK character, otherwise one entry per character of `word`.
pub pinyin: Option<Vec<String>>,
}
/// Segments `text` and returns every segment together with its reading.
///
/// Unlike `to_pinyin_flat`, no segment is dropped: each token returned by
/// Jieba's `cut` becomes one [`Segment`], whose `pinyin` field holds whatever
/// `render_word` yields for it (possibly `None`).
///
/// `style` is forwarded to `render_word` unchanged.
pub fn to_pinyin_segmented(text: &str, style: &str) -> Vec<Segment> {
    let tokens = get_jieba().cut(text, false);
    let mut segments = Vec::with_capacity(tokens.len());
    for token in tokens {
        let word = token.to_owned();
        let pinyin = render_word(token, style);
        segments.push(Segment { word, pinyin });
    }
    segments
}
/// Plugin entry point: flat pinyin for `text`, returned as UTF-8 bytes.
///
/// Both arguments are expected to be UTF-8. If `text` is not valid UTF-8 it
/// is treated as the empty string; if `style` is not valid UTF-8 it falls
/// back to `"marks"`. The result is the output of `to_pinyin_flat`.
#[wasm_func]
pub fn pinyin_flat(text: &[u8], style: &[u8]) -> Vec<u8> {
    // `&str::default()` is "", so undecodable text becomes empty input.
    let input = std::str::from_utf8(text).unwrap_or_default();
    let requested_style = std::str::from_utf8(style).unwrap_or("marks");

    let rendered: String = to_pinyin_flat(input, requested_style);
    rendered.into_bytes()
}
/// Plugin entry point: segmented pinyin for `text`, returned as JSON bytes.
///
/// Both arguments are expected to be UTF-8. If `text` is not valid UTF-8 it
/// is treated as the empty string; if `style` is not valid UTF-8 it falls
/// back to `"marks"`. The segments from `to_pinyin_segmented` are serialised
/// with `serde_json`; should serialisation fail, the bytes of `[]` are
/// returned instead.
#[wasm_func]
pub fn pinyin_segmented(text: &[u8], style: &[u8]) -> Vec<u8> {
    // `&str::default()` is "", so undecodable text becomes empty input.
    let input = std::str::from_utf8(text).unwrap_or_default();
    let requested_style = std::str::from_utf8(style).unwrap_or("marks");

    let segments = to_pinyin_segmented(input, requested_style);
    match serde_json::to_vec(&segments) {
        Ok(json) => json,
        Err(_) => b"[]".to_vec(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // ---- Flat output, both styles ----

    #[test]
    fn flat_marks_basic() {
        let rendered = to_pinyin_flat("你好", "marks");
        assert_eq!(rendered, "nǐ hǎo");
    }

    #[test]
    fn flat_numbers_basic() {
        let rendered = to_pinyin_flat("你好", "numbers");
        assert_eq!(rendered, "ni3 hao3");
    }

    #[test]
    fn flat_marks_beijing() {
        let rendered = to_pinyin_flat("北京", "marks");
        assert_eq!(rendered, "běi jīng");
    }

    #[test]
    fn flat_numbers_beijing() {
        let rendered = to_pinyin_flat("北京", "numbers");
        assert_eq!(rendered, "bei3 jing1");
    }

    // ---- Heteronyms: the reading must come from the whole word ----

    #[test]
    fn heteronym_zhong_in_zhongguo() {
        let rendered = to_pinyin_flat("中國", "marks");
        assert_eq!(rendered, "Zhōng guó");
    }

    #[test]
    fn heteronym_le_in_kuaile() {
        let rendered = to_pinyin_flat("快樂", "marks");
        assert_eq!(rendered, "kuài lè");
    }

    #[test]
    fn heteronym_yue_in_yinyue() {
        let rendered = to_pinyin_flat("音樂", "marks");
        assert_eq!(rendered, "yīn yuè");
    }

    // ---- Segmented output ----

    #[test]
    fn segmented_ziran_yuyan() {
        let syllables: Vec<String> = ["zì", "rán", "yǔ", "yán"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let expected = vec![Segment {
            word: String::from("自然語言"),
            pinyin: Some(syllables),
        }];
        assert_eq!(to_pinyin_segmented("自然語言", "marks"), expected);
    }

    #[test]
    fn segmented_empty() {
        let segs = to_pinyin_segmented("", "marks");
        assert!(segs.is_empty());
    }

    // ---- Non-Chinese input ----

    #[test]
    fn latin_word_pinyin_is_null() {
        let segs = to_pinyin_segmented("world", "marks");
        assert_eq!(segs.len(), 1);
        let only = &segs[0];
        assert_eq!(only.pinyin, None);
    }

    #[test]
    fn punctuation_pinyin_is_null() {
        let tokens = ["!", "?", ",", ",", " ", "\n"];
        for token in tokens {
            // Whatever segments a token yields, none may carry a reading.
            for seg in to_pinyin_segmented(token, "marks").iter() {
                assert_eq!(
                    seg.pinyin, None,
                    "expected null pinyin for {:?}, got {:?}",
                    token, seg.pinyin
                );
            }
        }
    }

    #[test]
    fn flat_skips_non_chinese() {
        let latin_only = to_pinyin_flat("world!", "marks");
        assert_eq!(latin_only, "");

        let mixed = to_pinyin_flat("北京!world", "marks");
        assert_eq!(mixed, "běi jīng");
    }

    // ---- Style selection ----

    #[test]
    fn unknown_style_falls_back_to_marks() {
        let with_marks = to_pinyin_flat("好", "marks");
        let with_unknown = to_pinyin_flat("好", "whatever");
        assert_eq!(with_marks, with_unknown);
    }
}