use crate::db::{Polyphone, DB};
use crate::pinyin::{py, FinalWithTones, Initials};
use crate::Pinyin;
use nom::{
branch::alt,
bytes::complete::tag,
bytes::streaming::is_not,
character::complete::{char, hex_digit1, newline, space0},
combinator::{map_res, opt, value},
multi::{many0, separated_list1},
sequence::{pair, preceded, separated_pair, terminated, tuple},
IResult, InputTakeAtPosition,
};
use std::num::ParseIntError;
use std::str::FromStr;
fn comment(i: &str) -> IResult<&str, Option<(char, Polyphone)>> {
value(None, tuple((space0, char('#'), opt(is_not("\n")), newline)))(i)
}
fn code_point(i: &str) -> IResult<&str, char> {
preceded(
tag("U+"),
map_res(hex_digit1, |s: &str| -> Result<char, ParseIntError> {
let code_point = u32::from_str_radix(s, 16)?;
Ok(char::from_u32(code_point).unwrap())
}),
)(i)
}
fn initials(i: &str) -> IResult<&str, Initials> {
alt((
value(Initials::B, char('b')),
value(Initials::CH, tag("ch")),
value(Initials::C, char('c')),
value(Initials::D, char('d')),
value(Initials::F, char('f')),
value(Initials::G, char('g')),
value(Initials::H, char('h')),
value(Initials::J, char('j')),
value(Initials::K, char('k')),
value(Initials::L, char('l')),
value(Initials::M, char('m')),
value(Initials::N, char('n')),
value(Initials::P, char('p')),
value(Initials::Q, char('q')),
value(Initials::R, char('r')),
value(Initials::SH, tag("sh")),
value(Initials::S, char('s')),
value(Initials::T, char('t')),
value(Initials::W, char('w')),
value(Initials::X, char('x')),
alt((
value(Initials::Y, char('y')),
value(Initials::ZH, tag("zh")),
value(Initials::Z, char('z')),
)),
))(i)
}
#[rustfmt::skip]
fn is_pinyin_char(c: char) -> bool {
if c.is_ascii_alphabetic() {
true
} else {
matches!(c,
'ā' | 'á' | 'ǎ' | 'à' |
'ē' | 'é' | 'ě' | 'è' |
'ī' | 'í' | 'ǐ' | 'ì' |
'ō' | 'ó' | 'ǒ' | 'ò' |
'ū' | 'ú' | 'ǔ' | 'ù' |
'ǖ' | 'ǘ' | 'ǚ' | 'ǜ' |
'ü' |
'ń' | 'ň' | 'ǹ'
)
}
}
fn final_and_tones(i: &str) -> IResult<&str, FinalWithTones> {
let (i, s) =
i.split_at_position1_complete(|x| !is_pinyin_char(x), nom::error::ErrorKind::Alpha)?;
let r = FinalWithTones::from_str(s);
match r {
Ok(r) => Ok((i, r)),
Err(_) => Err(nom::Err::Error(nom::error::Error::new(
i,
nom::error::ErrorKind::Tag,
))),
}
}
fn pinyin(i: &str) -> IResult<&str, Pinyin> {
let (i, (initials, final_with_tones)) = pair(opt(initials), final_and_tones)(i)?;
Ok((
i,
py(
initials.unwrap_or(Initials::None),
final_with_tones.0,
final_with_tones.1,
),
))
}
fn empty_line(i: &str) -> IResult<&str, Option<(char, Polyphone)>> {
value(None, pair(space0, newline))(i)
}
fn parse_line(i: &str) -> IResult<&str, Option<(char, Polyphone)>> {
let pn_list = separated_list1(char(','), pinyin);
let char_and_pinyin = separated_pair(code_point, tag(": "), pn_list);
let mut line = terminated(char_and_pinyin, alt((comment, empty_line)));
let (remains, (ch, pinyin_list)) = line(i)?;
Ok((remains, Some((ch, pinyin_list.into()))))
}
fn parse_lines(
i: &str,
) -> Result<impl Iterator<Item = (char, Polyphone)>, nom::Err<nom::error::Error<&str>>> {
let (remains, lines) = many0(alt((empty_line, comment, parse_line)))(i)?;
if !remains.is_empty() {
panic!(
"remains: {}",
remains.chars().into_iter().take(10).collect::<String>()
);
}
Ok(lines.into_iter().flatten())
}
pub fn parse_db(i: &str) -> Result<DB, nom::error::Error<&str>> {
match parse_lines(i) {
Ok(iter) => {
let mut db = DB::new();
for (ch, polyphone) in iter {
db.insert(ch, polyphone);
}
db.shrink_to_fit();
Ok(db)
}
Err(nom::Err::Incomplete(_)) => unreachable!(),
Err(nom::Err::Error(e)) => Err(e),
Err(nom::Err::Failure(e)) => Err(e),
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::pinyin::{Finals, Tones};
use rstest::rstest;
#[test]
fn parse_comment() {
assert_eq!(comment("#\n"), Ok(("", None)));
assert_eq!(comment("# Hello world\n"), Ok(("", None)));
assert_eq!(comment(" # Hello world\n"), Ok(("", None)));
assert_eq!(comment("# Hello world\r\n"), Ok(("", None)));
}
#[test]
fn parse_code_point() {
assert_eq!(code_point("U+4E2D"), Ok(("", '中')));
assert_eq!(code_point("U+2104C"), Ok(("", '𡁌')));
}
#[test]
fn parse_initials() {
assert_eq!(initials("b"), Ok(("", Initials::B)));
assert_eq!(initials("ch"), Ok(("", Initials::CH)));
assert_eq!(initials("c"), Ok(("", Initials::C)));
assert_eq!(initials("y"), Ok(("", Initials::Y)));
assert_eq!(initials("zh"), Ok(("", Initials::ZH)));
}
#[test]
fn test_is_py_chr() {
assert!(is_pinyin_char('a'));
assert!(is_pinyin_char('á'));
assert!(is_pinyin_char('ǎ'));
assert!(is_pinyin_char('ǔ'));
assert!(is_pinyin_char('ǜ'));
assert!(is_pinyin_char('ú'));
}
#[rstest]
#[case("a", Finals::A, Tones::None)]
#[case("á", Finals::A, Tones::Two)]
#[case("ang", Finals::Ang, Tones::None)]
#[case("ǎn", Finals::AN, Tones::Three)]
#[case("ún", Finals::UN, Tones::Two)]
fn parse_final_and_tones(#[case] s: &str, #[case] finals: Finals, #[case] tones: Tones) {
assert_eq!(final_and_tones(s), Ok(("", FinalWithTones(finals, tones))));
}
#[rstest]
#[case("a", Initials::None, Finals::A, Tones::None)]
#[case("bá", Initials::B, Finals::A, Tones::Two)]
#[case("hún", Initials::H, Finals::UN, Tones::Two)]
fn parse_pinyin(
#[case] s: &str,
#[case] initials: Initials,
#[case] finals: Finals,
#[case] tones: Tones,
) {
assert_eq!(pinyin(s), Ok(("", py(initials, finals, tones))));
}
#[test]
fn test_parse_line() {
assert_eq!(
parse_line("U+4E2D: zhōng\n"),
Ok((
"",
Some((
'中',
Polyphone::from(vec![py(Initials::ZH, Finals::Ong, Tones::One)])
))
))
);
assert_eq!(
parse_line("U+4E2D: zhōng,zhòng\n"),
Ok((
"",
Some((
'中',
Polyphone::from(vec![
py(Initials::ZH, Finals::Ong, Tones::One),
py(Initials::ZH, Finals::Ong, Tones::Four)
])
))
))
);
assert_eq!(
parse_line("U+4E2D: zhōng,zhòng # comment\n"),
Ok((
"",
Some((
'中',
Polyphone::from(vec![
py(Initials::ZH, Finals::Ong, Tones::One),
py(Initials::ZH, Finals::Ong, Tones::Four),
])
))
))
);
}
#[test]
fn parse_empty_line() {
assert_eq!(empty_line("\n"), Ok(("", None)));
assert_eq!(empty_line(" \t\n"), Ok(("", None)));
assert!(matches!(empty_line("foo"), Err(_)));
}
#[test]
fn test_parse_lines() {
let parse = |s: &str| parse_lines(s).unwrap().collect::<Vec<_>>();
assert_eq!(parse(""), vec![]);
assert_eq!(parse("\n"), vec![]);
assert_eq!(
parse(
r#" # comment
U+4E2D: zhōng
U+3007: líng,yuán,xīng # 〇
"#
),
vec![
(
'中',
Polyphone(py(Initials::ZH, Finals::Ong, Tones::One).into(), 0, 0)
),
(
'〇',
Polyphone(
py(Initials::L, Finals::Ing, Tones::Two).into(),
py(Initials::Y, Finals::Uan, Tones::Two).into(),
py(Initials::X, Finals::Ing, Tones::One).into()
)
),
]
);
}
#[test]
fn test_parse_db() {
let count = parse_lines(include_str!("../pinyin.txt")).unwrap().count();
assert!(count > 20902, "count = {}", count);
let db = parse_db(include_str!("../pinyin.txt")).unwrap();
let po: Pinyin = db.get('𰻞').unwrap().into();
assert_eq!(po, py(Initials::B, Finals::Iang, Tones::Two));
}
}