kradical_parsing/
radk.rs

1//! Parser for `radkfile` and `radkfile2`.
2
3use crate::shared::{comments, decode_jis_radical};
4use encoding::{codec::japanese::EUCJPEncoding, DecoderTrap, Encoding};
5use kradical_jis::jis212_to_utf8;
6use nom::{
7    branch::alt,
8    bytes::complete::{tag, take, take_while, take_while1, take_while_m_n},
9    character::{complete::space0, is_alphanumeric, is_digit},
10    combinator::{eof, map, map_res, success, value},
11    multi::many_till,
12    sequence::{pair, separated_pair, terminated, tuple},
13    IResult,
14};
15use std::{path::Path, string::FromUtf8Error};
16use thiserror::Error;
17use unicode_segmentation::UnicodeSegmentation;
18
19#[cfg(test)]
20mod tests;
21
22/// Enumerates the possible errors during parsing
23#[derive(Debug, Error)]
24pub enum RadkError {
25    /// Could not parse number of strokes as u8
26    #[error("Could not parse number of strokes as u8")]
27    Strokes,
28
29    /// Could not parse alternate representation as a glyph
30    #[error("Could not parse alternate representation as a glyph")]
31    NotGlyph,
32
33    /// Invalid kanji line
34    #[error("Invalid kanji line")]
35    EucJp,
36
37    /// Error while parsing kradfile
38    #[error("Error while parsing kradfile")]
39    Parse,
40
41    /// Error while reading kradfile
42    #[error("Error while reading kradfile")]
43    Io(#[from] std::io::Error),
44}
45
46/// Information about a kanji radical
47#[derive(Debug, Clone, PartialEq, Eq, Hash)]
48pub struct Radical {
49    /// The UTF-8 character most closely matching the radical
50    pub glyph: String,
51
52    /// The number of strokes used to draw the radical
53    pub strokes: u8,
54
55    /// Alternate representations for the radical
56    pub alternate: Alternate,
57}
58
59/// Describes which kanji a given radical belongs to
60#[derive(Debug, Clone, PartialEq, Eq, Hash)]
61pub struct Membership {
62    /// The radical
63    pub radical: Radical,
64
65    /// The kanji containing the radical
66    pub kanji: Vec<String>,
67}
68
/// Ways of depicting a radical other than its UTF-8 glyph
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum Alternate {
    /// Name of an image hosted on the WWWJDIC website
    Image(String),

    /// A different glyph that depicts the radical better
    Glyph(String),

    /// The radical has no alternate representation
    None,
}
81
82type RadkResult = Result<Vec<Membership>, RadkError>;
83
84/// Parses a radkfile or radkfile2 and returns
85/// the list of kanji radical memberships
86///
87/// # Arguments
88///
89/// * `path` - A path to the radkfile
90pub fn parse_file<P: AsRef<Path>>(path: P) -> RadkResult {
91    parse_file_implementation(path.as_ref())
92}
93
94// Monomorphisation bloat avoidal splitting
95fn parse_file_implementation(path: &Path) -> RadkResult {
96    std::fs::read(path)
97        .map_err(|err| err.into())
98        .and_then(|b| parse_bytes(&b))
99}
100
101/// Parses the contents of a radkfile or radkfile2 and returns
102/// the list of kanji radical memberships
103///
104/// # Arguments
105///
106/// * `b` - The bytes to parse
107pub fn parse_bytes(b: &[u8]) -> RadkResult {
108    lines(b).map(|(_i, o)| o).map_err(|_err| RadkError::Parse)
109}
110
111fn lines(b: &[u8]) -> IResult<&[u8], Vec<Membership>> {
112    map(many_till(kanji, eof), |(kanji, _)| kanji)(b)
113}
114
115fn kanji(b: &[u8]) -> IResult<&[u8], Membership> {
116    map(
117        pair(comments, separated_pair(ident_line, tag("\n"), kanji_lines)),
118        |(_, (ident, kanji))| Membership {
119            radical: ident,
120            kanji,
121        },
122    )(b)
123}
124
125fn kanji_lines(b: &[u8]) -> IResult<&[u8], Vec<String>> {
126    map_res(take_while(is_eucjp_or_space), from_kanji_line)(b)
127}
128
/// Returns `true` for ASCII whitespace and for every non-ASCII byte,
/// i.e. the bytes that are allowed inside a block of kanji lines.
fn is_eucjp_or_space(b: u8) -> bool {
    match b {
        0x80..=0xFF => true,
        ascii => ascii.is_ascii_whitespace(),
    }
}
132
133fn from_kanji_line(b: &[u8]) -> Result<Vec<String>, RadkError> {
134    Ok(EUCJPEncoding
135        .decode(b, DecoderTrap::Replace)
136        .map_err(|_err| RadkError::EucJp)?
137        .graphemes(true)
138        .filter_map(|s| {
139            if s.chars().take(1).any(|c| c.is_ascii_whitespace()) && s.chars().count() == 1 {
140                None
141            } else {
142                Some(s.into())
143            }
144        })
145        .collect())
146}
147
148fn ident_line(b: &[u8]) -> IResult<&[u8], Radical> {
149    map(
150        tuple((ident_line_token, radical, strokes, alternate)),
151        |(_, radical, strokes, alternate)| Radical {
152            glyph: radical,
153            strokes,
154            alternate,
155        },
156    )(b)
157}
158
159fn alternate(b: &[u8]) -> IResult<&[u8], Alternate> {
160    alt((hex, image, success(Alternate::None)))(b)
161}
162
163fn image(b: &[u8]) -> IResult<&[u8], Alternate> {
164    map_res(take_while1(is_alphanumeric), from_image)(b)
165}
166
167fn from_image(b: &[u8]) -> Result<Alternate, FromUtf8Error> {
168    String::from_utf8(b.into()).map(Alternate::Image)
169}
170
171fn hex(b: &[u8]) -> IResult<&[u8], Alternate> {
172    map_res(take_while_m_n(4, 4, is_hex_digit), from_hex)(b)
173}
174
175fn from_hex(b: &[u8]) -> Result<Alternate, RadkError> {
176    let s = std::str::from_utf8(b).map_err(|_| RadkError::NotGlyph)?;
177    let code = u16::from_str_radix(&s, 16).map_err(|_| RadkError::NotGlyph)?;
178    jis212_to_utf8(code)
179        .ok_or(RadkError::NotGlyph)
180        .map(|s| Alternate::Glyph(s.to_string()))
181}
182
/// Returns `true` for the bytes `0`-`9` and upper-case `A`-`F`; lower-case
/// hex digits are rejected.
fn is_hex_digit(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'F')
}
187
188fn ident_line_token(b: &[u8]) -> IResult<&[u8], ()> {
189    terminated(value((), tag("$")), space0)(b)
190}
191
192fn radical(b: &[u8]) -> IResult<&[u8], String> {
193    terminated(map_res(take(2u8), decode_jis_radical), space0)(b)
194}
195
196fn strokes(b: &[u8]) -> IResult<&[u8], u8> {
197    terminated(map_res(take_while(is_digit), parse_number), space0)(b)
198}
199
200fn parse_number(b: &[u8]) -> Result<u8, RadkError> {
201    String::from_utf8(b.into())
202        .map_err(|_err| RadkError::Strokes)?
203        .parse()
204        .map_err(|_err| RadkError::Strokes)
205}