use crate::shared::{comments, decode_jis_radical};
use encoding::{codec::japanese::EUCJPEncoding, DecoderTrap, Encoding};
use kradical_jis::jis212_to_utf8;
use nom::{
    branch::alt,
    bytes::complete::{tag, take, take_while, take_while1, take_while_m_n},
    character::{complete::space0, is_alphanumeric, is_digit},
    combinator::{eof, map, map_res, success, value},
    multi::many_till,
    sequence::{pair, separated_pair, terminated, tuple},
    IResult,
};
use std::{path::Path, string::FromUtf8Error};
use thiserror::Error;
use unicode_segmentation::UnicodeSegmentation;
18
// Unit tests live in the sibling `tests` module and are only compiled for test builds.
#[cfg(test)]
mod tests;
21
22#[derive(Debug, Error)]
24pub enum RadkError {
25 #[error("Could not parse number of strokes as u8")]
27 Strokes,
28
29 #[error("Could not parse alternate representation as a glyph")]
31 NotGlyph,
32
33 #[error("Invalid kanji line")]
35 EucJp,
36
37 #[error("Error while parsing kradfile")]
39 Parse,
40
41 #[error("Error while reading kradfile")]
43 Io(#[from] std::io::Error),
44}
45
46#[derive(Debug, Clone, PartialEq, Eq, Hash)]
48pub struct Radical {
49 pub glyph: String,
51
52 pub strokes: u8,
54
55 pub alternate: Alternate,
57}
58
59#[derive(Debug, Clone, PartialEq, Eq, Hash)]
61pub struct Membership {
62 pub radical: Radical,
64
65 pub kanji: Vec<String>,
67}
68
/// The optional alternate representation given at the end of a radical line.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Alternate {
    /// An alphanumeric token that was not usable as a glyph code, kept verbatim.
    Image(String),

    /// A glyph obtained by looking up a four-digit hexadecimal code.
    Glyph(String),

    /// The radical line carried no alternate field.
    None,
}
81
82type RadkResult = Result<Vec<Membership>, RadkError>;
83
84pub fn parse_file<P: AsRef<Path>>(path: P) -> RadkResult {
91 parse_file_implementation(path.as_ref())
92}
93
94fn parse_file_implementation(path: &Path) -> RadkResult {
96 std::fs::read(path)
97 .map_err(|err| err.into())
98 .and_then(|b| parse_bytes(&b))
99}
100
101pub fn parse_bytes(b: &[u8]) -> RadkResult {
108 lines(b).map(|(_i, o)| o).map_err(|_err| RadkError::Parse)
109}
110
111fn lines(b: &[u8]) -> IResult<&[u8], Vec<Membership>> {
112 map(many_till(kanji, eof), |(kanji, _)| kanji)(b)
113}
114
115fn kanji(b: &[u8]) -> IResult<&[u8], Membership> {
116 map(
117 pair(comments, separated_pair(ident_line, tag("\n"), kanji_lines)),
118 |(_, (ident, kanji))| Membership {
119 radical: ident,
120 kanji,
121 },
122 )(b)
123}
124
125fn kanji_lines(b: &[u8]) -> IResult<&[u8], Vec<String>> {
126 map_res(take_while(is_eucjp_or_space), from_kanji_line)(b)
127}
128
/// Returns `true` for bytes that may appear inside the kanji lines of an
/// entry: any non-ASCII byte (treated as part of an EUC-JP sequence) or an
/// ASCII whitespace byte.
fn is_eucjp_or_space(b: u8) -> bool {
    !b.is_ascii() || b.is_ascii_whitespace()
}
132
133fn from_kanji_line(b: &[u8]) -> Result<Vec<String>, RadkError> {
134 Ok(EUCJPEncoding
135 .decode(b, DecoderTrap::Replace)
136 .map_err(|_err| RadkError::EucJp)?
137 .graphemes(true)
138 .filter_map(|s| {
139 if s.chars().take(1).any(|c| c.is_ascii_whitespace()) && s.chars().count() == 1 {
140 None
141 } else {
142 Some(s.into())
143 }
144 })
145 .collect())
146}
147
148fn ident_line(b: &[u8]) -> IResult<&[u8], Radical> {
149 map(
150 tuple((ident_line_token, radical, strokes, alternate)),
151 |(_, radical, strokes, alternate)| Radical {
152 glyph: radical,
153 strokes,
154 alternate,
155 },
156 )(b)
157}
158
159fn alternate(b: &[u8]) -> IResult<&[u8], Alternate> {
160 alt((hex, image, success(Alternate::None)))(b)
161}
162
163fn image(b: &[u8]) -> IResult<&[u8], Alternate> {
164 map_res(take_while1(is_alphanumeric), from_image)(b)
165}
166
167fn from_image(b: &[u8]) -> Result<Alternate, FromUtf8Error> {
168 String::from_utf8(b.into()).map(Alternate::Image)
169}
170
171fn hex(b: &[u8]) -> IResult<&[u8], Alternate> {
172 map_res(take_while_m_n(4, 4, is_hex_digit), from_hex)(b)
173}
174
175fn from_hex(b: &[u8]) -> Result<Alternate, RadkError> {
176 let s = std::str::from_utf8(b).map_err(|_| RadkError::NotGlyph)?;
177 let code = u16::from_str_radix(&s, 16).map_err(|_| RadkError::NotGlyph)?;
178 jis212_to_utf8(code)
179 .ok_or(RadkError::NotGlyph)
180 .map(|s| Alternate::Glyph(s.to_string()))
181}
182
/// Returns `true` for the ASCII digits `0`-`9` and the upper-case letters
/// `A`-`F`. Lower-case hex letters are rejected.
fn is_hex_digit(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'F')
}
187
188fn ident_line_token(b: &[u8]) -> IResult<&[u8], ()> {
189 terminated(value((), tag("$")), space0)(b)
190}
191
192fn radical(b: &[u8]) -> IResult<&[u8], String> {
193 terminated(map_res(take(2u8), decode_jis_radical), space0)(b)
194}
195
196fn strokes(b: &[u8]) -> IResult<&[u8], u8> {
197 terminated(map_res(take_while(is_digit), parse_number), space0)(b)
198}
199
200fn parse_number(b: &[u8]) -> Result<u8, RadkError> {
201 String::from_utf8(b.into())
202 .map_err(|_err| RadkError::Strokes)?
203 .parse()
204 .map_err(|_err| RadkError::Strokes)
205}