lindera_dictionary/builder/
character_definition.rs1use std::borrow::Cow;
2use std::collections::{BTreeSet, HashMap};
3use std::fs::File;
4use std::io::{self, Write};
5use std::path::Path;
6
7use byteorder::{ByteOrder, LittleEndian};
8use derive_builder::Builder;
9use encoding_rs::UTF_16LE;
10use log::debug;
11
12use crate::LinderaResult;
13use crate::dictionary::character_definition::{
14 CategoryData, CategoryId, CharacterDefinition, LookupTable,
15};
16use crate::error::LinderaErrorKind;
17use crate::util::{read_file_with_encoding, write_data};
18
19const DEFAULT_CATEGORY_NAME: &str = "DEFAULT";
20
21fn ucs2_to_unicode(ucs2_codepoint: u16) -> LinderaResult<u32> {
22 let mut buf = [0u8; 2];
23 LittleEndian::write_u16(&mut buf[..], ucs2_codepoint);
24
25 let s = UTF_16LE.decode(&buf[..]).0.into_owned();
26 let chrs: Vec<char> = s.chars().collect();
27
28 match chrs.len() {
29 1 => Ok(chrs[0] as u32),
30 _ => Err(LinderaErrorKind::Parse
31 .with_error(anyhow::anyhow!("unusual char length"))
32 .add_context(format!(
33 "UCS2 codepoint 0x{:04x} resulted in {} characters",
34 ucs2_codepoint,
35 chrs.len()
36 ))),
37 }
38}
39
40fn parse_hex_codepoint(s: &str) -> LinderaResult<u32> {
41 let removed_0x = s.trim_start_matches("0x");
42 let ucs2_codepoint = u16::from_str_radix(removed_0x, 16).map_err(|err| {
43 LinderaErrorKind::Parse
44 .with_error(anyhow::anyhow!(err))
45 .add_context(format!("Invalid hexadecimal codepoint: '{s}'"))
46 })?;
47
48 ucs2_to_unicode(ucs2_codepoint)
49}
50
51#[derive(Builder, Debug)]
52#[builder(name = CharacterDefinitionBuilderOptions)]
53#[builder(build_fn(name = "builder"))]
54pub struct CharacterDefinitionBuilder {
55 #[builder(default = "\"UTF-8\".into()", setter(into))]
56 encoding: Cow<'static, str>,
57 #[builder(default = "Vec::new()")]
58 category_definition: Vec<CategoryData>,
59 #[builder(default = "HashMap::new()")]
60 category_index: HashMap<String, CategoryId>,
61 #[builder(default = "Vec::new()")]
62 char_ranges: Vec<(u32, u32, Vec<CategoryId>)>,
63}
64
65impl CharacterDefinitionBuilder {
66 pub fn category_id(&mut self, category_name: &str) -> CategoryId {
67 let num_categories = self.category_index.len();
68 *self
69 .category_index
70 .entry(category_name.to_string())
71 .or_insert(CategoryId(num_categories))
72 }
73
74 fn parse_range(&mut self, line: &str) -> LinderaResult<()> {
75 let fields: Vec<&str> = line.split_whitespace().collect();
76 let range_bounds: Vec<&str> = fields[0].split("..").collect();
77 let lower_bound: u32;
78 let higher_bound: u32;
79 match range_bounds.len() {
80 1 => {
81 lower_bound = parse_hex_codepoint(range_bounds[0])?;
82 higher_bound = lower_bound;
83 }
84 2 => {
85 lower_bound = parse_hex_codepoint(range_bounds[0])?;
86 higher_bound = parse_hex_codepoint(range_bounds[1])?;
88 }
89 _ => {
90 return Err(LinderaErrorKind::Content
91 .with_error(anyhow::anyhow!("Invalid line: {line}"))
92 .add_context(format!(
93 "Character range should have format 'START..END' or 'SINGLE', got {} parts",
94 range_bounds.len()
95 )));
96 }
97 }
98 let category_ids: Vec<CategoryId> = fields[1..]
99 .iter()
100 .map(|category| self.category_id(category))
101 .collect();
102
103 self.char_ranges
104 .push((lower_bound, higher_bound, category_ids));
105
106 Ok(())
107 }
108
109 fn parse_category(&mut self, line: &str) -> LinderaResult<()> {
110 let fields = line.split_ascii_whitespace().collect::<Vec<&str>>();
111 if fields.len() != 4 {
112 return Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(
113 "Expected 4 fields. Got {} in {}",
114 fields.len(),
115 line
116 )).add_context("Character category definition requires: <category_name> <invoke> <group> <length>"));
117 }
118 let invoke = fields[1].parse::<u32>().map_err(|err| {
119 LinderaErrorKind::Parse
120 .with_error(anyhow::anyhow!(err))
121 .add_context(format!(
122 "Invalid 'invoke' field value '{}' for category '{}'",
123 fields[1], fields[0]
124 ))
125 })? == 1;
126 let group = fields[2].parse::<u32>().map_err(|err| {
127 LinderaErrorKind::Parse
128 .with_error(anyhow::anyhow!(err))
129 .add_context(format!(
130 "Invalid 'group' field value '{}' for category '{}'",
131 fields[2], fields[0]
132 ))
133 })? == 1;
134 let length = fields[3].parse::<u32>().map_err(|err| {
135 LinderaErrorKind::Parse
136 .with_error(anyhow::anyhow!(err))
137 .add_context(format!(
138 "Invalid 'length' field value '{}' for category '{}'",
139 fields[3], fields[0]
140 ))
141 })?;
142 let category_data = CategoryData {
143 invoke,
144 group,
145 length,
146 };
147 self.category_id(fields[0]);
149 self.category_definition.push(category_data);
150
151 Ok(())
152 }
153
154 fn parse(&mut self, content: &str) -> LinderaResult<()> {
155 for line in content.lines() {
156 let line_str = line
157 .split('#')
158 .next()
159 .ok_or_else(|| {
160 LinderaErrorKind::Parse
161 .with_error(anyhow::anyhow!("failed to parse line"))
162 .add_context(format!("Malformed line in character definition: '{line}'"))
163 })?
164 .trim();
165 if line_str.is_empty() {
166 continue;
167 }
168 if line_str.starts_with("0x") {
169 self.parse_range(line_str)?;
170 } else {
171 self.parse_category(line_str)?;
172 }
173 }
174 Ok(())
175 }
176
177 fn lookup_categories(&self, c: u32, categories_buffer: &mut Vec<CategoryId>) {
178 categories_buffer.clear();
179 for (start, stop, category_ids) in &self.char_ranges {
180 if *start <= c && *stop >= c {
181 for cat in category_ids {
182 if !categories_buffer.contains(cat) {
183 categories_buffer.push(*cat);
184 }
185 }
186 }
187 }
188 if categories_buffer.is_empty()
189 && let Some(default_category) = self.category_index.get(DEFAULT_CATEGORY_NAME)
190 {
191 categories_buffer.push(*default_category);
192 }
193 }
194
195 fn build_lookup_table(&self) -> LookupTable<CategoryId> {
196 let boundaries_set: BTreeSet<u32> = self
197 .char_ranges
198 .iter()
199 .flat_map(|(low, high, _)| vec![*low, *high + 1u32])
200 .collect();
201 let boundaries: Vec<u32> = boundaries_set.into_iter().collect();
202 LookupTable::from_fn(boundaries, &|c, buff| self.lookup_categories(c, buff))
203 }
204
205 fn get_character_definition(&self) -> CharacterDefinition {
206 let mut category_names: Vec<String> = (0..self.category_index.len())
207 .map(|_| String::new())
208 .collect();
209 for (category_name, category_id) in &self.category_index {
210 category_names[category_id.0] = category_name.clone();
211 }
212 let mapping = self.build_lookup_table();
213 CharacterDefinition {
214 category_definitions: self.category_definition.clone(),
215 category_names,
216 mapping,
217 }
218 }
219
220 pub fn build(
221 &mut self,
222 input_dir: &Path,
223 output_dir: &Path,
224 ) -> LinderaResult<CharacterDefinition> {
225 let char_def_path = input_dir.join("char.def");
226 debug!("reading {char_def_path:?}");
227 let char_def = read_file_with_encoding(&char_def_path, &self.encoding)?;
228
229 self.parse(&char_def)?;
231 let char_definitions = self.get_character_definition().clone();
232
233 let mut chardef_buffer = Vec::new();
234 let bytes = rkyv::to_bytes::<rkyv::rancor::Error>(&char_definitions).map_err(|err| {
235 LinderaErrorKind::Serialize
236 .with_error(anyhow::anyhow!(err))
237 .add_context("Failed to serialize character definition data")
238 })?;
239 chardef_buffer.write_all(&bytes).map_err(|err| {
240 LinderaErrorKind::Io
241 .with_error(anyhow::anyhow!(err))
242 .add_context("Failed to write character definition data to buffer")
243 })?;
244
245 let wtr_chardef_path = output_dir.join(Path::new("char_def.bin"));
246 let mut wtr_chardef =
247 io::BufWriter::new(File::create(&wtr_chardef_path).map_err(|err| {
248 LinderaErrorKind::Io
249 .with_error(anyhow::anyhow!(err))
250 .add_context(format!(
251 "Failed to create character definition output file: {wtr_chardef_path:?}"
252 ))
253 })?);
254
255 write_data(&chardef_buffer, &mut wtr_chardef)?;
256
257 wtr_chardef.flush().map_err(|err| {
258 LinderaErrorKind::Io
259 .with_error(anyhow::anyhow!(err))
260 .add_context(format!(
261 "Failed to flush character definition output file: {wtr_chardef_path:?}"
262 ))
263 })?;
264
265 Ok(char_definitions)
266 }
267}