lindera_dictionary/builder/
character_definition.rs1use std::borrow::Cow;
2use std::collections::{BTreeSet, HashMap};
3use std::fs::File;
4use std::io::{self, Write};
5use std::path::Path;
6
7use byteorder::{ByteOrder, LittleEndian};
8use derive_builder::Builder;
9use encoding_rs::UTF_16LE;
10use log::debug;
11
12use crate::LinderaResult;
13use crate::decompress::Algorithm;
14use crate::dictionary::character_definition::{
15 CategoryData, CategoryId, CharacterDefinition, LookupTable,
16};
17use crate::error::LinderaErrorKind;
18use crate::util::{compress_write, read_file_with_encoding};
19
20const DEFAULT_CATEGORY_NAME: &str = "DEFAULT";
21
22fn ucs2_to_unicode(ucs2_codepoint: u16) -> LinderaResult<u32> {
23 let mut buf = [0u8; 2];
24 LittleEndian::write_u16(&mut buf[..], ucs2_codepoint);
25
26 let s = UTF_16LE.decode(&buf[..]).0.into_owned();
27 let chrs: Vec<char> = s.chars().collect();
28
29 match chrs.len() {
30 1 => Ok(chrs[0] as u32),
31 _ => Err(LinderaErrorKind::Parse
32 .with_error(anyhow::anyhow!("unusual char length"))
33 .add_context(format!(
34 "UCS2 codepoint 0x{:04x} resulted in {} characters",
35 ucs2_codepoint,
36 chrs.len()
37 ))),
38 }
39}
40
41fn parse_hex_codepoint(s: &str) -> LinderaResult<u32> {
42 let removed_0x = s.trim_start_matches("0x");
43 let ucs2_codepoint = u16::from_str_radix(removed_0x, 16).map_err(|err| {
44 LinderaErrorKind::Parse
45 .with_error(anyhow::anyhow!(err))
46 .add_context(format!("Invalid hexadecimal codepoint: '{s}'"))
47 })?;
48
49 ucs2_to_unicode(ucs2_codepoint)
50}
51
52#[derive(Builder, Debug)]
53#[builder(name = CharacterDefinitionBuilderOptions)]
54#[builder(build_fn(name = "builder"))]
55pub struct CharacterDefinitionBuilder {
56 #[builder(default = "\"UTF-8\".into()", setter(into))]
57 encoding: Cow<'static, str>,
58 #[builder(default = "Algorithm::Deflate")]
59 compress_algorithm: Algorithm,
60 #[builder(default = "Vec::new()")]
61 category_definition: Vec<CategoryData>,
62 #[builder(default = "HashMap::new()")]
63 category_index: HashMap<String, CategoryId>,
64 #[builder(default = "Vec::new()")]
65 char_ranges: Vec<(u32, u32, Vec<CategoryId>)>,
66}
67
68impl CharacterDefinitionBuilder {
69 pub fn category_id(&mut self, category_name: &str) -> CategoryId {
70 let num_categories = self.category_index.len();
71 *self
72 .category_index
73 .entry(category_name.to_string())
74 .or_insert(CategoryId(num_categories))
75 }
76
77 fn parse_range(&mut self, line: &str) -> LinderaResult<()> {
78 let fields: Vec<&str> = line.split_whitespace().collect();
79 let range_bounds: Vec<&str> = fields[0].split("..").collect();
80 let lower_bound: u32;
81 let higher_bound: u32;
82 match range_bounds.len() {
83 1 => {
84 lower_bound = parse_hex_codepoint(range_bounds[0])?;
85 higher_bound = lower_bound;
86 }
87 2 => {
88 lower_bound = parse_hex_codepoint(range_bounds[0])?;
89 higher_bound = parse_hex_codepoint(range_bounds[1])?;
91 }
92 _ => {
93 return Err(LinderaErrorKind::Content
94 .with_error(anyhow::anyhow!("Invalid line: {line}"))
95 .add_context(format!(
96 "Character range should have format 'START..END' or 'SINGLE', got {} parts",
97 range_bounds.len()
98 )));
99 }
100 }
101 let category_ids: Vec<CategoryId> = fields[1..]
102 .iter()
103 .map(|category| self.category_id(category))
104 .collect();
105
106 self.char_ranges
107 .push((lower_bound, higher_bound, category_ids));
108
109 Ok(())
110 }
111
112 fn parse_category(&mut self, line: &str) -> LinderaResult<()> {
113 let fields = line.split_ascii_whitespace().collect::<Vec<&str>>();
114 if fields.len() != 4 {
115 return Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(
116 "Expected 4 fields. Got {} in {}",
117 fields.len(),
118 line
119 )).add_context("Character category definition requires: <category_name> <invoke> <group> <length>"));
120 }
121 let invoke = fields[1].parse::<u32>().map_err(|err| {
122 LinderaErrorKind::Parse
123 .with_error(anyhow::anyhow!(err))
124 .add_context(format!(
125 "Invalid 'invoke' field value '{}' for category '{}'",
126 fields[1], fields[0]
127 ))
128 })? == 1;
129 let group = fields[2].parse::<u32>().map_err(|err| {
130 LinderaErrorKind::Parse
131 .with_error(anyhow::anyhow!(err))
132 .add_context(format!(
133 "Invalid 'group' field value '{}' for category '{}'",
134 fields[2], fields[0]
135 ))
136 })? == 1;
137 let length = fields[3].parse::<u32>().map_err(|err| {
138 LinderaErrorKind::Parse
139 .with_error(anyhow::anyhow!(err))
140 .add_context(format!(
141 "Invalid 'length' field value '{}' for category '{}'",
142 fields[3], fields[0]
143 ))
144 })?;
145 let category_data = CategoryData {
146 invoke,
147 group,
148 length,
149 };
150 self.category_id(fields[0]);
152 self.category_definition.push(category_data);
153
154 Ok(())
155 }
156
157 fn parse(&mut self, content: &str) -> LinderaResult<()> {
158 for line in content.lines() {
159 let line_str = line
160 .split('#')
161 .next()
162 .ok_or_else(|| {
163 LinderaErrorKind::Parse
164 .with_error(anyhow::anyhow!("failed to parse line"))
165 .add_context(format!("Malformed line in character definition: '{line}'"))
166 })?
167 .trim();
168 if line_str.is_empty() {
169 continue;
170 }
171 if line_str.starts_with("0x") {
172 self.parse_range(line_str)?;
173 } else {
174 self.parse_category(line_str)?;
175 }
176 }
177 Ok(())
178 }
179
180 fn lookup_categories(&self, c: u32, categories_buffer: &mut Vec<CategoryId>) {
181 categories_buffer.clear();
182 for (start, stop, category_ids) in &self.char_ranges {
183 if *start <= c && *stop >= c {
184 for cat in category_ids {
185 if !categories_buffer.contains(cat) {
186 categories_buffer.push(*cat);
187 }
188 }
189 }
190 }
191 if categories_buffer.is_empty()
192 && let Some(default_category) = self.category_index.get(DEFAULT_CATEGORY_NAME)
193 {
194 categories_buffer.push(*default_category);
195 }
196 }
197
198 fn build_lookup_table(&self) -> LookupTable<CategoryId> {
199 let boundaries_set: BTreeSet<u32> = self
200 .char_ranges
201 .iter()
202 .flat_map(|(low, high, _)| vec![*low, *high + 1u32])
203 .collect();
204 let boundaries: Vec<u32> = boundaries_set.into_iter().collect();
205 LookupTable::from_fn(boundaries, &|c, buff| self.lookup_categories(c, buff))
206 }
207
208 fn get_character_definition(&self) -> CharacterDefinition {
209 let mut category_names: Vec<String> = (0..self.category_index.len())
210 .map(|_| String::new())
211 .collect();
212 for (category_name, category_id) in &self.category_index {
213 category_names[category_id.0] = category_name.clone();
214 }
215 let mapping = self.build_lookup_table();
216 CharacterDefinition {
217 category_definitions: self.category_definition.clone(),
218 category_names,
219 mapping,
220 }
221 }
222
223 pub fn build(
224 &mut self,
225 input_dir: &Path,
226 output_dir: &Path,
227 ) -> LinderaResult<CharacterDefinition> {
228 let char_def_path = input_dir.join("char.def");
229 debug!("reading {char_def_path:?}");
230 let char_def = read_file_with_encoding(&char_def_path, &self.encoding)?;
231
232 self.parse(&char_def)?;
234 let char_definitions = self.get_character_definition().clone();
235
236 let mut chardef_buffer = Vec::new();
237 let bytes = rkyv::to_bytes::<rkyv::rancor::Error>(&char_definitions).map_err(|err| {
238 LinderaErrorKind::Serialize
239 .with_error(anyhow::anyhow!(err))
240 .add_context("Failed to serialize character definition data")
241 })?;
242 chardef_buffer.write_all(&bytes).map_err(|err| {
243 LinderaErrorKind::Io
244 .with_error(anyhow::anyhow!(err))
245 .add_context("Failed to write character definition data to buffer")
246 })?;
247
248 let wtr_chardef_path = output_dir.join(Path::new("char_def.bin"));
249 let mut wtr_chardef =
250 io::BufWriter::new(File::create(&wtr_chardef_path).map_err(|err| {
251 LinderaErrorKind::Io
252 .with_error(anyhow::anyhow!(err))
253 .add_context(format!(
254 "Failed to create character definition output file: {wtr_chardef_path:?}"
255 ))
256 })?);
257
258 compress_write(&chardef_buffer, self.compress_algorithm, &mut wtr_chardef)?;
259
260 wtr_chardef.flush().map_err(|err| {
261 LinderaErrorKind::Io
262 .with_error(anyhow::anyhow!(err))
263 .add_context(format!(
264 "Failed to flush character definition output file: {wtr_chardef_path:?}"
265 ))
266 })?;
267
268 Ok(char_definitions)
269 }
270}