lindera_dictionary/builder/
character_definition.rs

1use std::borrow::Cow;
2use std::collections::{BTreeSet, HashMap};
3use std::fs::File;
4use std::io::{self, Write};
5use std::path::Path;
6
7use byteorder::{ByteOrder, LittleEndian};
8use derive_builder::Builder;
9use encoding_rs::UTF_16LE;
10use log::debug;
11
12use crate::LinderaResult;
13use crate::decompress::Algorithm;
14use crate::dictionary::character_definition::{
15    CategoryData, CategoryId, CharacterDefinition, LookupTable,
16};
17use crate::error::LinderaErrorKind;
18use crate::util::{compress_write, read_file_with_encoding};
19
20const DEFAULT_CATEGORY_NAME: &str = "DEFAULT";
21
22fn ucs2_to_unicode(ucs2_codepoint: u16) -> LinderaResult<u32> {
23    let mut buf = [0u8; 2];
24    LittleEndian::write_u16(&mut buf[..], ucs2_codepoint);
25
26    let s = UTF_16LE.decode(&buf[..]).0.into_owned();
27    let chrs: Vec<char> = s.chars().collect();
28
29    match chrs.len() {
30        1 => Ok(chrs[0] as u32),
31        _ => Err(LinderaErrorKind::Parse
32            .with_error(anyhow::anyhow!("unusual char length"))
33            .add_context(format!(
34                "UCS2 codepoint 0x{:04x} resulted in {} characters",
35                ucs2_codepoint,
36                chrs.len()
37            ))),
38    }
39}
40
41fn parse_hex_codepoint(s: &str) -> LinderaResult<u32> {
42    let removed_0x = s.trim_start_matches("0x");
43    let ucs2_codepoint = u16::from_str_radix(removed_0x, 16).map_err(|err| {
44        LinderaErrorKind::Parse
45            .with_error(anyhow::anyhow!(err))
46            .add_context(format!("Invalid hexadecimal codepoint: '{s}'"))
47    })?;
48
49    ucs2_to_unicode(ucs2_codepoint)
50}
51
52#[derive(Builder, Debug)]
53#[builder(name = CharacterDefinitionBuilderOptions)]
54#[builder(build_fn(name = "builder"))]
55pub struct CharacterDefinitionBuilder {
56    #[builder(default = "\"UTF-8\".into()", setter(into))]
57    encoding: Cow<'static, str>,
58    #[builder(default = "Algorithm::Deflate")]
59    compress_algorithm: Algorithm,
60    #[builder(default = "Vec::new()")]
61    category_definition: Vec<CategoryData>,
62    #[builder(default = "HashMap::new()")]
63    category_index: HashMap<String, CategoryId>,
64    #[builder(default = "Vec::new()")]
65    char_ranges: Vec<(u32, u32, Vec<CategoryId>)>,
66}
67
68impl CharacterDefinitionBuilder {
69    pub fn category_id(&mut self, category_name: &str) -> CategoryId {
70        let num_categories = self.category_index.len();
71        *self
72            .category_index
73            .entry(category_name.to_string())
74            .or_insert(CategoryId(num_categories))
75    }
76
77    fn parse_range(&mut self, line: &str) -> LinderaResult<()> {
78        let fields: Vec<&str> = line.split_whitespace().collect();
79        let range_bounds: Vec<&str> = fields[0].split("..").collect();
80        let lower_bound: u32;
81        let higher_bound: u32;
82        match range_bounds.len() {
83            1 => {
84                lower_bound = parse_hex_codepoint(range_bounds[0])?;
85                higher_bound = lower_bound;
86            }
87            2 => {
88                lower_bound = parse_hex_codepoint(range_bounds[0])?;
89                // the right bound is included in the file.
90                higher_bound = parse_hex_codepoint(range_bounds[1])?;
91            }
92            _ => {
93                return Err(LinderaErrorKind::Content
94                    .with_error(anyhow::anyhow!("Invalid line: {line}"))
95                    .add_context(format!(
96                        "Character range should have format 'START..END' or 'SINGLE', got {} parts",
97                        range_bounds.len()
98                    )));
99            }
100        }
101        let category_ids: Vec<CategoryId> = fields[1..]
102            .iter()
103            .map(|category| self.category_id(category))
104            .collect();
105
106        self.char_ranges
107            .push((lower_bound, higher_bound, category_ids));
108
109        Ok(())
110    }
111
112    fn parse_category(&mut self, line: &str) -> LinderaResult<()> {
113        let fields = line.split_ascii_whitespace().collect::<Vec<&str>>();
114        if fields.len() != 4 {
115            return Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(
116                "Expected 4 fields. Got {} in {}",
117                fields.len(),
118                line
119            )).add_context("Character category definition requires: <category_name> <invoke> <group> <length>"));
120        }
121        let invoke = fields[1].parse::<u32>().map_err(|err| {
122            LinderaErrorKind::Parse
123                .with_error(anyhow::anyhow!(err))
124                .add_context(format!(
125                    "Invalid 'invoke' field value '{}' for category '{}'",
126                    fields[1], fields[0]
127                ))
128        })? == 1;
129        let group = fields[2].parse::<u32>().map_err(|err| {
130            LinderaErrorKind::Parse
131                .with_error(anyhow::anyhow!(err))
132                .add_context(format!(
133                    "Invalid 'group' field value '{}' for category '{}'",
134                    fields[2], fields[0]
135                ))
136        })? == 1;
137        let length = fields[3].parse::<u32>().map_err(|err| {
138            LinderaErrorKind::Parse
139                .with_error(anyhow::anyhow!(err))
140                .add_context(format!(
141                    "Invalid 'length' field value '{}' for category '{}'",
142                    fields[3], fields[0]
143                ))
144        })?;
145        let category_data = CategoryData {
146            invoke,
147            group,
148            length,
149        };
150        // force a category_id allocation
151        self.category_id(fields[0]);
152        self.category_definition.push(category_data);
153
154        Ok(())
155    }
156
157    fn parse(&mut self, content: &str) -> LinderaResult<()> {
158        for line in content.lines() {
159            let line_str = line
160                .split('#')
161                .next()
162                .ok_or_else(|| {
163                    LinderaErrorKind::Parse
164                        .with_error(anyhow::anyhow!("failed to parse line"))
165                        .add_context(format!("Malformed line in character definition: '{line}'"))
166                })?
167                .trim();
168            if line_str.is_empty() {
169                continue;
170            }
171            if line_str.starts_with("0x") {
172                self.parse_range(line_str)?;
173            } else {
174                self.parse_category(line_str)?;
175            }
176        }
177        Ok(())
178    }
179
180    fn lookup_categories(&self, c: u32, categories_buffer: &mut Vec<CategoryId>) {
181        categories_buffer.clear();
182        for (start, stop, category_ids) in &self.char_ranges {
183            if *start <= c && *stop >= c {
184                for cat in category_ids {
185                    if !categories_buffer.contains(cat) {
186                        categories_buffer.push(*cat);
187                    }
188                }
189            }
190        }
191        if categories_buffer.is_empty()
192            && let Some(default_category) = self.category_index.get(DEFAULT_CATEGORY_NAME)
193        {
194            categories_buffer.push(*default_category);
195        }
196    }
197
198    fn build_lookup_table(&self) -> LookupTable<CategoryId> {
199        let boundaries_set: BTreeSet<u32> = self
200            .char_ranges
201            .iter()
202            .flat_map(|(low, high, _)| vec![*low, *high + 1u32])
203            .collect();
204        let boundaries: Vec<u32> = boundaries_set.into_iter().collect();
205        LookupTable::from_fn(boundaries, &|c, buff| self.lookup_categories(c, buff))
206    }
207
208    fn get_character_definition(&self) -> CharacterDefinition {
209        let mut category_names: Vec<String> = (0..self.category_index.len())
210            .map(|_| String::new())
211            .collect();
212        for (category_name, category_id) in &self.category_index {
213            category_names[category_id.0] = category_name.clone();
214        }
215        let mapping = self.build_lookup_table();
216        CharacterDefinition {
217            category_definitions: self.category_definition.clone(),
218            category_names,
219            mapping,
220        }
221    }
222
223    pub fn build(
224        &mut self,
225        input_dir: &Path,
226        output_dir: &Path,
227    ) -> LinderaResult<CharacterDefinition> {
228        let char_def_path = input_dir.join("char.def");
229        debug!("reading {char_def_path:?}");
230        let char_def = read_file_with_encoding(&char_def_path, &self.encoding)?;
231
232        // let mut char_definitions_builder = CharacterDefinitionsBuilder::default();
233        self.parse(&char_def)?;
234        let char_definitions = self.get_character_definition().clone();
235
236        let mut chardef_buffer = Vec::new();
237        let bytes = rkyv::to_bytes::<rkyv::rancor::Error>(&char_definitions).map_err(|err| {
238            LinderaErrorKind::Serialize
239                .with_error(anyhow::anyhow!(err))
240                .add_context("Failed to serialize character definition data")
241        })?;
242        chardef_buffer.write_all(&bytes).map_err(|err| {
243            LinderaErrorKind::Io
244                .with_error(anyhow::anyhow!(err))
245                .add_context("Failed to write character definition data to buffer")
246        })?;
247
248        let wtr_chardef_path = output_dir.join(Path::new("char_def.bin"));
249        let mut wtr_chardef =
250            io::BufWriter::new(File::create(&wtr_chardef_path).map_err(|err| {
251                LinderaErrorKind::Io
252                    .with_error(anyhow::anyhow!(err))
253                    .add_context(format!(
254                        "Failed to create character definition output file: {wtr_chardef_path:?}"
255                    ))
256            })?);
257
258        compress_write(&chardef_buffer, self.compress_algorithm, &mut wtr_chardef)?;
259
260        wtr_chardef.flush().map_err(|err| {
261            LinderaErrorKind::Io
262                .with_error(anyhow::anyhow!(err))
263                .add_context(format!(
264                    "Failed to flush character definition output file: {wtr_chardef_path:?}"
265                ))
266        })?;
267
268        Ok(char_definitions)
269    }
270}