Skip to main content

lindera_dictionary/builder/
character_definition.rs

1use std::borrow::Cow;
2use std::collections::{BTreeSet, HashMap};
3use std::fs::File;
4use std::io::{self, Write};
5use std::path::Path;
6
7use byteorder::{ByteOrder, LittleEndian};
8use derive_builder::Builder;
9use encoding_rs::UTF_16LE;
10use log::debug;
11
12use crate::LinderaResult;
13use crate::dictionary::character_definition::{
14    CategoryData, CategoryId, CharacterDefinition, LookupTable,
15};
16use crate::error::LinderaErrorKind;
17use crate::util::{read_file_with_encoding, write_data};
18
/// Name of the category used as a fallback for characters that are not
/// covered by any parsed character range.
19const DEFAULT_CATEGORY_NAME: &str = "DEFAULT";
20
21fn ucs2_to_unicode(ucs2_codepoint: u16) -> LinderaResult<u32> {
22    let mut buf = [0u8; 2];
23    LittleEndian::write_u16(&mut buf[..], ucs2_codepoint);
24
25    let s = UTF_16LE.decode(&buf[..]).0.into_owned();
26    let chrs: Vec<char> = s.chars().collect();
27
28    match chrs.len() {
29        1 => Ok(chrs[0] as u32),
30        _ => Err(LinderaErrorKind::Parse
31            .with_error(anyhow::anyhow!("unusual char length"))
32            .add_context(format!(
33                "UCS2 codepoint 0x{:04x} resulted in {} characters",
34                ucs2_codepoint,
35                chrs.len()
36            ))),
37    }
38}
39
40fn parse_hex_codepoint(s: &str) -> LinderaResult<u32> {
41    let removed_0x = s.trim_start_matches("0x");
42    let ucs2_codepoint = u16::from_str_radix(removed_0x, 16).map_err(|err| {
43        LinderaErrorKind::Parse
44            .with_error(anyhow::anyhow!(err))
45            .add_context(format!("Invalid hexadecimal codepoint: '{s}'"))
46    })?;
47
48    ucs2_to_unicode(ucs2_codepoint)
49}
50
51#[derive(Builder, Debug)]
52#[builder(name = CharacterDefinitionBuilderOptions)]
53#[builder(build_fn(name = "builder"))]
54pub struct CharacterDefinitionBuilder {
55    #[builder(default = "\"UTF-8\".into()", setter(into))]
56    encoding: Cow<'static, str>,
57    #[builder(default = "Vec::new()")]
58    category_definition: Vec<CategoryData>,
59    #[builder(default = "HashMap::new()")]
60    category_index: HashMap<String, CategoryId>,
61    #[builder(default = "Vec::new()")]
62    char_ranges: Vec<(u32, u32, Vec<CategoryId>)>,
63}
64
65impl CharacterDefinitionBuilder {
66    pub fn category_id(&mut self, category_name: &str) -> CategoryId {
67        let num_categories = self.category_index.len();
68        *self
69            .category_index
70            .entry(category_name.to_string())
71            .or_insert(CategoryId(num_categories))
72    }
73
74    fn parse_range(&mut self, line: &str) -> LinderaResult<()> {
75        let fields: Vec<&str> = line.split_whitespace().collect();
76        let range_bounds: Vec<&str> = fields[0].split("..").collect();
77        let lower_bound: u32;
78        let higher_bound: u32;
79        match range_bounds.len() {
80            1 => {
81                lower_bound = parse_hex_codepoint(range_bounds[0])?;
82                higher_bound = lower_bound;
83            }
84            2 => {
85                lower_bound = parse_hex_codepoint(range_bounds[0])?;
86                // the right bound is included in the file.
87                higher_bound = parse_hex_codepoint(range_bounds[1])?;
88            }
89            _ => {
90                return Err(LinderaErrorKind::Content
91                    .with_error(anyhow::anyhow!("Invalid line: {line}"))
92                    .add_context(format!(
93                        "Character range should have format 'START..END' or 'SINGLE', got {} parts",
94                        range_bounds.len()
95                    )));
96            }
97        }
98        let category_ids: Vec<CategoryId> = fields[1..]
99            .iter()
100            .map(|category| self.category_id(category))
101            .collect();
102
103        self.char_ranges
104            .push((lower_bound, higher_bound, category_ids));
105
106        Ok(())
107    }
108
109    fn parse_category(&mut self, line: &str) -> LinderaResult<()> {
110        let fields = line.split_ascii_whitespace().collect::<Vec<&str>>();
111        if fields.len() != 4 {
112            return Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(
113                "Expected 4 fields. Got {} in {}",
114                fields.len(),
115                line
116            )).add_context("Character category definition requires: <category_name> <invoke> <group> <length>"));
117        }
118        let invoke = fields[1].parse::<u32>().map_err(|err| {
119            LinderaErrorKind::Parse
120                .with_error(anyhow::anyhow!(err))
121                .add_context(format!(
122                    "Invalid 'invoke' field value '{}' for category '{}'",
123                    fields[1], fields[0]
124                ))
125        })? == 1;
126        let group = fields[2].parse::<u32>().map_err(|err| {
127            LinderaErrorKind::Parse
128                .with_error(anyhow::anyhow!(err))
129                .add_context(format!(
130                    "Invalid 'group' field value '{}' for category '{}'",
131                    fields[2], fields[0]
132                ))
133        })? == 1;
134        let length = fields[3].parse::<u32>().map_err(|err| {
135            LinderaErrorKind::Parse
136                .with_error(anyhow::anyhow!(err))
137                .add_context(format!(
138                    "Invalid 'length' field value '{}' for category '{}'",
139                    fields[3], fields[0]
140                ))
141        })?;
142        let category_data = CategoryData {
143            invoke,
144            group,
145            length,
146        };
147        // force a category_id allocation
148        self.category_id(fields[0]);
149        self.category_definition.push(category_data);
150
151        Ok(())
152    }
153
154    fn parse(&mut self, content: &str) -> LinderaResult<()> {
155        for line in content.lines() {
156            let line_str = line
157                .split('#')
158                .next()
159                .ok_or_else(|| {
160                    LinderaErrorKind::Parse
161                        .with_error(anyhow::anyhow!("failed to parse line"))
162                        .add_context(format!("Malformed line in character definition: '{line}'"))
163                })?
164                .trim();
165            if line_str.is_empty() {
166                continue;
167            }
168            if line_str.starts_with("0x") {
169                self.parse_range(line_str)?;
170            } else {
171                self.parse_category(line_str)?;
172            }
173        }
174        Ok(())
175    }
176
177    fn lookup_categories(&self, c: u32, categories_buffer: &mut Vec<CategoryId>) {
178        categories_buffer.clear();
179        for (start, stop, category_ids) in &self.char_ranges {
180            if *start <= c && *stop >= c {
181                for cat in category_ids {
182                    if !categories_buffer.contains(cat) {
183                        categories_buffer.push(*cat);
184                    }
185                }
186            }
187        }
188        if categories_buffer.is_empty()
189            && let Some(default_category) = self.category_index.get(DEFAULT_CATEGORY_NAME)
190        {
191            categories_buffer.push(*default_category);
192        }
193    }
194
195    fn build_lookup_table(&self) -> LookupTable<CategoryId> {
196        let boundaries_set: BTreeSet<u32> = self
197            .char_ranges
198            .iter()
199            .flat_map(|(low, high, _)| vec![*low, *high + 1u32])
200            .collect();
201        let boundaries: Vec<u32> = boundaries_set.into_iter().collect();
202        LookupTable::from_fn(boundaries, &|c, buff| self.lookup_categories(c, buff))
203    }
204
205    fn get_character_definition(&self) -> CharacterDefinition {
206        let mut category_names: Vec<String> = (0..self.category_index.len())
207            .map(|_| String::new())
208            .collect();
209        for (category_name, category_id) in &self.category_index {
210            category_names[category_id.0] = category_name.clone();
211        }
212        let mapping = self.build_lookup_table();
213        CharacterDefinition {
214            category_definitions: self.category_definition.clone(),
215            category_names,
216            mapping,
217        }
218    }
219
220    pub fn build(
221        &mut self,
222        input_dir: &Path,
223        output_dir: &Path,
224    ) -> LinderaResult<CharacterDefinition> {
225        let char_def_path = input_dir.join("char.def");
226        debug!("reading {char_def_path:?}");
227        let char_def = read_file_with_encoding(&char_def_path, &self.encoding)?;
228
229        // let mut char_definitions_builder = CharacterDefinitionsBuilder::default();
230        self.parse(&char_def)?;
231        let char_definitions = self.get_character_definition().clone();
232
233        let mut chardef_buffer = Vec::new();
234        let bytes = rkyv::to_bytes::<rkyv::rancor::Error>(&char_definitions).map_err(|err| {
235            LinderaErrorKind::Serialize
236                .with_error(anyhow::anyhow!(err))
237                .add_context("Failed to serialize character definition data")
238        })?;
239        chardef_buffer.write_all(&bytes).map_err(|err| {
240            LinderaErrorKind::Io
241                .with_error(anyhow::anyhow!(err))
242                .add_context("Failed to write character definition data to buffer")
243        })?;
244
245        let wtr_chardef_path = output_dir.join(Path::new("char_def.bin"));
246        let mut wtr_chardef =
247            io::BufWriter::new(File::create(&wtr_chardef_path).map_err(|err| {
248                LinderaErrorKind::Io
249                    .with_error(anyhow::anyhow!(err))
250                    .add_context(format!(
251                        "Failed to create character definition output file: {wtr_chardef_path:?}"
252                    ))
253            })?);
254
255        write_data(&chardef_buffer, &mut wtr_chardef)?;
256
257        wtr_chardef.flush().map_err(|err| {
258            LinderaErrorKind::Io
259                .with_error(anyhow::anyhow!(err))
260                .add_context(format!(
261                    "Failed to flush character definition output file: {wtr_chardef_path:?}"
262                ))
263        })?;
264
265        Ok(char_definitions)
266    }
267}