precis_tools/generators/
ucd_generator.rs

1use crate::common;
2use crate::error::Error;
3use crate::file_writer;
4use crate::generators::CodeGen;
5use crate::ucd_parsers;
6use std::collections::HashSet;
7use std::fs::File;
8use std::path::{Path, PathBuf};
9use ucd_parse::Codepoints;
10use ucd_parse::CoreProperty;
11use ucd_parse::Property;
12use ucd_parse::Script;
13use ucd_parse::UnicodeDataDecompositionTag;
14use ucd_parsers::DerivedJoiningType;
15use ucd_parsers::HangulSyllableType;
16
17fn parse_unicode_file<U: ucd_parse::UcdFile, F>(path: &Path, mut f: F) -> Result<(), Error>
18where
19    F: FnMut(&U) -> Result<(), Error>,
20{
21    let lines: Vec<U> = ucd_parse::parse(path)?;
22    for line in lines.iter() {
23        f(line)?;
24    }
25    Ok(())
26}
27
28/// Generator that aggregates other [`UcdCodeGen`] elements.
29pub struct UcdFileGen {
30    ucd_path: PathBuf,
31    generators: Vec<Box<dyn UcdCodeGen>>,
32}
33
34impl UcdFileGen {
35    /// Creates a new `UcdFileGen` element.
36    /// # Arguments:
37    /// `path` - path where `UCD` files are stored
38    pub fn new<P: AsRef<Path>>(path: P) -> Self {
39        let path = path.as_ref();
40        Self {
41            ucd_path: path.to_path_buf(),
42            generators: Vec::new(),
43        }
44    }
45
46    /// Adds a [`UcdCodeGen`] element.
47    pub fn add(&mut self, gen: Box<dyn UcdCodeGen>) {
48        self.generators.push(gen);
49    }
50}
51
52impl CodeGen for UcdFileGen {
53    fn generate_code(&mut self, file: &mut File) -> Result<(), Error> {
54        let it = self.generators.iter_mut();
55        for gen in it {
56            gen.parse_unicode_file(&self.ucd_path)?;
57            gen.generate_code(file)?;
58        }
59        Ok(())
60    }
61}
62
63/// Trait implemented by all elements that are able to parse `UCD` files.
64pub trait UcdCodeGen: CodeGen {
65    /// Parses a `UCD` file.
66    /// # Arguments:
67    /// `ucd_path` - Path where `UCD` file is stored.
68    fn parse_unicode_file(&mut self, ucd_path: &Path) -> Result<(), Error>;
69}
70
71/// Generic trait used by parsers to generate code.
72pub trait UcdLineParser<U>: CodeGen {
73    /// Process an entry in the `UCD` file.
74    /// # Argument:
75    /// `line` - Represents a line in the `UCD` file.
76    fn process_entry(&mut self, line: &U) -> Result<(), Error>;
77}
78
/// Generator that creates tables of Unicode code points as a result
/// of parsing properties in the `UCD` files.
pub struct UcdTableGen {
    // Value an entry must carry (general category, property or script name,
    // depending on the parsed file) for its code points to be collected.
    name: String,
    // Name handed to the file writer for the generated table.
    table_name: String,
    // Code points collected so far.
    cps: HashSet<u32>,
}
86
87impl UcdTableGen {
88    /// Creates a new [`UcdTableGen`]
89    pub fn new(name: &str, table_name: &str) -> Self {
90        Self {
91            name: String::from(name),
92            table_name: String::from(table_name),
93            cps: HashSet::new(),
94        }
95    }
96}
97
98impl CodeGen for UcdTableGen {
99    fn generate_code(&mut self, file: &mut File) -> Result<(), Error> {
100        file_writer::generate_code_from_hashset(file, &self.table_name, &self.cps)
101    }
102}
103
104impl UcdLineParser<ucd_parsers::UnicodeData> for UcdTableGen {
105    fn process_entry(&mut self, udata: &ucd_parsers::UnicodeData) -> Result<(), Error> {
106        if self.name == udata.general_category {
107            match udata.codepoints {
108                Codepoints::Single(ref cp) => common::insert_codepoint(cp.value(), &mut self.cps)?,
109                Codepoints::Range(ref r) => common::insert_codepoint_range(r, &mut self.cps)?,
110            }
111        }
112        Ok(())
113    }
114}
115
116impl UcdLineParser<HangulSyllableType> for UcdTableGen {
117    fn process_entry(&mut self, line: &HangulSyllableType) -> Result<(), Error> {
118        if self.name == line.prop.property {
119            match line.prop.codepoints {
120                Codepoints::Single(cp) => common::insert_codepoint(cp.value(), &mut self.cps)?,
121                Codepoints::Range(r) => common::insert_codepoint_range(&r, &mut self.cps)?,
122            }
123        }
124        Ok(())
125    }
126}
127
128impl UcdLineParser<Property> for UcdTableGen {
129    fn process_entry(&mut self, line: &Property) -> Result<(), Error> {
130        if self.name == line.property {
131            match line.codepoints {
132                Codepoints::Single(cp) => common::insert_codepoint(cp.value(), &mut self.cps)?,
133                Codepoints::Range(r) => common::insert_codepoint_range(&r, &mut self.cps)?,
134            }
135        }
136        Ok(())
137    }
138}
139
140impl UcdLineParser<CoreProperty> for UcdTableGen {
141    fn process_entry(&mut self, line: &CoreProperty) -> Result<(), Error> {
142        if self.name == line.property {
143            match line.codepoints {
144                Codepoints::Single(cp) => common::insert_codepoint(cp.value(), &mut self.cps)?,
145                Codepoints::Range(r) => common::insert_codepoint_range(&r, &mut self.cps)?,
146            }
147        }
148        Ok(())
149    }
150}
151
152impl UcdLineParser<Script> for UcdTableGen {
153    fn process_entry(&mut self, line: &Script) -> Result<(), Error> {
154        if self.name == line.script {
155            match line.codepoints {
156                Codepoints::Single(ref cp) => common::insert_codepoint(cp.value(), &mut self.cps)?,
157                Codepoints::Range(ref r) => common::insert_codepoint_range(r, &mut self.cps)?,
158            }
159        }
160        Ok(())
161    }
162}
163
164impl UcdLineParser<DerivedJoiningType> for UcdTableGen {
165    fn process_entry(&mut self, line: &DerivedJoiningType) -> Result<(), Error> {
166        if self.name == line.prop.property {
167            match line.prop.codepoints {
168                Codepoints::Single(ref cp) => common::insert_codepoint(cp.value(), &mut self.cps)?,
169                Codepoints::Range(ref r) => common::insert_codepoint_range(r, &mut self.cps)?,
170            }
171        }
172        Ok(())
173    }
174}
175
176/// Aggregator of elements that implement the [`UcdLineParser`] trait.
177pub struct UnicodeGen<T: ucd_parse::UcdFile> {
178    generators: Vec<Box<dyn UcdLineParser<T>>>,
179}
180
181impl<T: ucd_parse::UcdFile> UnicodeGen<T> {
182    /// Creates a new Generator for `UCD` files
183    pub fn new() -> Self {
184        Self {
185            generators: Vec::new(),
186        }
187    }
188
189    /// Add a new `UCD` line parser
190    pub fn add(&mut self, gen: Box<dyn UcdLineParser<T>>) {
191        self.generators.push(gen);
192    }
193}
194
195impl<T: ucd_parse::UcdFile> Default for UnicodeGen<T> {
196    fn default() -> Self {
197        Self::new()
198    }
199}
200
201impl<T: ucd_parse::UcdFile> UcdCodeGen for UnicodeGen<T> {
202    fn parse_unicode_file(&mut self, ucd_path: &Path) -> Result<(), Error> {
203        parse_unicode_file(ucd_path, |line: &T| {
204            let it = self.generators.iter_mut();
205            for gen in it {
206                gen.process_entry(line)?;
207            }
208            Ok(())
209        })
210    }
211}
212
213impl<T: ucd_parse::UcdFile> CodeGen for UnicodeGen<T> {
214    fn generate_code(&mut self, file: &mut File) -> Result<(), Error> {
215        let it = self.generators.iter_mut();
216        for gen in it {
217            gen.generate_code(file)?;
218        }
219        Ok(())
220    }
221}
222
223/// Generator that aggregates elements that are able to generate tables
224/// from the [`UnicodeData`](http://www.unicode.org/reports/tr44/#UnicodeData.txt) file
225pub struct GeneralCategoryGen {
226    generators: Vec<Box<dyn UcdLineParser<ucd_parsers::UnicodeData>>>,
227}
228
229impl GeneralCategoryGen {
230    /// Creates a new `GeneralCategoryGen` element.
231    pub fn new() -> Self {
232        Self {
233            generators: Vec::new(),
234        }
235    }
236
237    /// Add a new `UCD` line parser
238    pub fn add(&mut self, gen: Box<dyn UcdLineParser<ucd_parsers::UnicodeData>>) {
239        self.generators.push(gen);
240    }
241}
242
243impl Default for GeneralCategoryGen {
244    fn default() -> Self {
245        Self::new()
246    }
247}
248
249impl UcdCodeGen for GeneralCategoryGen {
250    fn parse_unicode_file(&mut self, ucd_path: &Path) -> Result<(), Error> {
251        let cps: Vec<ucd_parsers::UnicodeData> = ucd_parsers::UnicodeData::parse(ucd_path)?;
252        for udata in cps.iter() {
253            let it = self.generators.iter_mut();
254            for gen in it {
255                gen.process_entry(udata)?;
256            }
257        }
258        Ok(())
259    }
260}
261
262impl CodeGen for GeneralCategoryGen {
263    fn generate_code(&mut self, file: &mut File) -> Result<(), Error> {
264        let it = self.generators.iter_mut();
265        for gen in it {
266            gen.generate_code(file)?;
267        }
268        Ok(())
269    }
270}
271
/// Canonical combining class value that identifies `Virama` code points
/// in the `UnicodeData` entries.
const CANONICAL_COMBINING_CLASS_VIRAMA: u8 = 9;
273
/// Generator that builds the table of Unicode code points whose
/// canonical combining class is `Virama`.
pub struct ViramaTableGen {
    // Name handed to the file writer for the generated table.
    table_name: String,
    // Code points collected so far.
    cps: HashSet<u32>,
}
280
281impl ViramaTableGen {
282    /// Creates a new table generator for code points with the `Virama` combining class
283    pub fn new(table_name: &str) -> Self {
284        Self {
285            table_name: String::from(table_name),
286            cps: HashSet::new(),
287        }
288    }
289}
290
291impl CodeGen for ViramaTableGen {
292    fn generate_code(&mut self, file: &mut File) -> Result<(), Error> {
293        file_writer::generate_code_from_hashset(file, &self.table_name, &self.cps)
294    }
295}
296
297impl UcdLineParser<ucd_parsers::UnicodeData> for ViramaTableGen {
298    fn process_entry(&mut self, udata: &ucd_parsers::UnicodeData) -> Result<(), Error> {
299        match udata.codepoints {
300            Codepoints::Range(ref r) => {
301                if udata.canonical_combining_class == CANONICAL_COMBINING_CLASS_VIRAMA {
302                    common::insert_codepoint_range(r, &mut self.cps)?;
303                }
304            }
305            Codepoints::Single(ref cp) => {
306                if udata.canonical_combining_class == CANONICAL_COMBINING_CLASS_VIRAMA {
307                    common::insert_codepoint(cp.value(), &mut self.cps)?;
308                }
309            }
310        }
311        Ok(())
312    }
313}
314
315/// Generator that creates a table of Unicode code points
316/// and their decomposition mappings.
317pub struct WidthMappingTableGen {
318    name: String,
319    vec: Vec<(Codepoints, ucd_parse::Codepoint)>,
320}
321
322impl WidthMappingTableGen {
323    /// Creates a new width mapping table generator
324    pub fn new(name: &str) -> Self {
325        Self {
326            name: String::from(name),
327            vec: Vec::new(),
328        }
329    }
330}
331
332impl UcdLineParser<ucd_parsers::UnicodeData> for WidthMappingTableGen {
333    fn process_entry(&mut self, udata: &ucd_parsers::UnicodeData) -> Result<(), Error> {
334        if udata.decomposition.len == 0 {
335            return err!("No decomposition mappings");
336        }
337
338        if let Some(tag) = &udata.decomposition.tag {
339            if *tag == UnicodeDataDecompositionTag::Wide
340                || *tag == UnicodeDataDecompositionTag::Narrow
341            {
342                self.vec
343                    .push((udata.codepoints, udata.decomposition.mapping[0]));
344            }
345        }
346        Ok(())
347    }
348}
349
350impl CodeGen for WidthMappingTableGen {
351    fn generate_code(&mut self, file: &mut File) -> Result<(), Error> {
352        file_writer::generate_width_mapping_vector(file, &self.name, &self.vec)
353    }
354}
355
356/// Generator that creates a table of unassigned Unicode code points
357pub struct UnassignedTableGen {
358    name: String,
359    range: ucd_parse::CodepointRange,
360    vec: Vec<Codepoints>,
361}
362
363impl UnassignedTableGen {
364    /// Creates a new table generator for unassigned code points
365    pub fn new(table_name: &str) -> Self {
366        Self {
367            name: String::from(table_name),
368            range: ucd_parse::CodepointRange {
369                start: ucd_parse::Codepoint::from_u32(0).unwrap(),
370                end: ucd_parse::Codepoint::from_u32(0).unwrap(),
371            },
372            vec: Vec::new(),
373        }
374    }
375}
376
377impl UcdLineParser<ucd_parsers::UnicodeData> for UnassignedTableGen {
378    fn process_entry(&mut self, udata: &ucd_parsers::UnicodeData) -> Result<(), Error> {
379        match udata.codepoints {
380            Codepoints::Range(ref r) => {
381                if r.start.value() - self.range.end.value() > 0 {
382                    self.range.end = ucd_parse::Codepoint::from_u32(r.start.value() - 1)?;
383                    common::add_codepoints(&self.range, &mut self.vec);
384                }
385                self.range.start = ucd_parse::Codepoint::from_u32(r.end.value() + 1)?;
386                self.range.end = r.start;
387            }
388            Codepoints::Single(ref cp) => {
389                let next_cp = ucd_parse::Codepoint::from_u32(cp.value() + 1)?;
390                if cp.value() - self.range.end.value() != 0 {
391                    self.range.end = ucd_parse::Codepoint::from_u32(cp.value() - 1)?;
392                    common::add_codepoints(&self.range, &mut self.vec);
393                }
394
395                self.range.start = next_cp;
396                self.range.end = next_cp;
397            }
398        }
399        Ok(())
400    }
401}
402
403impl CodeGen for UnassignedTableGen {
404    fn generate_code(&mut self, file: &mut File) -> Result<(), Error> {
405        file_writer::generate_code_from_vec(file, &self.name, &self.vec)
406    }
407}