hyphenation_commons/dictionary/
mod.rs

//! Data structures for the storage of hyphenation patterns and exceptions.
2
3pub mod extended;
4mod trie;
5
6use std::collections::HashMap;
7use std::hash::Hash;
8
9use crate::dictionary::trie::PrefixMatches;
10pub use crate::dictionary::trie::{Error, Trie};
11use crate::language::Language;
12use crate::parse::Parse;
13
14
15#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
16pub struct Locus {
17    pub index : u8,
18    pub value : u8,
19}
20
21/// A trie mapping hyphenation patterns to their tallies.
22#[derive(Clone, Debug, Default, Serialize, Deserialize)]
23pub struct Patterns {
24    tallies :   Vec<Vec<Locus>>,
25    automaton : Trie,
26}
27
28impl Patterns {
29    pub fn from_iter<I>(iter : I) -> Result<Self, trie::Error>
30        where I : IntoIterator<Item = (String, <Patterns as Parse>::Tally)>
31    {
32        let (kvs, tallies) = uniques(iter.into_iter());
33        let automaton = Trie::from_iter(kvs.into_iter())?;
34        Ok(Patterns { tallies, automaton })
35    }
36}
37
38/// A specialized hashmap associating words to their known hyphenation.
39#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
40pub struct Exceptions(pub HashMap<String, Vec<usize>>);
41
42/// A dictionary for standard Knuth–Liang hyphenation.
43///
44/// It comprises the working language, the pattern and exception sets,
45/// and the character boundaries for hyphenation.
46#[derive(Clone, Debug, Serialize, Deserialize)]
47pub struct Standard {
48    language :       Language,
49    patterns :       Patterns,
50    pub exceptions : Exceptions,
51    /// The minimum number of `char`s from the start and end of a word where
52    /// breaks may not occur.
53    pub minima :     (usize, usize),
54}
55
56
57impl Standard {
58    /// The language for which this dictionary can provide hyphenation.
59    pub fn language(&self) -> Language { self.language }
60
61    /// An iterator over the tallies associated to all prefixes of the query,
62    /// including the query itself.
63    pub fn prefix_tallies<'f, 'q>(&'f self, query : &'q [u8]) -> PrefixTallies<'f, 'q, Vec<Locus>> {
64        PrefixTallies { matches : self.patterns.automaton.get_prefixes(query),
65                        tallies : &self.patterns.tallies, }
66    }
67}
68
69pub struct PrefixTallies<'f, 'q, T> {
70    tallies : &'f [T],
71    matches : PrefixMatches<'f, 'q>,
72}
73
74impl<'f, 'q, T> Iterator for PrefixTallies<'f, 'q, T> {
75    type Item = &'f T;
76
77    fn next(&mut self) -> Option<Self::Item> {
78        self.matches
79            .next()
80            .and_then(|i| self.tallies.get(i as usize))
81    }
82}
83
84
85/// An intermediate dictionary builder, primarily to retain field privacy in the
86/// dictionary.
87#[derive(Debug)]
88pub struct Builder {
89    pub language :   Language,
90    pub patterns :   Patterns,
91    pub exceptions : Exceptions,
92}
93
94impl From<Builder> for Standard {
95    fn from(b : Builder) -> Standard {
96        Standard { language :   b.language,
97                   patterns :   b.patterns,
98                   exceptions : b.exceptions,
99                   minima :     b.language.minima(), }
100    }
101}
102
103
/// Deduplicates tallies and pairs every pattern with the index of its tally.
///
/// Each distinct tally is assigned an index in order of first appearance;
/// that index is its position in the returned tally table.
///
/// # Returns
///
/// A tuple `(pairs, tallies)` where:
///
/// - `pairs` holds `(pattern, tally index)` entries, sorted by pattern and
///   with at most one entry per pattern. When a pattern occurs more than
///   once in `iter`, the entry from its first occurrence is kept.
/// - `tallies` holds the distinct tallies in order of first appearance.
///
/// A tally that was only ever attached to a discarded duplicate pattern
/// stays in `tallies`, even though no pair refers to it.
pub fn uniques<I, T>(iter: I) -> (Vec<(String, u64)>, Vec<T>)
where
    T: Eq + Clone + Hash,
    I: Iterator<Item = (String, T)>,
{
    use std::collections::hash_map::Entry;

    let mut pairs = Vec::new();
    let mut tally_ids: HashMap<T, u64> = HashMap::new();
    let mut tallies: Vec<T> = Vec::with_capacity(256);
    for (pattern, tally) in iter {
        // A single `entry` lookup both finds a known tally and reserves the
        // slot for a new one, so each tally is hashed only once.
        let id = match tally_ids.entry(tally) {
            Entry::Occupied(known) => *known.get(),
            Entry::Vacant(slot) => {
                let id = tallies.len() as u64;
                tallies.push(slot.key().clone());
                *slot.insert(id)
            }
        };
        pairs.push((pattern, id));
    }
    // The sort must be stable: equal patterns have to stay in input order so
    // that the deduplication below keeps the first occurrence.
    pairs.sort_by(|a, b| a.0.cmp(&b.0));
    // `dedup_by` passes the later element first and removes it on a match.
    pairs.dedup_by(|later, earlier| later.0 == earlier.0);
    (pairs, tallies)
}