1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
//! Data structures for the storage of hyphenation patterns and exceptions.

pub mod extended;
mod trie;

use std::collections::HashMap;
use std::hash::Hash;

use dictionary::trie::PrefixMatches;
pub use dictionary::trie::{Error, Trie};
use language::Language;
use parse::Parse;


/// A single entry of a tally: an `index` paired with a `value`, both `u8`.
///
/// `Patterns` stores every tally as a `Vec<Locus>`.
// NOTE(review): presumably `index` is a position within the pattern and
// `value` the hyphenation score assigned at that position — confirm against
// the pattern parser.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct Locus {
    pub index : u8,
    pub value : u8,
}

/// A trie mapping hyphenation patterns to their tallies.
///
/// When built through `Patterns::from_iter`, identical tallies are stored
/// only once in `tallies`, and `automaton` associates each pattern with the
/// identifier (position in `tallies`) of its tally.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct Patterns {
    /// Tally storage; the values yielded by `automaton` are used as indices
    /// into this vector.
    tallies :   Vec<Vec<Locus>>,
    /// The trie constructed from `(pattern, tally identifier)` pairs.
    automaton : Trie,
}

impl Patterns {
    /// Build a pattern set from `(pattern, tally)` pairs.
    ///
    /// The pairs are first run through `uniques`, so that each distinct tally
    /// is stored once and every pattern refers to its tally by identifier.
    /// The trie is then built from the resulting `(pattern, identifier)`
    /// pairs.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `Trie::from_iter`.
    pub fn from_iter<I>(iter : I) -> Result<Self, trie::Error>
        where I : IntoIterator<Item = (String, <Patterns as Parse>::Tally)>
    {
        let (keyed_ids, tallies) = uniques(iter.into_iter());
        Ok(Patterns { automaton : Trie::from_iter(keyed_ids.into_iter())?,
                      tallies })
    }
}

/// A specialized hashmap associating words to their known hyphenation.
///
/// The wrapped map is public and keyed by the word itself.
// NOTE(review): the `Vec<usize>` presumably lists the positions of the
// permitted breaks within the word — confirm the unit (bytes or `char`s)
// against the exception parser.
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct Exceptions(pub HashMap<String, Vec<usize>>);

/// A dictionary for standard Knuth–Liang hyphenation.
///
/// It comprises the working language, the pattern and exception sets,
/// and the character boundaries for hyphenation.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct Standard {
    /// The working language; exposed read-only through `Standard::language`.
    language :       Language,
    /// The pattern set; queried through `Standard::prefix_tallies`.
    patterns :       Patterns,
    /// The exception set, consulted by word.
    pub exceptions : Exceptions,
    /// The minimum number of `char`s from the start and end of a word where
    /// breaks may not occur.
    ///
    /// Initialized from `Language::minima` when the dictionary is converted
    /// from a `Builder`.
    pub minima :     (usize, usize),
}


impl Standard {
    /// The language this dictionary was built for.
    pub fn language(&self) -> Language { self.language }

    /// Iterate over the tallies of every stored pattern that is a prefix of
    /// `query`, the whole of `query` included.
    ///
    /// The returned iterator borrows both the dictionary and the query.
    pub fn prefix_tallies<'f, 'q>(&'f self, query : &'q [u8]) -> PrefixTallies<'f, 'q, Vec<Locus>> {
        let patterns = &self.patterns;
        let matches = patterns.automaton.get_prefixes(query);
        PrefixTallies { tallies : &patterns.tallies, matches }
    }
}

/// An iterator over the tallies associated to the prefixes of a query.
///
/// Created by `Standard::prefix_tallies`. `'f` is the lifetime of the borrowed
/// dictionary data and `'q` that of the query.
pub struct PrefixTallies<'f, 'q, T> {
    /// Tally storage; each value yielded by `matches` is used as an index
    /// into this slice.
    tallies : &'f [T],
    /// The prefix matches produced by the trie for the query.
    matches : PrefixMatches<'f, 'q>,
}

impl<'f, 'q, T> Iterator for PrefixTallies<'f, 'q, T> {
    type Item = &'f T;

    /// Take the next prefix match and resolve it to its tally.
    ///
    /// Yields `None` once the matches are exhausted, and also when a match
    /// refers to a position outside the tally storage.
    fn next(&mut self) -> Option<Self::Item> {
        let id = self.matches.next()?;
        self.tallies.get(id as usize)
    }
}


/// An intermediate dictionary builder, primarily to retain field privacy in the
/// dictionary.
///
/// All fields are public; a finished builder is turned into a `Standard`
/// through `From<Builder>`.
#[derive(Debug)]
pub struct Builder {
    /// The language of the dictionary being built; also the source of the
    /// `minima` of the resulting `Standard`.
    pub language :   Language,
    /// The pattern set to install in the dictionary.
    pub patterns :   Patterns,
    /// The exception set to install in the dictionary.
    pub exceptions : Exceptions,
}

impl From<Builder> for Standard {
    fn from(b : Builder) -> Standard {
        Standard { language :   b.language,
                   patterns :   b.patterns,
                   exceptions : b.exceptions,
                   minima :     b.language.minima(), }
    }
}


/// Deduplicate tallies and patterns ahead of trie construction.
///
/// Every distinct tally is stored once in the returned `Vec<T>`, in order of
/// first appearance, and each pattern is paired with the position of its
/// tally in that vector.
///
/// The `(pattern, identifier)` pairs are returned sorted by pattern. When a
/// pattern occurs more than once, only its first occurrence is retained; a
/// tally introduced solely by a discarded occurrence still remains in the
/// tally vector.
pub fn uniques<I, T>(iter : I) -> (Vec<(String, u64)>, Vec<T>)
    where T : Eq + Clone + Hash,
          I : Iterator<Item = (String, T)>
{
    let mut tallies : Vec<T> = Vec::with_capacity(256);
    let mut known : HashMap<T, u64> = HashMap::new();
    let mut pairs : Vec<(String, u64)> = Vec::new();

    for (pattern, tally) in iter {
        let id = match known.get(&tally).cloned() {
            Some(seen) => seen,
            None => {
                // First sighting: the tally's identifier is its position in
                // the tally vector.
                let fresh = tallies.len() as u64;
                tallies.push(tally.clone());
                known.insert(tally, fresh);
                fresh
            }
        };
        pairs.push((pattern, id));
    }

    // The sort is stable, so among equal patterns the earliest input comes
    // first and is the one that survives deduplication.
    pairs.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
    pairs.dedup_by(|later, earlier| later.0 == earlier.0);
    (pairs, tallies)
}