1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
use std::collections::BTreeMap;
use std::fs::File;
use std::io::Read;
use std::marker::PhantomData;
use std::path::Path;

use base64;
use flate2::read::GzDecoder;

use dawg::completer::Completer;
use dawg::dictionary::Dictionary;
use dawg::guide::Guide;
use dawg::value::DawgValue;

const PAYLOAD_SEPARATOR: &str = "\x01";

#[derive(Debug, Clone)]
pub struct Dawg {
    dict: Dictionary,
}

impl Dawg {
    pub fn from_file<P>(p: P) -> Self
    where
        P: AsRef<Path>,
    {
        Self::from_reader(&mut GzDecoder::new(File::open(p).unwrap()))
    }

    pub fn from_reader<T>(fp: &mut T) -> Self
    where
        T: Read,
    {
        Dawg {
            dict: Dictionary::from_reader(fp),
        }
    }

    /// Returns a list with keys of this DAWG that are prefixes of the `key`.
    pub fn prefixes<'k>(&self, key: &'k str) -> Vec<&'k str> {
        let mut result = Vec::new();
        let mut index = self.dict.root;
        for (i, &ch) in key.as_bytes().iter().enumerate() {
            index = match self.dict.follow_char(ch, index) {
                Some(v) => v,
                None => break,
            };
            if self.dict.has_value(index) {
                result.push(&key[..i + 1])
            };
        }
        result
    }

    pub fn sorted_prefixes<'k>(&self, key: &'k str) -> Vec<&'k str> {
        let mut result = self.prefixes(key);
        result.sort_by_key(|v| -(v.len() as isize));
        result
    }
}

#[derive(Debug, Clone)]
pub struct CompletionDawg<V>
where
    V: DawgValue,
{
    dawg: Dawg,
    guide: Guide,
    _phantom: PhantomData<V>,
}

impl<V> CompletionDawg<V>
where
    V: DawgValue,
{
    pub fn from_file<P>(p: P) -> Self
    where
        P: AsRef<Path>,
    {
        Self::from_reader(&mut GzDecoder::new(File::open(p).unwrap()))
    }

    pub fn from_reader<T>(fp: &mut T) -> Self
    where
        T: Read,
    {
        CompletionDawg {
            dawg: Dawg::from_reader(fp),
            guide: Guide::from_reader(fp),
            _phantom: PhantomData,
        }
    }

    /// Returns a list of (key, value) tuples for all variants of `key`
    /// in this DAWG according to `replaces`.
    ///
    /// `replaces` is an object obtained from
    /// `DAWG.compile_replaces(mapping)` where mapping is a dict
    /// that maps single-char unicode sitrings to another single-char
    /// unicode strings.
    pub fn similar_items(
        &self,
        key: &str,
        replaces: &BTreeMap<String, String>,
    ) -> Vec<(String, Vec<V>)> {
        let mut result: Vec<(String, Vec<V>)> = Vec::new();
        self.similar_items_(&mut result, "", key, self.dawg.dict.root, replaces);
        result
    }

    fn similar_items_(
        &self,
        result: &mut Vec<(String, Vec<V>)>,
        current_prefix: &str,
        key: &str,
        mut index: u32,
        replace_chars: &BTreeMap<String, String>,
    ) {
        trace!(r#"DAWG::similar_items_() index: {}"#, index);

        let start_pos = current_prefix.len();
        let subkey = &key[start_pos..];

        let mut word_pos = start_pos;

        for b_step in subkey.split("").filter(|v| !v.is_empty()) {
            trace!(r#" b_step: {:?}"#, b_step);

            if let Some(replace_char) = replace_chars.get(b_step) {
                trace!(
                    r#" b_step in replace_chars ({:?} => {:?})"#,
                    b_step,
                    replace_char
                );

                if let Some(next_index) = self.dawg.dict.follow_bytes(replace_char, index) {
                    trace!(r#" next_index: {}"#, next_index);
                    let prefix = format!(
                        "{}{}{}",
                        current_prefix,
                        &key[start_pos..word_pos],
                        replace_char
                    );
                    self.similar_items_(result, &prefix, key, next_index, replace_chars);
                };
            }
            index = match self.dawg.dict.follow_bytes(b_step, index) {
                Some(v) => v,
                None => return,
            };
            trace!(r#" index: {}"#, index);
            word_pos += b_step.len()
        }
        if let Some(index) = self.dawg.dict.follow_bytes(PAYLOAD_SEPARATOR, index) {
            trace!(r#" index: {}"#, index);
            let found_key = format!("{}{}", current_prefix, subkey);
            trace!(r#" found_key: {}"#, found_key);
            let value = self.value_for_index_(index);
            result.insert(0, (found_key, value));
        }
    }

    fn value_for_index_(&self, index: u32) -> Vec<V> {
        trace!(r#"DAWG::value_for_index_(index: {}) "#, index);
        let mut result: Vec<V> = Vec::new();
        let mut completer = Completer::new(&self.dawg.dict, &self.guide, index, "");
        while let Some(key) = completer.next_key() {
            trace!(r#"DAWG::value_for_index_(...); key: "{:?}" "#, key);
            let value = V::new_in_place(move |buf| {
                let decoded = base64::decode_config_slice(&key, base64::STANDARD, buf).unwrap();
                trace!(r#"DAWG::value_for_index_(...); bytes: {:?} "#, buf);
                assert_eq!(decoded, buf.len());
            });
            result.push(value);
        }
        result
    }

    pub fn prefixes<'k>(&self, key: &'k str) -> Vec<&'k str> {
        self.dawg.prefixes(key)
    }

    pub fn find(&self, key: &str) -> Option<u32> {
        self.dawg.dict.find(key)
    }
}