simplecc/
lib.rs

1//! A simple Chinese Convert library (partially) compatible with
2//! [OpenCC](https://github.com/BYVoid/OpenCC/)'s 
3//! [dictionaries](https://github.com/BYVoid/OpenCC/tree/master/data/dictionary).
4//! 
5//! * Simple
6//! 
7//!   No complex configurations, all need is a text dictionary and input text.
8//! 
9//!   Built-in dictionaries included if `builtin_dicts` feature is on.
10//! 
11//! * Fast
12//! 
13//!   Use hashmap with tree structure for dictionary, faster than original
14//!   OpenCC.
15use std::{
16    collections::HashMap,
17    collections::hash_map::Entry,
18    io::{Read, BufRead, BufReader},
19    mem,
20};
21
22#[cfg(feature = "builtin_dicts")]
23pub mod dicts;
24
25#[cfg(test)]
26mod tests;
27
28/// Dictionary to convert text.
29/// 
30/// This library doesn't contain dictionary file. You may download them from
31/// [OpenCC's repo](https://github.com/BYVoid/OpenCC/tree/master/data/dictionary).
32/// 
33/// For chaining multiple dicts, just concat all to one file (in any order).
34/// 
35/// # Built-in dictionaries
36/// The library includes optional dictionaries when `builtin_dicts`
37/// feature is on. Disabled by default.
38/// 
39///
40/// # File Format
41/// The format is the same as text format (not the compiled binary one)
42/// in OpenCC project.
43/// 
44/// Specifically, one rule per line, two columns per rule splitted by a TAB
45/// (`\t`). First column is original word/phrase, the other is converted one,
46/// which may contains multiples word/phrase splitted by a space (` `), but
47/// only the first one is used, others will be ignored. 
48/// 
49#[derive(Debug, Clone)]
50pub struct Dict {
51    roots: Vec<DictNode>,
52}
53
54#[derive(Debug, Clone)]
55struct Leaf {
56    key: Box<str>,
57    value: Box<str>,
58}
59
60#[derive(Debug, Clone, Default)]
61struct Node {
62    value: Option<Box<str>>,
63    tails: HashMap<char, DictNode>,
64}
65
66#[derive(Debug, Clone)]
67enum DictNode {
68    Leaf (Leaf),
69    Node (Node),
70}
71
72impl DictNode {
73    fn node() -> Self {
74        DictNode::Node (
75            Node::default()
76        )
77    }
78
79    fn unwrap_node_mut(&mut self) -> &mut Node {
80        match self {
81            DictNode::Node (node) => node,
82            DictNode::Leaf (_) => panic!("expect Node, found Leaf"),
83        }
84
85    }
86
87    fn into_leaf(self) -> Leaf {
88        match self {
89            DictNode::Leaf (leaf) => leaf,
90            DictNode::Node (_) => panic!("expect Leaf, found Node"),
91        }
92    }
93
94    fn leaf(key: &str, value: Box<str>) -> Self {
95        DictNode::Leaf (
96            Leaf {
97                key: key.into(),
98                value,
99            }
100        )
101    }
102
103    fn add(&mut self, key: &str, value: Box<str>) {
104        let self_node = match self {
105            DictNode::Node (node) => node,
106            DictNode::Leaf (_) => {
107                let node = Node::default();
108                let leaf = mem::replace(self, DictNode::Node(node));
109                let Leaf { key, value } = leaf.into_leaf();
110                let mut node = self.unwrap_node_mut();
111                let mut key_chars = key.chars();
112                node.value = if let Some(hash_key) = key_chars.next() {
113                    let suffix = key_chars.as_str().into();
114                    node.tails.insert(hash_key, DictNode::leaf(suffix, value));
115                    None
116                } else {
117                    Some(value)
118                };
119                node
120            }
121        };
122
123        let mut key_chars = key.chars();
124        if let Some(hash_key) = key_chars.next() {
125            let suffix = key_chars.as_str().into();
126            match self_node.tails.entry(hash_key) {
127                Entry::Occupied(mut entry) => {
128                    entry.get_mut().add(suffix, value);
129                }
130                Entry::Vacant(entry) => {
131                    entry.insert(DictNode::leaf(suffix, value));
132                }
133            };
134        } else {
135            self_node.value = Some(value);
136        }
137    }
138
139    fn prefix_match<'a, 'b>(&'a self, query: &'b str)
140            -> Option<(&'b str, &'a str)> {
141        match self {
142            &DictNode::Leaf ( Leaf { ref key, ref value } ) => {
143                if query.starts_with(&**key) {
144                    Some((&query[..key.len()], &value))
145                } else {
146                    None
147                }
148            },
149            &DictNode::Node ( Node { ref value, ref tails } ) => {
150                let mut query_chars = query.chars();
151                let hash_key = query_chars.next();
152                let suffix = query_chars.as_str();
153
154                hash_key.and_then(|hash_key| {
155                    tails.get(&hash_key)
156                        .and_then(|node| node.prefix_match(suffix))
157                        .map(|(prefix, value)| {
158                            let n = query.len() - suffix.len() + prefix.len();
159                            (&query[..n], value)
160                        })
161                }).or_else(||
162                    value.as_ref().map(|v| ("", &**v))
163                )
164            }
165        }
166    }
167}
168
169impl Dict {
170    /// Load dict from string
171    pub fn load_str<T>(raw: T) -> Self
172    where T: AsRef<str> {
173        Dict::load_lines(raw.as_ref().lines())
174    }
175
176    /// Load dict from lines of string.
177    pub fn load_lines<T, S>(lines: T) -> Self
178    where T: Iterator<Item=S>,
179          S: AsRef<str> {
180        let mut root = DictNode::node();
181        lines.filter_map(|line| {
182            let mut cols = line.as_ref().splitn(2, '\t');
183            let key = cols.next()?;
184            let value = cols.next()?.splitn(2, ' ').next()?;
185            Some((key.into(), value.into()))
186        }).for_each(|(key, value): (String, Box<str>)| {
187            root.add(&key, value)
188        });
189        Dict { roots: vec![root] }
190    }
191
192    /// Load dict file.
193    /// Unrecognizable data will be silently ignored.
194    pub fn load<T>(reader: T) -> Self
195    where T: Read {
196        let lines = BufReader::new(reader).lines().filter_map(|l| l.ok());
197        Dict::load_lines(lines)
198    }
199
200    /// Return the new dict that chained together.
201    pub fn chain(self, other: Dict) -> Self {
202        let Dict { mut roots } = self;
203        roots.extend(other.roots);
204        Dict { roots }
205    }
206
207    fn replace(dict: &DictNode, mut text: &str) -> String {
208        let mut output = String::with_capacity(text.len());
209        while !text.is_empty() {
210            match dict.prefix_match(text) {
211                Some((prefix, value)) => {
212                    output.push_str(value);
213                    text = &text[prefix.len()..];
214                },
215                None => {
216                    let mut chars = text.chars();
217                    output.push(chars.next().unwrap());
218                    text = chars.as_str();
219                }
220            }
221        }
222        output
223    }
224
225    /// Use this dict to convert string.
226    /// Return converted text.
227    pub fn replace_all(&self, text: &str) -> String {
228        let mut buffer = Dict::replace(&self.roots[0], text);
229        for dict in &self.roots[1..] {
230            buffer = Dict::replace(dict, &buffer);
231        }
232        buffer
233    }
234}