rust_jieba/
lib.rs

//! [cppjieba](https://github.com/yanyiwu/cppjieba) Rust binding
//!
//! ## Installation
//!
//! Add it to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! rust-jieba = "0.1"
//! ```
//!
//! ## Example
//!
//! ```rust
//! extern crate rust_jieba;
//!
//! use rust_jieba::Jieba;
//!
//! fn main() {
//!     let jieba = Jieba::from_dir("cjieba-sys/cppjieba-cabi/cppjieba/dict");
//!     let words = jieba.cut("南京市长江大桥", true);
//!     assert_eq!(vec!["南京市", "长江大桥"], words);
//! }
//! ```
//!
extern crate cjieba_sys;

use std::slice;
use std::path::Path;
use std::ffi::{CString, CStr};

use cjieba_sys::*;

/// The Jieba segmenter, wrapping a cppjieba instance created through the C API
// `inner` is a raw pointer freed in `Drop`, so `Clone` is not derived:
// a cloned copy would free the same pointer twice.
#[derive(Debug)]
pub struct Jieba {
    inner: *mut jieba_t,
}

/// `Jieba::tag` API return type
#[derive(Debug, Clone, PartialEq)]
pub struct Tag {
    /// Word
    pub word: String,
    /// Flag
    pub flag: String,
}

/// Word with weight
#[derive(Debug, Clone, PartialEq)]
pub struct WordWeight {
    /// Word
    pub word: String,
    /// Weight
    pub weight: f64,
}

/// Tokenize mode
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizeMode {
    /// Default mode
    Default,
    /// Search mode
    Search,
}

/// Token
#[derive(Debug, Clone, PartialEq)]
pub struct Token(pub String, pub usize, pub usize);

impl Token {
    /// Word of the token
    pub fn word(&self) -> &str {
        &self.0
    }

    /// Unicode start position of the token
    pub fn start(&self) -> usize {
        self.1
    }

    /// Unicode end position of the token
    pub fn end(&self) -> usize {
        self.2
    }
}

impl Jieba {
    /// Create a new instance
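    ///
    /// ## Example
    ///
    /// A usage sketch; the file names below are the ones `from_dir` expects inside
    /// a cppjieba dict directory, and the directory path is the one used in the
    /// crate-level example:
    ///
    /// ```no_run
    /// use rust_jieba::Jieba;
    ///
    /// let jieba = Jieba::new(
    ///     "cjieba-sys/cppjieba-cabi/cppjieba/dict/jieba.dict.utf8",
    ///     "cjieba-sys/cppjieba-cabi/cppjieba/dict/hmm_model.utf8",
    ///     "cjieba-sys/cppjieba-cabi/cppjieba/dict/user.dict.utf8",
    ///     "cjieba-sys/cppjieba-cabi/cppjieba/dict/idf.utf8",
    ///     "cjieba-sys/cppjieba-cabi/cppjieba/dict/stop_words.utf8",
    /// );
    /// ```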
    pub fn new(dict_path: &str, hmm_path: &str, user_dict_path: &str, idf_path: &str, stop_words_path: &str)
        -> Self
    {
        let c_dict_path = CString::new(dict_path).unwrap();
        let c_hmm_path = CString::new(hmm_path).unwrap();
        let c_user_dict_path = CString::new(user_dict_path).unwrap();
        let c_idf_path = CString::new(idf_path).unwrap();
        let c_stop_words_path = CString::new(stop_words_path).unwrap();
        unsafe {
            Self {
                inner: jieba_new(
                    c_dict_path.as_ptr(),
                    c_hmm_path.as_ptr(),
                    c_user_dict_path.as_ptr(),
                    c_idf_path.as_ptr(),
                    c_stop_words_path.as_ptr()
                )
            }
        }
    }

    /// Create a new instance from the dict data directory
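    ///
    /// ## Example
    ///
    /// A usage sketch; it assumes the cppjieba dict directory used in the
    /// crate-level example is available:
    ///
    /// ```no_run
    /// use rust_jieba::Jieba;
    ///
    /// let jieba = Jieba::from_dir("cjieba-sys/cppjieba-cabi/cppjieba/dict");
    /// let words = jieba.cut("南京市长江大桥", true);
    /// assert_eq!(vec!["南京市", "长江大桥"], words);
    /// ```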
    pub fn from_dir(data_dir: &str) -> Self {
        let data_path = Path::new(data_dir);
        let dict_path = data_path.join("jieba.dict.utf8");
        let hmm_path = data_path.join("hmm_model.utf8");
        let user_dict_path = data_path.join("user.dict.utf8");
        let idf_path = data_path.join("idf.utf8");
        let stop_words_path = data_path.join("stop_words.utf8");
        Self::new(
            dict_path.to_str().unwrap(),
            hmm_path.to_str().unwrap(),
            user_dict_path.to_str().unwrap(),
            idf_path.to_str().unwrap(),
            stop_words_path.to_str().unwrap(),
        )
    }

    /// Cut the input text
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// `hmm`: enable HMM or not
    pub fn cut(&self, text: &str, hmm: bool) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        let is_hmm = if hmm { 1 } else { 0 };
        unsafe {
            let ret = jieba_cut(self.inner, c_text.as_ptr(), is_hmm);
            // Copy each returned C string into an owned String, then free the C array
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let words = c_words.into_iter().map(|s| {
                let word = CStr::from_ptr(*s);
                word.to_string_lossy().into_owned()
            }).collect();
            jieba_words_free(ret);
            words
        }
    }

    /// Cut the input text, return all possible words
    ///
    /// ## Params
    ///
    /// `text`: input text
    pub fn cut_all(&self, text: &str) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_cut_all(self.inner, c_text.as_ptr());
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let words = c_words.into_iter().map(|s| {
                let word = CStr::from_ptr(*s);
                word.to_string_lossy().into_owned()
            }).collect();
            jieba_words_free(ret);
            words
        }
    }

    /// Cut the input text in search mode
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// `hmm`: enable HMM or not
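    ///
    /// ## Example
    ///
    /// A usage sketch (dict path as in the crate-level example); search mode
    /// additionally yields the shorter dictionary words contained in long words:
    ///
    /// ```no_run
    /// use rust_jieba::Jieba;
    ///
    /// let jieba = Jieba::from_dir("cjieba-sys/cppjieba-cabi/cppjieba/dict");
    /// for word in jieba.cut_for_search("南京市长江大桥", true) {
    ///     println!("{}", word);
    /// }
    /// ```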
    pub fn cut_for_search(&self, text: &str, hmm: bool) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        let is_hmm = if hmm { 1 } else { 0 };
        unsafe {
            let ret = jieba_cut_for_search(self.inner, c_text.as_ptr(), is_hmm);
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let words = c_words.into_iter().map(|s| {
                let word = CStr::from_ptr(*s);
                word.to_string_lossy().into_owned()
            }).collect();
            jieba_words_free(ret);
            words
        }
    }

    /// Cut the input text using HMM
    ///
    /// ## Params
    ///
    /// `text`: input text
    pub fn cut_hmm(&self, text: &str) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_cut_hmm(self.inner, c_text.as_ptr());
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let words = c_words.into_iter().map(|s| {
                let word = CStr::from_ptr(*s);
                word.to_string_lossy().into_owned()
            }).collect();
            jieba_words_free(ret);
            words
        }
    }

    /// Cut the input text but limit max word length
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// `max_word_len`: max word length
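    ///
    /// ## Example
    ///
    /// A usage sketch (dict path as in the crate-level example); `max_word_len`
    /// caps how long a produced word may be:
    ///
    /// ```no_run
    /// use rust_jieba::Jieba;
    ///
    /// let jieba = Jieba::from_dir("cjieba-sys/cppjieba-cabi/cppjieba/dict");
    /// // Keep every produced word at no more than 2 characters.
    /// let words = jieba.cut_small("南京市长江大桥", 2);
    /// println!("{:?}", words);
    /// ```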
    pub fn cut_small(&self, text: &str, max_word_len: usize) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_cut_small(self.inner, c_text.as_ptr(), max_word_len);
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let words = c_words.into_iter().map(|s| {
                let word = CStr::from_ptr(*s);
                word.to_string_lossy().into_owned()
            }).collect();
            jieba_words_free(ret);
            words
        }
    }

    /// Tag the input text
    ///
    /// ## Params
    ///
    /// `text`: input text
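    ///
    /// ## Example
    ///
    /// A usage sketch (dict path as in the crate-level example); each `Tag` pairs a
    /// word with its part-of-speech flag:
    ///
    /// ```no_run
    /// use rust_jieba::Jieba;
    ///
    /// let jieba = Jieba::from_dir("cjieba-sys/cppjieba-cabi/cppjieba/dict");
    /// for tag in jieba.tag("南京市长江大桥") {
    ///     println!("{}/{}", tag.word, tag.flag);
    /// }
    /// ```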
    pub fn tag(&self, text: &str) -> Vec<Tag> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_tag(self.inner, c_text.as_ptr());
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let tags = c_words.into_iter().map(|s| {
                let word = CStr::from_ptr(*s).to_string_lossy();
                // Each entry comes back as "word/flag"; split on the first '/' only
                let mut parts = word.splitn(2, '/');
                Tag {
                    word: parts.next().unwrap().to_string(),
                    flag: parts.next().unwrap().to_string(),
                }
            }).collect();
            jieba_words_free(ret);
            tags
        }
    }

    /// Look up a single word's tag
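    ///
    /// ## Example
    ///
    /// A usage sketch (dict path as in the crate-level example):
    ///
    /// ```no_run
    /// use rust_jieba::Jieba;
    ///
    /// let jieba = Jieba::from_dir("cjieba-sys/cppjieba-cabi/cppjieba/dict");
    /// println!("{}", jieba.lookup_tag("南京"));
    /// ```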
    pub fn lookup_tag(&self, word: &str) -> String {
        let c_word = CString::new(word).unwrap();
        unsafe {
            let ret = jieba_lookup_tag(self.inner, c_word.as_ptr());
            let tag = CStr::from_ptr(ret).to_string_lossy().into_owned();
            jieba_str_free(ret);
            tag
        }
    }

    /// Add a user-defined word
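    ///
    /// ## Example
    ///
    /// A usage sketch (dict path as in the crate-level example); note that this
    /// method takes `&mut self`:
    ///
    /// ```no_run
    /// use rust_jieba::Jieba;
    ///
    /// let mut jieba = Jieba::from_dir("cjieba-sys/cppjieba-cabi/cppjieba/dict");
    /// // Register a new word so subsequent cuts can keep it whole.
    /// jieba.add_user_word("云计算");
    /// let words = jieba.cut("云计算时代", true);
    /// println!("{:?}", words);
    /// ```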
    pub fn add_user_word(&mut self, word: &str) {
        let c_word = CString::new(word).unwrap();
        unsafe {
            jieba_add_user_word(self.inner, c_word.as_ptr());
        }
    }

    /// Tokenize
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// `mode`: tokenize mode
    ///
    /// `hmm`: enable HMM or not
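    ///
    /// ## Example
    ///
    /// A usage sketch (dict path as in the crate-level example); the reported
    /// positions are unicode character offsets, not byte offsets:
    ///
    /// ```no_run
    /// use rust_jieba::{Jieba, TokenizeMode};
    ///
    /// let jieba = Jieba::from_dir("cjieba-sys/cppjieba-cabi/cppjieba/dict");
    /// for token in jieba.tokenize("南京市长江大桥", TokenizeMode::Default, true) {
    ///     println!("{} [{}, {})", token.word(), token.start(), token.end());
    /// }
    /// ```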
    pub fn tokenize(&self, text: &str, mode: TokenizeMode, hmm: bool) -> Vec<Token> {
        let c_text = CString::new(text).unwrap();
        let c_mode = match mode {
            TokenizeMode::Default => JIEBA_TOKENIZE_MODE_DEFAULT,
            TokenizeMode::Search => JIEBA_TOKENIZE_MODE_SEARCH,
        };
        let is_hmm = if hmm { 1 } else { 0 };
        let mut tokens = Vec::new();
        unsafe {
            let ret = jieba_tokenize(self.inner, c_text.as_ptr(), c_mode, is_hmm);
            let mut index = 0;
            let mut c_token = ret.offset(index);
            // Walk the returned array; a zero-length token marks the end
            while !c_token.is_null() && (*c_token).length > 0 {
                // `offset`/`length` are byte positions into `text`;
                // the `unicode_*` fields are character positions
                let start = (*c_token).offset as usize;
                let end = start + (*c_token).length as usize;
                let word = text[start..end].to_string();
                let unicode_start = (*c_token).unicode_offset as usize;
                let unicode_end = unicode_start + (*c_token).unicode_length as usize;
                tokens.push(Token(word, unicode_start, unicode_end));
                index += 1;
                c_token = ret.offset(index);
            }
            jieba_token_free(ret);
        }
        tokens
    }

    /// Extract keywords
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// `top_k`: maximum number of keywords to return
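    ///
    /// ## Example
    ///
    /// A usage sketch (dict path as in the crate-level example):
    ///
    /// ```no_run
    /// use rust_jieba::Jieba;
    ///
    /// let jieba = Jieba::from_dir("cjieba-sys/cppjieba-cabi/cppjieba/dict");
    /// // Keep only the 3 highest ranked keywords.
    /// let keywords = jieba.extract("南京市长江大桥", 3);
    /// println!("{:?}", keywords);
    /// ```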
    pub fn extract(&self, text: &str, top_k: usize) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_extract(self.inner, c_text.as_ptr(), top_k as i32);
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let words = c_words.into_iter().map(|s| {
                let word = CStr::from_ptr(*s);
                word.to_string_lossy().into_owned()
            }).collect();
            jieba_words_free(ret);
            words
        }
    }

    /// Extract keywords with weight
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// `top_k`: maximum number of keywords to return
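    ///
    /// ## Example
    ///
    /// A usage sketch (dict path as in the crate-level example); each entry carries
    /// the keyword and its weight:
    ///
    /// ```no_run
    /// use rust_jieba::Jieba;
    ///
    /// let jieba = Jieba::from_dir("cjieba-sys/cppjieba-cabi/cppjieba/dict");
    /// for ww in jieba.extract_with_weight("南京市长江大桥", 3) {
    ///     println!("{}: {}", ww.word, ww.weight);
    /// }
    /// ```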
    pub fn extract_with_weight(&self, text: &str, top_k: usize) -> Vec<WordWeight> {
        let c_text = CString::new(text).unwrap();
        let mut words = Vec::new();
        unsafe {
            let ret = jieba_extract_with_weight(self.inner, c_text.as_ptr(), top_k as i32);
            let mut index = 0;
            let mut c_word = ret.offset(index);
            // Walk the returned array; an entry with a null word pointer marks the end
            while !c_word.is_null() && !(*c_word).word.is_null() {
                let word = CStr::from_ptr((*c_word).word).to_string_lossy().into_owned();
                words.push(WordWeight {
                    word,
                    weight: (*c_word).weight
                });
                index += 1;
                c_word = ret.offset(index);
            }
            jieba_word_weight_free(ret);
        }
        words
    }
}

impl Drop for Jieba {
    fn drop(&mut self) {
        if !self.inner.is_null() {
            unsafe { jieba_free(self.inner) };
        }
    }
}

unsafe impl Send for Jieba {}
unsafe impl Sync for Jieba {}