1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
//! [cppjieba](https://github.com/yanyiwu/cppjieba) Rust binding
//!
//! ## Installation
//!
//! Add it to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! rust-jieba = "0.1"
//! ```
//!
//! ## Example
//!
//! ```rust
//! extern crate rust_jieba;
//!
//! use rust_jieba::Jieba;
//!
//! fn main() {
//!     let jieba = Jieba::from_dir("cjieba-sys/cppjieba-cabi/cppjieba/dict");
//!     let words = jieba.cut("南京市长江大桥", true);
//!     assert_eq!(vec!["南京市", "长江大桥"], words);
//! }
//! ```
//!
extern crate cjieba_sys;

use std::slice;
use std::path::Path;
use std::ffi::{CString, CStr};

use cjieba_sys::*;

/// Handle to a native jieba instance.
///
/// Wraps the raw `jieba_t` pointer returned by `jieba_new`; the pointer is
/// released with `jieba_free` when the value is dropped.
///
/// `Clone` is intentionally not derived: a derived clone would only copy the
/// raw pointer, and dropping both copies would free the same native handle
/// twice.
#[derive(Debug)]
pub struct Jieba {
    inner: *mut jieba_t,
}

/// Element type of the vector returned by `Jieba::tag`: a word together with
/// the flag the tagger attached to it.
#[derive(Debug, Clone, PartialEq)]
pub struct Tag {
    /// The word text (the part before the `/` separator in the tagger output)
    pub word: String,
    /// The flag of the word (the part after the `/` separator)
    pub flag: String,
}

/// A keyword together with its weight, as returned by
/// `Jieba::extract_with_weight`.
#[derive(Debug, Clone, PartialEq)]
pub struct WordWeight {
    /// The keyword text
    pub word: String,
    /// The weight reported for the keyword by the native extractor
    pub weight: f64,
}

/// Mode selector for `Jieba::tokenize`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizeMode {
    /// Default mode (passed to the native API as `JIEBA_TOKENIZE_MODE_DEFAULT`)
    Default,
    /// Search mode (passed to the native API as `JIEBA_TOKENIZE_MODE_SEARCH`)
    Search,
}

/// A token produced by `Jieba::tokenize`.
///
/// The tuple fields are, in order: the word, its Unicode start position and
/// its Unicode end position.
#[derive(Debug, Clone, PartialEq)]
pub struct Token(pub String, pub usize, pub usize);

impl Token {
    /// Returns the word of the token
    pub fn word(&self) -> &str {
        self.0.as_str()
    }

    /// Returns the Unicode start position of the token
    pub fn start(&self) -> usize {
        let &Token(_, begin, _) = self;
        begin
    }

    /// Returns the Unicode end position of the token
    pub fn end(&self) -> usize {
        let &Token(_, _, finish) = self;
        finish
    }
}

impl Jieba {
    /// Create a new instance
    ///
    /// The five paths are converted to C strings and handed to `jieba_new`
    /// in the order they are declared.
    ///
    /// ## Panics
    ///
    /// Panics if any of the paths contains an interior NUL byte.
    pub fn new(dict_path: &str, hmm_path: &str, user_dict_path: &str, idf_path: &str, stop_words_path: &str)
        -> Self
    {
        let to_c = |path: &str| CString::new(path).unwrap();
        // Keep every CString bound to a local so its buffer outlives the FFI call.
        let dict = to_c(dict_path);
        let hmm = to_c(hmm_path);
        let user_dict = to_c(user_dict_path);
        let idf = to_c(idf_path);
        let stop_words = to_c(stop_words_path);
        let handle = unsafe {
            jieba_new(
                dict.as_ptr(),
                hmm.as_ptr(),
                user_dict.as_ptr(),
                idf.as_ptr(),
                stop_words.as_ptr()
            )
        };
        Self { inner: handle }
    }

    /// Create a new instance from a dictionary data directory
    ///
    /// The files `jieba.dict.utf8`, `hmm_model.utf8`, `user.dict.utf8`,
    /// `idf.utf8` and `stop_words.utf8` are looked up inside `data_dir` and
    /// forwarded to `Jieba::new`.
    pub fn from_dir(data_dir: &str) -> Self {
        let base = Path::new(data_dir);
        // Resolve a file name inside the data directory to an owned path string.
        let locate = |name: &str| -> String {
            base.join(name).to_str().unwrap().to_string()
        };
        let dict = locate("jieba.dict.utf8");
        let hmm = locate("hmm_model.utf8");
        let user_dict = locate("user.dict.utf8");
        let idf = locate("idf.utf8");
        let stop_words = locate("stop_words.utf8");
        Self::new(&dict, &hmm, &user_dict, &idf, &stop_words)
    }

    /// Cut the input text
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// `hmm`: enable HMM or not
    ///
    /// ## Panics
    ///
    /// Panics if `text` contains an interior NUL byte.
    pub fn cut(&self, text: &str, hmm: bool) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        let is_hmm = if hmm { 1 } else { 0 };
        unsafe {
            let ret = jieba_cut(self.inner, c_text.as_ptr(), is_hmm);
            if ret.is_null() {
                return Vec::new();
            }
            // `slice::from_raw_parts` requires a non-null pointer even for an
            // empty slice, so an empty result is handled without building one.
            let words: Vec<String> = if (*ret).words.is_null() || (*ret).length == 0 {
                Vec::new()
            } else {
                slice::from_raw_parts((*ret).words, (*ret).length)
                    .iter()
                    .map(|&s| CStr::from_ptr(s).to_string_lossy().into_owned())
                    .collect()
            };
            // The strings were copied above, so the native buffer can be released.
            jieba_words_free(ret);
            words
        }
    }

    /// Cut the input text, return all possible words
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// ## Panics
    ///
    /// Panics if `text` contains an interior NUL byte.
    pub fn cut_all(&self, text: &str) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_cut_all(self.inner, c_text.as_ptr());
            if ret.is_null() {
                return Vec::new();
            }
            // `slice::from_raw_parts` requires a non-null pointer even for an
            // empty slice, so an empty result is handled without building one.
            let words: Vec<String> = if (*ret).words.is_null() || (*ret).length == 0 {
                Vec::new()
            } else {
                slice::from_raw_parts((*ret).words, (*ret).length)
                    .iter()
                    .map(|&s| CStr::from_ptr(s).to_string_lossy().into_owned())
                    .collect()
            };
            // The strings were copied above, so the native buffer can be released.
            jieba_words_free(ret);
            words
        }
    }

    /// Cut the input text in search mode
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// `hmm`: enable HMM or not
    ///
    /// ## Panics
    ///
    /// Panics if `text` contains an interior NUL byte.
    pub fn cut_for_search(&self, text: &str, hmm: bool) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        let is_hmm = if hmm { 1 } else { 0 };
        unsafe {
            let ret = jieba_cut_for_search(self.inner, c_text.as_ptr(), is_hmm);
            if ret.is_null() {
                return Vec::new();
            }
            // `slice::from_raw_parts` requires a non-null pointer even for an
            // empty slice, so an empty result is handled without building one.
            let words: Vec<String> = if (*ret).words.is_null() || (*ret).length == 0 {
                Vec::new()
            } else {
                slice::from_raw_parts((*ret).words, (*ret).length)
                    .iter()
                    .map(|&s| CStr::from_ptr(s).to_string_lossy().into_owned())
                    .collect()
            };
            // The strings were copied above, so the native buffer can be released.
            jieba_words_free(ret);
            words
        }
    }

    /// Cut the input text using HMM
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// ## Panics
    ///
    /// Panics if `text` contains an interior NUL byte.
    pub fn cut_hmm(&self, text: &str) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_cut_hmm(self.inner, c_text.as_ptr());
            if ret.is_null() {
                return Vec::new();
            }
            // `slice::from_raw_parts` requires a non-null pointer even for an
            // empty slice, so an empty result is handled without building one.
            let words: Vec<String> = if (*ret).words.is_null() || (*ret).length == 0 {
                Vec::new()
            } else {
                slice::from_raw_parts((*ret).words, (*ret).length)
                    .iter()
                    .map(|&s| CStr::from_ptr(s).to_string_lossy().into_owned())
                    .collect()
            };
            // The strings were copied above, so the native buffer can be released.
            jieba_words_free(ret);
            words
        }
    }

    /// Cut the input text but limit max word length
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// `max_word_len`: max word length
    ///
    /// ## Panics
    ///
    /// Panics if `text` contains an interior NUL byte.
    pub fn cut_small(&self, text: &str, max_word_len: usize) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_cut_small(self.inner, c_text.as_ptr(), max_word_len);
            if ret.is_null() {
                return Vec::new();
            }
            // `slice::from_raw_parts` requires a non-null pointer even for an
            // empty slice, so an empty result is handled without building one.
            let words: Vec<String> = if (*ret).words.is_null() || (*ret).length == 0 {
                Vec::new()
            } else {
                slice::from_raw_parts((*ret).words, (*ret).length)
                    .iter()
                    .map(|&s| CStr::from_ptr(s).to_string_lossy().into_owned())
                    .collect()
            };
            // The strings were copied above, so the native buffer can be released.
            jieba_words_free(ret);
            words
        }
    }

    /// Tag the input text
    ///
    /// Each entry returned by the native tagger is split into a word and a
    /// flag at its last `/`, so a word that itself contains `/` stays intact.
    /// An entry without any `/` becomes a `Tag` with an empty flag.
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// ## Panics
    ///
    /// Panics if `text` contains an interior NUL byte.
    pub fn tag(&self, text: &str) -> Vec<Tag> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_tag(self.inner, c_text.as_ptr());
            if ret.is_null() {
                return Vec::new();
            }
            // `slice::from_raw_parts` requires a non-null pointer even for an
            // empty slice, so an empty result is handled without building one.
            let tags: Vec<Tag> = if (*ret).words.is_null() || (*ret).length == 0 {
                Vec::new()
            } else {
                slice::from_raw_parts((*ret).words, (*ret).length)
                    .iter()
                    .map(|&s| {
                        let entry = CStr::from_ptr(s).to_string_lossy();
                        // Split from the right: the first piece is whatever
                        // follows the last '/', the second is everything before it.
                        let mut pieces = entry.rsplitn(2, '/');
                        let tail = pieces.next().unwrap_or("");
                        match pieces.next() {
                            Some(head) => Tag {
                                word: head.to_string(),
                                flag: tail.to_string(),
                            },
                            // No separator at all: keep the text as the word.
                            None => Tag {
                                word: tail.to_string(),
                                flag: String::new(),
                            },
                        }
                    })
                    .collect()
            };
            // The strings were copied above, so the native buffer can be released.
            jieba_words_free(ret);
            tags
        }
    }

    /// Look up a single word's tag
    ///
    /// Returns an empty string if the native lookup yields a null pointer.
    ///
    /// ## Panics
    ///
    /// Panics if `word` contains an interior NUL byte.
    pub fn lookup_tag(&self, word: &str) -> String {
        let c_word = CString::new(word).unwrap();
        unsafe {
            let ret = jieba_lookup_tag(self.inner, c_word.as_ptr());
            // `CStr::from_ptr` must never be given a null pointer.
            if ret.is_null() {
                return String::new();
            }
            let tag = CStr::from_ptr(ret).to_string_lossy().into_owned();
            // The tag was copied into an owned String, so release the native one.
            jieba_str_free(ret);
            tag
        }
    }

    /// Register `word` as a user defined word through `jieba_add_user_word`
    ///
    /// ## Panics
    ///
    /// Panics if `word` contains an interior NUL byte.
    pub fn add_user_word(&mut self, word: &str) {
        let entry = CString::new(word).unwrap();
        let handle = self.inner;
        unsafe { jieba_add_user_word(handle, entry.as_ptr()); }
    }

    /// Tokenize the input text
    ///
    /// Each returned `Token` carries the word (sliced out of `text` by the
    /// byte offset and length reported by the native call) together with its
    /// Unicode start and end positions.
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// `mode`: tokenize mode
    ///
    /// `hmm`: enable HMM or not
    ///
    /// ## Panics
    ///
    /// Panics if `text` contains an interior NUL byte.
    pub fn tokenize(&self, text: &str, mode: TokenizeMode, hmm: bool) -> Vec<Token> {
        let c_text = CString::new(text).unwrap();
        let c_mode = match mode {
            TokenizeMode::Search => JIEBA_TOKENIZE_MODE_SEARCH,
            TokenizeMode::Default => JIEBA_TOKENIZE_MODE_DEFAULT,
        };
        let is_hmm = match hmm {
            true => 1,
            false => 0,
        };
        let mut tokens = Vec::new();
        unsafe {
            let head = jieba_tokenize(self.inner, c_text.as_ptr(), c_mode, is_hmm);
            // Walk the returned array until the entry whose length is not positive.
            let mut cursor = head;
            while !cursor.is_null() && (*cursor).length > 0 {
                let byte_start = (*cursor).offset as usize;
                let byte_end = byte_start + (*cursor).length as usize;
                let word = text[byte_start..byte_end].to_string();
                let char_start = (*cursor).unicode_offset as usize;
                let char_end = char_start + (*cursor).unicode_length as usize;
                tokens.push(Token(word, char_start, char_end));
                cursor = cursor.offset(1);
            }
            jieba_token_free(head);
        }
        tokens
    }

    /// Extract keywords
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// `top_k`: limit return keywords count; values above `i32::MAX` are
    /// clamped to `i32::MAX` before being passed to the native call
    ///
    /// ## Panics
    ///
    /// Panics if `text` contains an interior NUL byte.
    pub fn extract(&self, text: &str, top_k: usize) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        // A plain `top_k as i32` would wrap for large values; saturate instead.
        let limit = if top_k > i32::max_value() as usize {
            i32::max_value()
        } else {
            top_k as i32
        };
        unsafe {
            let ret = jieba_extract(self.inner, c_text.as_ptr(), limit);
            if ret.is_null() {
                return Vec::new();
            }
            // `slice::from_raw_parts` requires a non-null pointer even for an
            // empty slice, so an empty result is handled without building one.
            let words: Vec<String> = if (*ret).words.is_null() || (*ret).length == 0 {
                Vec::new()
            } else {
                slice::from_raw_parts((*ret).words, (*ret).length)
                    .iter()
                    .map(|&s| CStr::from_ptr(s).to_string_lossy().into_owned())
                    .collect()
            };
            // The strings were copied above, so the native buffer can be released.
            jieba_words_free(ret);
            words
        }
    }

    /// Extract keywords with weight
    ///
    /// ## Params
    ///
    /// `text`: input text
    ///
    /// `top_k`: limit return keywords count; values above `i32::MAX` are
    /// clamped to `i32::MAX` before being passed to the native call
    ///
    /// ## Panics
    ///
    /// Panics if `text` contains an interior NUL byte.
    pub fn extract_with_weight(&self, text: &str, top_k: usize) -> Vec<WordWeight> {
        let c_text = CString::new(text).unwrap();
        // A plain `top_k as i32` would wrap for large values; saturate instead.
        let limit = if top_k > i32::max_value() as usize {
            i32::max_value()
        } else {
            top_k as i32
        };
        let mut words = Vec::new();
        unsafe {
            let ret = jieba_extract_with_weight(self.inner, c_text.as_ptr(), limit);
            // Walk the returned array until the entry whose `word` pointer is null.
            let mut cursor = ret;
            while !cursor.is_null() && !(*cursor).word.is_null() {
                let word = CStr::from_ptr((*cursor).word).to_string_lossy().into_owned();
                words.push(WordWeight {
                    word: word,
                    weight: (*cursor).weight
                });
                cursor = cursor.offset(1);
            }
            jieba_word_weight_free(ret);
        }
        words
    }
}

impl Drop for Jieba {
    /// Releases the native handle with `jieba_free`, unless it is null.
    fn drop(&mut self) {
        let handle = self.inner;
        if handle.is_null() {
            return;
        }
        unsafe { jieba_free(handle) };
    }
}

// NOTE(review): these impls assert that the native handle may be moved to and
// shared between threads. Nothing in this file enforces that; it relies on the
// underlying C library being safe to call concurrently through a shared
// handle — confirm against the cppjieba C API before relying on it.
unsafe impl Send for Jieba {}
unsafe impl Sync for Jieba {}