extern crate cjieba_sys;

use std::slice;
use std::path::Path;
use std::ffi::{CString, CStr};

use cjieba_sys::*;

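/// Jieba segmenter backed by the cjieba C library.
///
/// A minimal usage sketch; the crate name and dictionary directory in this
/// example are assumptions, adjust them to your setup:
///
/// ```no_run
/// use cjieba::Jieba;
///
/// let jieba = Jieba::from_dir("cjieba/dict");
/// let words = jieba.cut("南京市长江大桥", true);
/// println!("{:?}", words);
/// ```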
// `Clone` is intentionally not derived: `inner` is freed in `Drop`, so a
// bitwise copy of the pointer would lead to a double free.
#[derive(Debug)]
pub struct Jieba {
    inner: *mut jieba_t,
}

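/// A segmented word together with its part-of-speech flag.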
#[derive(Debug, Clone, PartialEq)]
pub struct Tag {
    /// The segmented word
    pub word: String,
    /// The part-of-speech flag
    pub flag: String,
}

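/// A word together with the weight assigned to it by keyword extraction.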
#[derive(Debug, Clone, PartialEq)]
pub struct WordWeight {
    /// The extracted word
    pub word: String,
    /// The weight of the word
    pub weight: f64,
}

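/// Tokenization mode used by `Jieba::tokenize`.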
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizeMode {
    /// Default segmentation mode
    Default,
    /// Search-engine mode: long words are split further into shorter ones
    Search,
}

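/// A token produced by `Jieba::tokenize`: the word plus its start and end
/// offsets (in Unicode characters) within the input text.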
#[derive(Debug, Clone, PartialEq)]
pub struct Token(pub String, pub usize, pub usize);

impl Token {
    /// The token's word
    pub fn word(&self) -> &str {
        &self.0
    }

    /// Start offset of the token in the input text
    pub fn start(&self) -> usize {
        self.1
    }

    /// End offset of the token in the input text
    pub fn end(&self) -> usize {
        self.2
    }
}

impl Jieba {
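    /// Create a `Jieba` instance from explicit dictionary, HMM model, user
    /// dictionary, IDF and stop-word file paths.
    ///
    /// Panics if any of the paths contains an interior NUL byte.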
    pub fn new(dict_path: &str, hmm_path: &str, user_dict_path: &str, idf_path: &str, stop_words_path: &str)
        -> Self
    {
        let c_dict_path = CString::new(dict_path).unwrap();
        let c_hmm_path = CString::new(hmm_path).unwrap();
        let c_user_dict_path = CString::new(user_dict_path).unwrap();
        let c_idf_path = CString::new(idf_path).unwrap();
        let c_stop_words_path = CString::new(stop_words_path).unwrap();
        unsafe {
            Self {
                inner: jieba_new(
                    c_dict_path.as_ptr(),
                    c_hmm_path.as_ptr(),
                    c_user_dict_path.as_ptr(),
                    c_idf_path.as_ptr(),
                    c_stop_words_path.as_ptr(),
                ),
            }
        }
    }

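    /// Create a `Jieba` instance from a directory containing the standard
    /// cjieba dictionary files (`jieba.dict.utf8`, `hmm_model.utf8`,
    /// `user.dict.utf8`, `idf.utf8` and `stop_words.utf8`).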
    pub fn from_dir(data_dir: &str) -> Self {
        let data_path = Path::new(data_dir);
        let dict_path = data_path.join("jieba.dict.utf8");
        let hmm_path = data_path.join("hmm_model.utf8");
        let user_dict_path = data_path.join("user.dict.utf8");
        let idf_path = data_path.join("idf.utf8");
        let stop_words_path = data_path.join("stop_words.utf8");
        Self::new(
            dict_path.to_str().unwrap(),
            hmm_path.to_str().unwrap(),
            user_dict_path.to_str().unwrap(),
            idf_path.to_str().unwrap(),
            stop_words_path.to_str().unwrap(),
        )
    }

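    /// Cut the input text into words.
    ///
    /// When `hmm` is true, the HMM model is used to segment words that are
    /// not in the dictionary.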
    pub fn cut(&self, text: &str, hmm: bool) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        let is_hmm = if hmm { 1 } else { 0 };
        unsafe {
            let ret = jieba_cut(self.inner, c_text.as_ptr(), is_hmm);
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let words = c_words.iter().map(|s| {
                let word = CStr::from_ptr(*s);
                word.to_string_lossy().into_owned()
            }).collect();
            jieba_words_free(ret);
            words
        }
    }

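    /// Cut the input text using the full mode, returning every word the
    /// dictionary can form from the text.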
    pub fn cut_all(&self, text: &str) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_cut_all(self.inner, c_text.as_ptr());
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let words = c_words.iter().map(|s| {
                let word = CStr::from_ptr(*s);
                word.to_string_lossy().into_owned()
            }).collect();
            jieba_words_free(ret);
            words
        }
    }

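    /// Cut the input text in search-engine mode, which further splits long
    /// words so that more results match search queries.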
    pub fn cut_for_search(&self, text: &str, hmm: bool) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        let is_hmm = if hmm { 1 } else { 0 };
        unsafe {
            let ret = jieba_cut_for_search(self.inner, c_text.as_ptr(), is_hmm);
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let words = c_words.iter().map(|s| {
                let word = CStr::from_ptr(*s);
                word.to_string_lossy().into_owned()
            }).collect();
            jieba_words_free(ret);
            words
        }
    }

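    /// Cut the input text using only the HMM model.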
    pub fn cut_hmm(&self, text: &str) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_cut_hmm(self.inner, c_text.as_ptr());
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let words = c_words.iter().map(|s| {
                let word = CStr::from_ptr(*s);
                word.to_string_lossy().into_owned()
            }).collect();
            jieba_words_free(ret);
            words
        }
    }

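    /// Cut the input text, limiting the maximum word length to `max_word_len`.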
    pub fn cut_small(&self, text: &str, max_word_len: usize) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_cut_small(self.inner, c_text.as_ptr(), max_word_len);
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let words = c_words.iter().map(|s| {
                let word = CStr::from_ptr(*s);
                word.to_string_lossy().into_owned()
            }).collect();
            jieba_words_free(ret);
            words
        }
    }

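    /// Cut the input text and return each word together with its
    /// part-of-speech flag.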
    pub fn tag(&self, text: &str) -> Vec<Tag> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_tag(self.inner, c_text.as_ptr());
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let tags = c_words.iter().map(|s| {
                // Each entry comes back from cjieba as "word/flag".
                let word = CStr::from_ptr(*s).to_string_lossy();
                let mut parts = word.splitn(2, '/');
                Tag {
                    word: parts.next().unwrap().to_string(),
                    flag: parts.next().unwrap().to_string(),
                }
            }).collect();
            jieba_words_free(ret);
            tags
        }
    }

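    /// Look up the part-of-speech flag of a single word.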
    pub fn lookup_tag(&self, word: &str) -> String {
        let c_word = CString::new(word).unwrap();
        unsafe {
            let ret = jieba_lookup_tag(self.inner, c_word.as_ptr());
            let tag = CStr::from_ptr(ret).to_string_lossy().into_owned();
            jieba_str_free(ret);
            tag
        }
    }

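    /// Add a word to the user dictionary at runtime.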
    pub fn add_user_word(&mut self, word: &str) {
        let c_word = CString::new(word).unwrap();
        unsafe {
            jieba_add_user_word(self.inner, c_word.as_ptr());
        }
    }

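    /// Tokenize the input text, returning each word together with its start
    /// and end offsets in the text.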
    pub fn tokenize(&self, text: &str, mode: TokenizeMode, hmm: bool) -> Vec<Token> {
        let c_text = CString::new(text).unwrap();
        let c_mode = match mode {
            TokenizeMode::Default => JIEBA_TOKENIZE_MODE_DEFAULT,
            TokenizeMode::Search => JIEBA_TOKENIZE_MODE_SEARCH,
        };
        let is_hmm = if hmm { 1 } else { 0 };
        let mut tokens = Vec::new();
        unsafe {
            let ret = jieba_tokenize(self.inner, c_text.as_ptr(), c_mode, is_hmm);
            let mut index = 0;
            let mut c_token = ret.offset(index);
            // The returned array is terminated by a token with length 0.
            while !c_token.is_null() && (*c_token).length > 0 {
                // `offset`/`length` are byte offsets into the UTF-8 text, while
                // `unicode_offset`/`unicode_length` count Unicode characters.
                let start = (*c_token).offset as usize;
                let end = start + (*c_token).length as usize;
                let word = text[start..end].to_string();
                let unicode_start = (*c_token).unicode_offset as usize;
                let unicode_end = unicode_start + (*c_token).unicode_length as usize;
                tokens.push(Token(word, unicode_start, unicode_end));
                index += 1;
                c_token = ret.offset(index);
            }
            jieba_token_free(ret);
        }
        tokens
    }

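    /// Extract the `top_k` keywords from the input text.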
    pub fn extract(&self, text: &str, top_k: usize) -> Vec<String> {
        let c_text = CString::new(text).unwrap();
        unsafe {
            let ret = jieba_extract(self.inner, c_text.as_ptr(), top_k as i32);
            let c_words = slice::from_raw_parts((*ret).words, (*ret).length);
            let words = c_words.iter().map(|s| {
                let word = CStr::from_ptr(*s);
                word.to_string_lossy().into_owned()
            }).collect();
            jieba_words_free(ret);
            words
        }
    }

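    /// Extract the `top_k` keywords from the input text together with their
    /// weights.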
    pub fn extract_with_weight(&self, text: &str, top_k: usize) -> Vec<WordWeight> {
        let c_text = CString::new(text).unwrap();
        let mut words = Vec::new();
        unsafe {
            let ret = jieba_extract_with_weight(self.inner, c_text.as_ptr(), top_k as i32);
            let mut index = 0;
            let mut c_word = ret.offset(index);
            // The returned array is terminated by an entry with a null `word`.
            while !c_word.is_null() && !(*c_word).word.is_null() {
                let word = CStr::from_ptr((*c_word).word).to_string_lossy().into_owned();
                words.push(WordWeight {
                    word,
                    weight: (*c_word).weight,
                });
                index += 1;
                c_word = ret.offset(index);
            }
            jieba_word_weight_free(ret);
        }
        words
    }
}

impl Drop for Jieba {
    fn drop(&mut self) {
        // Free the underlying cjieba handle.
        if !self.inner.is_null() {
            unsafe { jieba_free(self.inner) };
        }
    }
}

// `*mut jieba_t` keeps `Send`/`Sync` from being derived automatically. The
// wrapper only mutates the handle through `&mut self` (`add_user_word`), so
// sharing a `Jieba` across threads is asserted to be safe here.
unsafe impl Send for Jieba {}
unsafe impl Sync for Jieba {}