jieba_rs_siro/lib.rs

//! The Jieba Chinese Word Segmentation Implemented in Rust
//!
//! ## Installation
//!
//! Add it to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! jieba-rs = "0.6"
//! ```
//!
//! Then you are good to go. If you are using Rust 2015 you also have to add `extern crate jieba_rs` to your crate root.
//!
//! ## Example
//!
//! ```rust
//! use jieba_rs::Jieba;
//!
//! let jieba = Jieba::new();
//! let words = jieba.cut("我们中出了一个叛徒", false);
//! assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]);
//! ```
//!
//! ```rust
//! # #[cfg(feature = "tfidf")] {
//! use jieba_rs::Jieba;
//! use jieba_rs::{TFIDF, KeywordExtract};
//!
//! fn main() {
//!     let jieba = Jieba::new();
//!     let keyword_extractor = TFIDF::new_with_jieba(&jieba);
//!     let top_k = keyword_extractor.extract_tags(
//!         "今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃",
//!         3,
//!         vec![],
//!     );
//!     println!("{:?}", top_k);
//! }
//! # }
//! ```
//!
//! ```rust
//! # #[cfg(feature = "textrank")] {
//! use jieba_rs::Jieba;
//! use jieba_rs::{TextRank, KeywordExtract};
//!
//! fn main() {
//!     let jieba = Jieba::new();
//!     let keyword_extractor = TextRank::new_with_jieba(&jieba);
//!     let top_k = keyword_extractor.extract_tags(
//!         "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。",
//!         6,
//!         vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")],
//!     );
//!     println!("{:?}", top_k);
//! }
//! # }
//! ```
//!
//! ## Enabling Additional Features
//!
//! * `default-dict` feature enables the embedded dictionary; this feature is enabled by default
//! * `tfidf` feature enables the TF-IDF keywords extractor
//! * `textrank` feature enables the TextRank keywords extractor
//!
//! ```toml
//! [dependencies]
//! jieba-rs = { version = "0.6", features = ["tfidf", "textrank"] }
//! ```
//!

use lazy_static::lazy_static;

use std::cmp::Ordering;
use std::io::BufRead;

use cedarwood::Cedar;
use hashbrown::HashMap;
use regex::{Match, Matches, Regex};

pub(crate) type FxHashMap<K, V> = HashMap<K, V, fxhash::FxBuildHasher>;

pub use crate::errors::Error;
#[cfg(feature = "textrank")]
pub use crate::keywords::textrank::TextRank;
#[cfg(feature = "tfidf")]
pub use crate::keywords::tfidf::TFIDF;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
pub use crate::keywords::{Keyword, KeywordExtract};

mod errors;
mod hmm;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
mod keywords;
mod sparse_dag;

#[cfg(feature = "default-dict")]
static DEFAULT_DICT: &str = include_str!("data/dict.txt");

use sparse_dag::StaticSparseDAG;

lazy_static! {
    static ref RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%\-]+)").unwrap();
    static ref RE_SKIP_DEFAULT: Regex = Regex::new(r"(\r\n|\s)").unwrap();
    static ref RE_HAN_CUT_ALL: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}]+)").unwrap();
    static ref RE_SKIP_CUT_ALL: Regex = Regex::new(r"[^a-zA-Z0-9+#\n]").unwrap();
}

struct SplitMatches<'r, 't> {
    finder: Matches<'r, 't>,
    text: &'t str,
    last: usize,
    matched: Option<Match<'t>>,
}

impl<'r, 't> SplitMatches<'r, 't> {
    #[inline]
    fn new(re: &'r Regex, text: &'t str) -> SplitMatches<'r, 't> {
        SplitMatches {
            finder: re.find_iter(text),
            text,
            last: 0,
            matched: None,
        }
    }
}

#[derive(Debug)]
pub(crate) enum SplitState<'t> {
    Unmatched(&'t str),
    Matched(Match<'t>),
}

impl<'t> SplitState<'t> {
    #[inline]
    fn into_str(self) -> &'t str {
        match self {
            SplitState::Unmatched(t) => t,
            SplitState::Matched(matched) => matched.as_str(),
        }
    }
}

impl<'r, 't> Iterator for SplitMatches<'r, 't> {
    type Item = SplitState<'t>;

    fn next(&mut self) -> Option<SplitState<'t>> {
        if let Some(matched) = self.matched.take() {
            return Some(SplitState::Matched(matched));
        }
        match self.finder.next() {
            None => {
                if self.last >= self.text.len() {
                    None
                } else {
                    let s = &self.text[self.last..];
                    self.last = self.text.len();
                    Some(SplitState::Unmatched(s))
                }
            }
            Some(m) => {
                if self.last == m.start() {
                    self.last = m.end();
                    Some(SplitState::Matched(m))
                } else {
                    let unmatched = &self.text[self.last..m.start()];
                    self.last = m.end();
                    self.matched = Some(m);
                    Some(SplitState::Unmatched(unmatched))
                }
            }
        }
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizeMode {
    /// Default mode
    Default,
    /// Search mode
    Search,
}

/// A Token
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Token<'a> {
    /// Word of the token
    pub word: &'a str,
    /// Unicode start position of the token
    pub start: usize,
    /// Unicode end position of the token
    pub end: usize,
}

/// A tagged word
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Tag<'a> {
    /// Word
    pub word: &'a str,
    /// Word tag
    pub tag: &'a str,
}

#[derive(Debug, Clone)]
struct Record {
    freq: usize,
    tag: String,
}

impl Record {
    #[inline(always)]
    fn new(freq: usize, tag: String) -> Self {
        Self { freq, tag }
    }
}

/// Jieba segmentation
#[derive(Debug, Clone)]
pub struct Jieba {
    records: Vec<Record>,
    cedar: Cedar,
    total: usize,
    longest_word_len: usize,
    re_han: Regex,
    re_skip: Regex,
    re_han_cut: Regex,
    re_skip_cut: Regex,
}

#[cfg(feature = "default-dict")]
impl Default for Jieba {
    fn default() -> Self {
        Jieba::new()
    }
}

impl Jieba {
    /// Create a new instance with an empty dict
    pub fn empty() -> Self {
        Jieba {
            records: Vec::new(),
            cedar: Cedar::new(),
            total: 0,
            longest_word_len: 0,
            re_han: RE_HAN_DEFAULT.clone(),
            re_skip: RE_SKIP_DEFAULT.clone(),
            re_han_cut: RE_HAN_CUT_ALL.clone(),
            re_skip_cut: RE_SKIP_CUT_ALL.clone(),
        }
    }

    /// Create a new instance with the embedded dict
    ///
    /// Requires the `default-dict` feature to be enabled.
    #[cfg(feature = "default-dict")]
    pub fn new() -> Self {
        use std::io::BufReader;

        let mut instance = Self::empty();
        let mut default_dict = BufReader::new(DEFAULT_DICT.as_bytes());
        instance.load_dict(&mut default_dict).unwrap();
        instance
    }

    /// Set re_han
    pub fn set_re_han(&mut self, re: Regex) {
        self.re_han = re;
    }

    /// Set re_han_cut
    pub fn set_re_han_cut(&mut self, re: Regex) {
        self.re_han_cut = re;
    }

    /// Set re_skip
    pub fn set_re_skip(&mut self, re: Regex) {
        self.re_skip = re;
    }

    /// Set re_skip_cut
    pub fn set_re_skip_cut(&mut self, re: Regex) {
        self.re_skip_cut = re;
    }

    /// Create a new instance with a custom dict
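    ///
    /// ## Example
    ///
    /// A minimal sketch that builds an instance from a tiny in-memory
    /// dictionary (one `word freq tag` entry per line); the entries below are
    /// made up for illustration:
    ///
    /// ```rust
    /// use jieba_rs::Jieba;
    /// use std::io::BufReader;
    ///
    /// let dict = "中出 10000\n出了 5 v";
    /// let jieba = Jieba::with_dict(&mut BufReader::new(dict.as_bytes())).unwrap();
    /// ```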
    pub fn with_dict<R: BufRead>(dict: &mut R) -> Result<Self, Error> {
        let mut instance = Self::empty();
        instance.load_dict(dict)?;
        Ok(instance)
    }

    /// Add a word to the dict, returning its `freq`
    ///
    /// `freq`: if `None`, it will be given by [suggest_freq](#method.suggest_freq)
    ///
    /// `tag`: if `None`, it will be given `""`
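    ///
    /// ## Example
    ///
    /// A small sketch against the bundled dictionary (requires the default
    /// `default-dict` feature); the chosen frequency is arbitrary:
    ///
    /// ```rust
    /// use jieba_rs::Jieba;
    ///
    /// let mut jieba = Jieba::new();
    /// // Raise the frequency of "中出" so the segmenter keeps it as one word
    /// jieba.add_word("中出", Some(10000), None);
    /// let words = jieba.cut("我们中出了一个叛徒", false);
    /// assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒"]);
    /// ```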
    pub fn add_word(&mut self, word: &str, freq: Option<usize>, tag: Option<&str>) -> usize {
        let freq = freq.unwrap_or_else(|| self.suggest_freq(word));
        let tag = tag.unwrap_or("");

        match self.cedar.exact_match_search(word) {
            Some((word_id, _, _)) => {
                let old_freq = self.records[word_id as usize].freq;
                self.records[word_id as usize].freq = freq;

                self.total += freq;
                self.total -= old_freq;
            }
            None => {
                self.records.push(Record::new(freq, String::from(tag)));
                let word_id = (self.records.len() - 1) as i32;

                self.cedar.update(word, word_id);
                self.total += freq;
            }
        };

        let curr_word_len = word.chars().count();
        if self.longest_word_len < curr_word_len {
            self.longest_word_len = curr_word_len;
        }

        freq
    }

    /// Load dictionary
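    ///
    /// ## Example
    ///
    /// A minimal sketch of the expected line format, `word [freq] [tag]`
    /// (frequency and tag are optional); the entry below is only for
    /// illustration:
    ///
    /// ```rust
    /// use jieba_rs::Jieba;
    /// use std::io::BufReader;
    ///
    /// let mut jieba = Jieba::new();
    /// let userdict = "中出 10000";
    /// jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
    /// assert_eq!(jieba.cut("我们中出了一个叛徒", false), vec!["我们", "中出", "了", "一个", "叛徒"]);
    /// ```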
    pub fn load_dict<R: BufRead>(&mut self, dict: &mut R) -> Result<(), Error> {
        let mut buf = String::new();
        self.total = 0;
        self.longest_word_len = 0;

        let mut line_no = 0;
        while dict.read_line(&mut buf)? > 0 {
            {
                line_no += 1;
                let mut iter = buf.trim().split_whitespace();
                if let Some(word) = iter.next() {
                    let freq = iter
                        .next()
                        .map(|x| {
                            x.parse::<usize>().map_err(|e| {
                                Error::InvalidDictEntry(format!(
                                    "line {} `{}` frequency {} is not a valid integer: {}",
                                    line_no, buf, x, e
                                ))
                            })
                        })
                        .unwrap_or(Ok(0))?;
                    let tag = iter.next().unwrap_or("");

                    let curr_word_len = word.chars().count();
                    if self.longest_word_len < curr_word_len {
                        self.longest_word_len = curr_word_len;
                    }

                    match self.cedar.exact_match_search(word) {
                        Some((word_id, _, _)) => {
                            self.records[word_id as usize].freq = freq;
                        }
                        None => {
                            self.records.push(Record::new(freq, String::from(tag)));
                            let word_id = (self.records.len() - 1) as i32;
                            self.cedar.update(word, word_id);
                        }
                    };
                }
            }
            buf.clear();
        }
        self.total = self.records.iter().map(|n| n.freq).sum();

        Ok(())
    }

    fn get_word_freq(&self, word: &str, default: usize) -> usize {
        match self.cedar.exact_match_search(word) {
            Some((word_id, _, _)) => self.records[word_id as usize].freq,
            _ => default,
        }
    }

    /// Suggest word frequency to force the characters in a word to be joined or split.
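    ///
    /// ## Example
    ///
    /// A small sketch; the exact numbers depend on the dictionary bundled with
    /// the `default-dict` feature:
    ///
    /// ```rust
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// // 348 with the bundled dictionary at the time of writing
    /// let freq = jieba.suggest_freq("中出");
    /// assert!(freq > 0);
    /// ```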
    pub fn suggest_freq(&self, segment: &str) -> usize {
        let logtotal = (self.total as f64).ln();
        let logfreq = self.cut(segment, false).iter().fold(0f64, |freq, word| {
            freq + (self.get_word_freq(word, 1) as f64).ln() - logtotal
        });
        std::cmp::max((logfreq + logtotal).exp() as usize + 1, self.get_word_freq(segment, 1))
    }

    #[allow(clippy::ptr_arg)]
    fn calc(&self, sentence: &str, dag: &StaticSparseDAG, route: &mut Vec<(f64, usize)>) {
        let str_len = sentence.len();

        if str_len + 1 > route.len() {
            route.resize(str_len + 1, (0.0, 0));
        }

        let logtotal = (self.total as f64).ln();
        let mut prev_byte_start = str_len;
        let curr = sentence.char_indices().map(|x| x.0).rev();
        for byte_start in curr {
            let pair = dag
                .iter_edges(byte_start)
                .map(|byte_end| {
                    let wfrag = if byte_end == str_len {
                        &sentence[byte_start..]
                    } else {
                        &sentence[byte_start..byte_end]
                    };

                    let freq = if let Some((word_id, _, _)) = self.cedar.exact_match_search(wfrag) {
                        self.records[word_id as usize].freq
                    } else {
                        1
                    };

                    ((freq as f64).ln() - logtotal + route[byte_end].0, byte_end)
                })
                .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Equal));

            if let Some(p) = pair {
                route[byte_start] = p;
            } else {
                let byte_end = prev_byte_start;
                let freq = 1;
                route[byte_start] = ((freq as f64).ln() - logtotal + route[byte_end].0, byte_end);
            }

            prev_byte_start = byte_start;
        }
    }

    fn dag(&self, sentence: &str, dag: &mut StaticSparseDAG) {
        for (byte_start, _) in sentence.char_indices().peekable() {
            dag.start(byte_start);
            let haystack = &sentence[byte_start..];

            for (_, end_index) in self.cedar.common_prefix_iter(haystack) {
                dag.insert(end_index + byte_start + 1);
            }

            dag.commit();
        }
    }

    fn cut_all_internal<'a>(&self, sentence: &'a str, words: &mut Vec<&'a str>) {
        let str_len = sentence.len();
        let mut dag = StaticSparseDAG::with_size_hint(sentence.len());
        self.dag(sentence, &mut dag);

        let curr = sentence.char_indices().map(|x| x.0);
        for byte_start in curr {
            for byte_end in dag.iter_edges(byte_start) {
                let word = if byte_end == str_len {
                    &sentence[byte_start..]
                } else {
                    &sentence[byte_start..byte_end]
                };

                words.push(word)
            }
        }
    }

    fn cut_dag_no_hmm<'a>(
        &self,
        sentence: &'a str,
        words: &mut Vec<&'a str>,
        route: &mut Vec<(f64, usize)>,
        dag: &mut StaticSparseDAG,
    ) {
        self.dag(sentence, dag);
        self.calc(sentence, dag, route);
        let mut x = 0;
        let mut left: Option<usize> = None;

        while x < sentence.len() {
            let y = route[x].1;
            let l_str = if y < sentence.len() {
                &sentence[x..y]
            } else {
                &sentence[x..]
            };

            if l_str.chars().count() == 1 && l_str.chars().all(|ch| ch.is_ascii_alphanumeric()) {
                if left.is_none() {
                    left = Some(x);
                }
            } else {
                if let Some(byte_start) = left {
                    let word = &sentence[byte_start..x];
                    words.push(word);
                    left = None;
                }

                let word = if y < sentence.len() {
                    &sentence[x..y]
                } else {
                    &sentence[x..]
                };

                words.push(word);
            }
            x = y;
        }

        if let Some(byte_start) = left {
            let word = &sentence[byte_start..];
            words.push(word);
        }

        dag.clear();
        route.clear();
    }

    #[allow(non_snake_case, clippy::too_many_arguments)]
    fn cut_dag_hmm<'a>(
        &self,
        sentence: &'a str,
        words: &mut Vec<&'a str>,
        route: &mut Vec<(f64, usize)>,
        dag: &mut StaticSparseDAG,
        V: &mut Vec<f64>,
        prev: &mut Vec<Option<hmm::Status>>,
        path: &mut Vec<hmm::Status>,
    ) {
        self.dag(sentence, dag);
        self.calc(sentence, dag, route);
        let mut x = 0;
        let mut left: Option<usize> = None;

        while x < sentence.len() {
            let y = route[x].1;

            if sentence[x..y].chars().count() == 1 {
                if left.is_none() {
                    left = Some(x);
                }
            } else {
                if let Some(byte_start) = left {
                    let byte_end = x;
                    let word = if byte_end < sentence.len() {
                        &sentence[byte_start..byte_end]
                    } else {
                        &sentence[byte_start..]
                    };

                    if word.chars().count() == 1 {
                        words.push(word);
                    } else if self.cedar.exact_match_search(word).is_none() {
                        hmm::cut_with_allocated_memory(word, words, V, prev, path);
                    } else {
                        let mut word_indices = word.char_indices().map(|x| x.0).peekable();
                        while let Some(byte_start) = word_indices.next() {
                            if let Some(byte_end) = word_indices.peek() {
                                words.push(&word[byte_start..*byte_end]);
                            } else {
                                words.push(&word[byte_start..]);
                            }
                        }
                    }
                    left = None;
                }
                let word = if y < sentence.len() {
                    &sentence[x..y]
                } else {
                    &sentence[x..]
                };
                words.push(word);
            }
            x = y;
        }

        if let Some(byte_start) = left {
            let word = &sentence[byte_start..];

            if word.chars().count() == 1 {
                words.push(word);
            } else if self.cedar.exact_match_search(word).is_none() {
                hmm::cut(word, words);
            } else {
                let mut word_indices = word.char_indices().map(|x| x.0).peekable();
                while let Some(byte_start) = word_indices.next() {
                    if let Some(byte_end) = word_indices.peek() {
                        words.push(&word[byte_start..*byte_end]);
                    } else {
                        words.push(&word[byte_start..]);
                    }
                }
            }
        }

        dag.clear();
        route.clear();
    }

    #[allow(non_snake_case)]
    fn cut_internal<'a>(&self, sentence: &'a str, cut_all: bool, hmm: bool) -> Vec<&'a str> {
        let heuristic_capacity = sentence.len() / 2;
        let mut words = Vec::with_capacity(heuristic_capacity);
        let re_han: &Regex = if cut_all { &self.re_han_cut } else { &self.re_han };
        let re_skip: &Regex = if cut_all { &self.re_skip_cut } else { &self.re_skip };
        let splitter = SplitMatches::new(&re_han, sentence);
        let mut route = Vec::with_capacity(heuristic_capacity);
        let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);

        let R = 4;
        let C = sentence.chars().count();
        let mut V = if hmm { vec![0.0; R * C] } else { Vec::new() };
        let mut prev: Vec<Option<hmm::Status>> = if hmm { vec![None; R * C] } else { Vec::new() };
        let mut path: Vec<hmm::Status> = if hmm { vec![hmm::Status::B; C] } else { Vec::new() };

        for state in splitter {
            match state {
                SplitState::Matched(_) => {
                    let block = state.into_str();
                    assert!(!block.is_empty());

                    if cut_all {
                        self.cut_all_internal(block, &mut words);
                    } else if hmm {
                        self.cut_dag_hmm(block, &mut words, &mut route, &mut dag, &mut V, &mut prev, &mut path);
                    } else {
                        self.cut_dag_no_hmm(block, &mut words, &mut route, &mut dag);
                    }
                }
                SplitState::Unmatched(_) => {
                    let block = state.into_str();
                    assert!(!block.is_empty());

                    let skip_splitter = SplitMatches::new(&re_skip, block);
                    for skip_state in skip_splitter {
                        let word = skip_state.into_str();
                        if word.is_empty() {
                            continue;
                        }
                        if cut_all || re_skip.is_match(word) {
                            words.push(word);
                        } else {
                            let mut word_indices = word.char_indices().map(|x| x.0).peekable();
                            while let Some(byte_start) = word_indices.next() {
                                if let Some(byte_end) = word_indices.peek() {
                                    words.push(&word[byte_start..*byte_end]);
                                } else {
                                    words.push(&word[byte_start..]);
                                }
                            }
                        }
                    }
                }
            }
        }
        words
    }

    /// Cut the input text
    ///
    /// ## Params
    ///
    /// `sentence`: input text
    ///
    /// `hmm`: enable HMM or not
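    ///
    /// ## Example
    ///
    /// A small contrast of the two modes, with outputs taken from this
    /// crate's test suite (requires the default `default-dict` feature):
    ///
    /// ```rust
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// // without HMM, low-frequency words may be split into single characters
    /// assert_eq!(jieba.cut("我们中出了一个叛徒", false), vec!["我们", "中", "出", "了", "一个", "叛徒"]);
    /// // with HMM, the Viterbi pass recovers "中出" as one word
    /// assert_eq!(jieba.cut("我们中出了一个叛徒", true), vec!["我们", "中出", "了", "一个", "叛徒"]);
    /// ```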
    pub fn cut<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<&'a str> {
        self.cut_internal(sentence, false, hmm)
    }

    /// Cut the input text, return all possible words
    ///
    /// ## Params
    ///
    /// `sentence`: input text
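    ///
    /// ## Example
    ///
    /// A small sketch; the expected output is copied from this crate's
    /// `test_cut_all` test:
    ///
    /// ```rust
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// let words = jieba.cut_all("abc网球拍卖会def");
    /// assert_eq!(
    ///     words,
    ///     vec!["abc", "网", "网球", "网球拍", "球", "球拍", "拍", "拍卖", "拍卖会", "卖", "会", "def"]
    /// );
    /// ```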
    pub fn cut_all<'a>(&self, sentence: &'a str) -> Vec<&'a str> {
        self.cut_internal(sentence, true, false)
    }

    /// Cut the input text in search mode
    ///
    /// ## Params
    ///
    /// `sentence`: input text
    ///
    /// `hmm`: enable HMM or not
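    ///
    /// ## Example
    ///
    /// A small sketch; the expected output is copied from this crate's
    /// `test_cut_for_search` test:
    ///
    /// ```rust
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// let words = jieba.cut_for_search("南京市长江大桥", true);
    /// assert_eq!(words, vec!["南京", "京市", "南京市", "长江", "大桥", "长江大桥"]);
    /// ```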
    pub fn cut_for_search<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<&'a str> {
        let words = self.cut(sentence, hmm);
        let mut new_words = Vec::with_capacity(words.len());
        for word in words {
            let char_indices: Vec<usize> = word.char_indices().map(|x| x.0).collect();
            let char_count = char_indices.len();
            if char_count > 2 {
                for i in 0..char_count - 1 {
                    let byte_start = char_indices[i];
                    let gram2 = if i + 2 < char_count {
                        &word[byte_start..char_indices[i + 2]]
                    } else {
                        &word[byte_start..]
                    };
                    if self.cedar.exact_match_search(gram2).is_some() {
                        new_words.push(gram2);
                    }
                }
            }
            if char_count > 3 {
                for i in 0..char_count - 2 {
                    let byte_start = char_indices[i];
                    let gram3 = if i + 3 < char_count {
                        &word[byte_start..char_indices[i + 3]]
                    } else {
                        &word[byte_start..]
                    };
                    if self.cedar.exact_match_search(gram3).is_some() {
                        new_words.push(gram3);
                    }
                }
            }
            new_words.push(word);
        }
        new_words
    }

    /// Tokenize
    ///
    /// ## Params
    ///
    /// `sentence`: input text
    ///
    /// `mode`: tokenize mode
    ///
    /// `hmm`: enable HMM or not
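    ///
    /// ## Example
    ///
    /// A small sketch with Unicode (character-based) offsets; the expected
    /// tokens are copied from this crate's `test_tokenize` test:
    ///
    /// ```rust
    /// use jieba_rs::{Jieba, Token, TokenizeMode};
    ///
    /// let jieba = Jieba::new();
    /// let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Default, false);
    /// assert_eq!(
    ///     tokens,
    ///     vec![
    ///         Token { word: "南京市", start: 0, end: 3 },
    ///         Token { word: "长江大桥", start: 3, end: 7 },
    ///     ]
    /// );
    /// ```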
    pub fn tokenize<'a>(&self, sentence: &'a str, mode: TokenizeMode, hmm: bool) -> Vec<Token<'a>> {
        let words = self.cut(sentence, hmm);
        let mut tokens = Vec::with_capacity(words.len());
        let mut start = 0;
        match mode {
            TokenizeMode::Default => {
                for word in words {
                    let width = word.chars().count();
                    tokens.push(Token {
                        word,
                        start,
                        end: start + width,
                    });
                    start += width;
                }
            }
            TokenizeMode::Search => {
                for word in words {
                    let width = word.chars().count();
                    if width > 2 {
                        let char_indices: Vec<usize> = word.char_indices().map(|x| x.0).collect();
                        for i in 0..width - 1 {
                            let byte_start = char_indices[i];
                            let gram2 = if i + 2 < width {
                                &word[byte_start..char_indices[i + 2]]
                            } else {
                                &word[byte_start..]
                            };
                            if self.cedar.exact_match_search(gram2).is_some() {
                                tokens.push(Token {
                                    word: gram2,
                                    start: start + i,
                                    end: start + i + 2,
                                });
                            }
                        }
                        if width > 3 {
                            for i in 0..width - 2 {
                                let byte_start = char_indices[i];
                                let gram3 = if i + 3 < width {
                                    &word[byte_start..char_indices[i + 3]]
                                } else {
                                    &word[byte_start..]
                                };
                                if self.cedar.exact_match_search(gram3).is_some() {
                                    tokens.push(Token {
                                        word: gram3,
                                        start: start + i,
                                        end: start + i + 3,
                                    });
                                }
                            }
                        }
                    }
                    tokens.push(Token {
                        word,
                        start,
                        end: start + width,
                    });
                    start += width;
                }
            }
        }
        tokens
    }

    /// Tag the input text
    ///
    /// ## Params
    ///
    /// `sentence`: input text
    ///
    /// `hmm`: enable HMM or not
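    ///
    /// ## Example
    ///
    /// A minimal usage sketch (the tags themselves depend on the bundled
    /// dictionary):
    ///
    /// ```rust
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// let tags = jieba.tag("我是拖拉机学院手扶拖拉机专业的", true);
    /// for t in &tags {
    ///     println!("{}/{}", t.word, t.tag);
    /// }
    /// ```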
    pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag> {
        let words = self.cut(sentence, hmm);
        words
            .into_iter()
            .map(|word| {
                if let Some((word_id, _, _)) = self.cedar.exact_match_search(word) {
                    let t = &self.records[word_id as usize].tag;
                    return Tag { word, tag: t };
                }
                let mut eng = 0;
                let mut m = 0;
                for chr in word.chars() {
                    if chr.is_ascii_alphanumeric() {
                        eng += 1;
                        if chr.is_ascii_digit() {
                            m += 1;
                        }
                    }
                }
                let tag = if eng == 0 {
                    "x"
                } else if eng == m {
                    "m"
                } else {
                    "eng"
                };
                Tag { word, tag }
            })
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use super::{Jieba, SplitMatches, SplitState, Tag, Token, TokenizeMode, RE_HAN_DEFAULT};
    use std::io::BufReader;

    #[test]
    fn test_init_with_default_dict() {
        let _ = Jieba::new();
    }

    #[test]
    fn test_split_matches() {
        let re_han = &*RE_HAN_DEFAULT;
        let splitter = SplitMatches::new(
            &re_han,
            "👪 PS: 我觉得开源有一个好处,就是能够敦促自己不断改进 👪,避免敞帚自珍",
        );
        for state in splitter {
            match state {
                SplitState::Matched(_) => {
                    let block = state.into_str();
                    assert_eq!(block.is_empty(), false);
                }
                SplitState::Unmatched(_) => {
                    let block = state.into_str();
                    assert_eq!(block.is_empty(), false);
                }
            }
        }
    }

    #[test]
    fn test_split_matches_against_unicode_sip() {
        let re_han = &*RE_HAN_DEFAULT;
        let splitter = SplitMatches::new(&re_han, "讥䶯䶰䶱䶲䶳䶴䶵𦡦");

        let result: Vec<&str> = splitter.map(|x| x.into_str()).collect();
        assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]);
    }

    #[test]
    fn test_cut_all() {
        let jieba = Jieba::new();
        let words = jieba.cut_all("abc网球拍卖会def");
        assert_eq!(
            words,
            vec![
                "abc",
                "网",
                "网球",
                "网球拍",
                "球",
                "球拍",
                "拍",
                "拍卖",
                "拍卖会",
                "卖",
                "会",
                "def",
            ]
        );

        // The cut_all behavior of the de-facto Python implementation is loosely defined,
        // and its answer "我, 来到, 北京, 清华, 清华大学, 华大, 大学" looks odd since it drops
        // single-character words even though they are part of the DAG candidates.
        // For example, it includes "华大" but not "清" and "学".
        let words = jieba.cut_all("我来到北京清华大学");
        assert_eq!(
            words,
            vec![
                "我",
                "来",
                "来到",
                "到",
                "北",
                "北京",
                "京",
                "清",
                "清华",
                "清华大学",
                "华",
                "华大",
                "大",
                "大学",
                "学",
            ]
        );
    }

    #[test]
    fn test_cut_no_hmm() {
        let jieba = Jieba::new();
        let words = jieba.cut("abc网球拍卖会def", false);
        assert_eq!(words, vec!["abc", "网球", "拍卖会", "def"]);
    }

    #[test]
    fn test_cut_with_hmm() {
        let jieba = Jieba::new();
        let words = jieba.cut("我们中出了一个叛徒", false);
        assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]);
        let words = jieba.cut("我们中出了一个叛徒", true);
        assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒"]);
        let words = jieba.cut("我们中出了一个叛徒👪", true);
        assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒", "👪"]);

        let words = jieba.cut("我来到北京清华大学", true);
        assert_eq!(words, vec!["我", "来到", "北京", "清华大学"]);

        let words = jieba.cut("他来到了网易杭研大厦", true);
        assert_eq!(words, vec!["他", "来到", "了", "网易", "杭研", "大厦"]);
    }

    #[test]
    fn test_cut_weicheng() {
        static WEICHENG_TXT: &str = include_str!("../examples/weicheng/src/weicheng.txt");
        let jieba = Jieba::new();
        for line in WEICHENG_TXT.split('\n') {
            let _ = jieba.cut(line, true);
        }
    }

    #[test]
    fn test_cut_for_search() {
        let jieba = Jieba::new();
        let words = jieba.cut_for_search("南京市长江大桥", true);
        assert_eq!(words, vec!["南京", "京市", "南京市", "长江", "大桥", "长江大桥"]);

        let words = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true);

        // The Python implementation silently filters out ",", but we include it in the output
        // here to let library users decide on their own filtering strategy.
        assert_eq!(
            words,
            vec![
                "小明",
                "硕士",
                "毕业",
                "于",
                "中国",
                "科学",
                "学院",
                "科学院",
                "中国科学院",
                "计算",
                "计算所",
                ",",
                "后",
                "在",
                "日本",
                "京都",
                "大学",
                "日本京都大学",
                "深造",
            ]
        );
    }

    #[test]
    fn test_tag() {
        let jieba = Jieba::new();
        let tags = jieba.tag(
            "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。",
            true,
        );
        assert_eq!(
            tags,
            vec![
                Tag { word: "我", tag: "r" },
                Tag { word: "是", tag: "v" },
                Tag {
                    word: "拖拉机",
                    tag: "n",
                },
                Tag {
                    word: "学院",
                    tag: "n",
                },
                Tag {
                    word: "手扶拖拉机",
                    tag: "n",
                },
                Tag {
                    word: "专业",
                    tag: "n",
                },
                Tag { word: "的", tag: "uj" },
                Tag { word: "。", tag: "x" },
                Tag {
                    word: "不用",
                    tag: "v",
                },
                Tag {
                    word: "多久",
                    tag: "m",
                },
                Tag { word: ",", tag: "x" },
                Tag { word: "我", tag: "r" },
                Tag { word: "就", tag: "d" },
                Tag { word: "会", tag: "v" },
                Tag {
                    word: "升职",
                    tag: "v",
                },
                Tag {
                    word: "加薪",
                    tag: "nr",
                },
                Tag { word: ",", tag: "x" },
                Tag {
                    word: "当上",
                    tag: "t",
                },
                Tag {
                    word: "CEO",
                    tag: "eng",
                },
                Tag { word: ",", tag: "x" },
                Tag {
                    word: "走上",
                    tag: "v",
                },
                Tag {
                    word: "人生",
                    tag: "n",
                },
                Tag {
                    word: "巅峰",
                    tag: "n",
                },
                Tag { word: "。", tag: "x" },
            ]
        );

        let tags = jieba.tag("今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。", true);
        assert_eq!(
            tags,
            vec![
                Tag {
                    word: "今天",
                    tag: "t",
                },
                Tag {
                    word: "纽约",
                    tag: "ns",
                },
                Tag { word: "的", tag: "uj" },
                Tag {
                    word: "天气",
                    tag: "n",
                },
                Tag {
                    word: "真好",
                    tag: "d",
                },
                Tag { word: "啊", tag: "zg" },
                Tag { word: ",", tag: "x" },
                Tag {
                    word: "京华",
                    tag: "nz",
                },
                Tag {
                    word: "大酒店",
                    tag: "n",
                },
                Tag { word: "的", tag: "uj" },
                Tag {
                    word: "张尧",
                    tag: "x",
                }, // XXX: missing in dict
                Tag {
                    word: "经理",
                    tag: "n",
                },
                Tag { word: "吃", tag: "v" },
                Tag { word: "了", tag: "ul" },
                Tag {
                    word: "一只",
                    tag: "m",
                },
                Tag {
                    word: "北京烤鸭",
                    tag: "n",
                },
                Tag { word: "。", tag: "x" },
            ]
        );
    }

    #[test]
    fn test_tokenize() {
        let jieba = Jieba::new();
        let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "南京市",
                    start: 0,
                    end: 3,
                },
                Token {
                    word: "长江大桥",
                    start: 3,
                    end: 7,
                },
            ]
        );

        let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Search, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "南京",
                    start: 0,
                    end: 2,
                },
                Token {
                    word: "京市",
                    start: 1,
                    end: 3,
                },
                Token {
                    word: "南京市",
                    start: 0,
                    end: 3,
                },
                Token {
                    word: "长江",
                    start: 3,
                    end: 5,
                },
                Token {
                    word: "大桥",
                    start: 5,
                    end: 7,
                },
                Token {
                    word: "长江大桥",
                    start: 3,
                    end: 7,
                },
            ]
        );

        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2,
                },
                Token {
                    word: "中",
                    start: 2,
                    end: 3,
                },
                Token {
                    word: "出",
                    start: 3,
                    end: 4,
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5,
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7,
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9,
                },
            ]
        );
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2,
                },
                Token {
                    word: "中出",
                    start: 2,
                    end: 4,
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5,
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7,
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9,
                },
            ]
        );

        let tokens = jieba.tokenize("永和服装饰品有限公司", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "永和",
                    start: 0,
                    end: 2,
                },
                Token {
                    word: "服装",
                    start: 2,
                    end: 4,
                },
                Token {
                    word: "饰品",
                    start: 4,
                    end: 6,
                },
                Token {
                    word: "有限公司",
                    start: 6,
                    end: 10,
                },
            ]
        );
    }

    #[test]
    fn test_userdict() {
        let mut jieba = Jieba::new();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2,
                },
                Token {
                    word: "中",
                    start: 2,
                    end: 3,
                },
                Token {
                    word: "出",
                    start: 3,
                    end: 4,
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5,
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7,
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9,
                },
            ]
        );
        let userdict = "中出 10000";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2,
                },
                Token {
                    word: "中出",
                    start: 2,
                    end: 4,
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5,
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7,
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9,
                },
            ]
        );
    }

    #[test]
    fn test_userdict_hmm() {
        let mut jieba = Jieba::new();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2,
                },
                Token {
                    word: "中出",
                    start: 2,
                    end: 4,
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5,
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7,
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9,
                },
            ]
        );
        let userdict = "出了 10000";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2,
                },
                Token {
                    word: "中",
                    start: 2,
                    end: 3,
                },
                Token {
                    word: "出了",
                    start: 3,
                    end: 5,
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7,
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9,
                },
            ]
        );
    }

    #[test]
    fn test_userdict_error() {
        let mut jieba = Jieba::empty();
        let userdict = "出了 not_a_int";
        let ret = jieba.load_dict(&mut BufReader::new(userdict.as_bytes()));
        assert!(ret.is_err());
    }

    #[test]
    fn test_suggest_freq() {
        // NOTE: the following behaviors are aligned with the original Jieba

        let mut jieba = Jieba::new();
        // These values were calculated by the original Jieba
        assert_eq!(jieba.suggest_freq("中出"), 348);
        assert_eq!(jieba.suggest_freq("出了"), 1263);

        // The freq in dict.txt was 3, which becomes 300 after loading the user dict
        let userdict = "中出 300";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        // But it's still less than the calculated freq 348
        assert_eq!(jieba.suggest_freq("中出"), 348);

        let userdict = "中出 500";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        // Now it's significant enough
        assert_eq!(jieba.suggest_freq("中出"), 500)
    }

    #[test]
    fn test_custom_lower_freq() {
        let mut jieba = Jieba::new();

        jieba.add_word("测试", Some(2445), None);
        jieba.add_word("测试", Some(10), None);
        let words = jieba.cut("测试", false);
        assert_eq!(words, vec!["测试"]);
    }

    #[test]
    fn test_cut_dag_no_hmm_against_string_with_sip() {
        let mut jieba = Jieba::empty();

        // add fake words into the dictionary
        jieba.add_word("䶴䶵𦡦", Some(1000), None);
        jieba.add_word("讥䶯䶰䶱䶲䶳", Some(1000), None);

        let words = jieba.cut("讥䶯䶰䶱䶲䶳䶴䶵𦡦", false);
        assert_eq!(words, vec!["讥䶯䶰䶱䶲䶳", "䶴䶵𦡦"]);
    }

    #[test]
    fn test_add_custom_word_with_underscore() {
        let mut jieba = Jieba::empty();
        jieba.add_word("田-女士", Some(42), Some("n"));
        let words = jieba.cut("市民田-女士急匆匆", false);
        assert_eq!(words, vec!["市", "民", "田-女士", "急", "匆", "匆"]);
    }
}
1476}