jieba_rs/
lib.rs

//! The Jieba Chinese Word Segmentation Implemented in Rust
//!
//! ## Installation
//!
//! Add it to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! jieba-rs = "0.7"
//! ```
//!
//! then you are good to go. If you are using Rust 2015, you also have to add ``extern crate jieba_rs`` to your crate root.
//!
//! ## Example
//!
//! ```rust
//! use jieba_rs::Jieba;
//!
//! let jieba = Jieba::new();
//! let words = jieba.cut("我们中出了一个叛徒", false);
//! assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]);
//! ```
//!
//! ```rust
//! # #[cfg(feature = "tfidf")] {
//! use jieba_rs::Jieba;
//! use jieba_rs::{TfIdf, KeywordExtract};
//!
//! fn main() {
//!     let jieba = Jieba::new();
//!     let keyword_extractor = TfIdf::default();
//!     let top_k = keyword_extractor.extract_keywords(
//!         &jieba,
//!         "今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。后天纽约的天气不好,昨天纽约的天气也不好,北京烤鸭真好吃",
//!         3,
//!         vec![],
//!     );
//!     println!("{:?}", top_k);
//! }
//! # }
//! ```
//!
//! ```rust
//! # #[cfg(feature = "textrank")] {
//! use jieba_rs::Jieba;
//! use jieba_rs::{TextRank, KeywordExtract};
//!
//! fn main() {
//!     let jieba = Jieba::new();
//!     let keyword_extractor = TextRank::default();
//!     let top_k = keyword_extractor.extract_keywords(
//!         &jieba,
//!         "此外,公司拟对全资子公司吉林欧亚置业有限公司增资4.3亿元,增资后,吉林欧亚置业注册资本由7000万元增加到5亿元。吉林欧亚置业主要经营范围为房地产开发及百货零售等业务。目前在建吉林欧亚城市商业综合体项目。2013年,实现营业收入0万元,实现净利润-139.13万元。",
//!         6,
//!         vec![String::from("ns"), String::from("n"), String::from("vn"), String::from("v")],
//!     );
//!     println!("{:?}", top_k);
//! }
//! # }
//! ```
//!
//! ## Enabling Additional Features
//!
//! * `default-dict` feature enables the embedded dictionary; this feature is enabled by default
//! * `tfidf` feature enables the TF-IDF keyword extractor
//! * `textrank` feature enables the TextRank keyword extractor
//!
//! ```toml
//! [dependencies]
//! jieba-rs = { version = "0.7", features = ["tfidf", "textrank"] }
//! ```
//!

use include_flate::flate;

use std::cmp::Ordering;
use std::collections::HashMap;
use std::io::BufRead;

use cedarwood::Cedar;
use regex::{Match, Matches, Regex};

pub(crate) type FxHashMap<K, V> = HashMap<K, V, rustc_hash::FxBuildHasher>;

pub use crate::errors::Error;
#[cfg(feature = "textrank")]
pub use crate::keywords::textrank::TextRank;
#[cfg(feature = "tfidf")]
pub use crate::keywords::tfidf::TfIdf;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
pub use crate::keywords::{DEFAULT_STOP_WORDS, Keyword, KeywordExtract, KeywordExtractConfig};

mod errors;
mod hmm;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
mod keywords;
mod sparse_dag;

#[cfg(feature = "default-dict")]
flate!(static DEFAULT_DICT: str from "src/data/dict.txt");

use sparse_dag::StaticSparseDAG;

thread_local! {
    static RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%\-]+)").unwrap();
    static RE_SKIP_DEFAULT: Regex = Regex::new(r"(\r\n|\s)").unwrap();
    static RE_HAN_CUT_ALL: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}]+)").unwrap();
    static RE_SKIP_CUT_ALL: Regex = Regex::new(r"[^a-zA-Z0-9+#\n]").unwrap();
    static HMM_CONTEXT: std::cell::RefCell<hmm::HmmContext> = std::cell::RefCell::new(hmm::HmmContext::default());
}
/// An iterator over a text that alternates between regex-matched segments
/// and the unmatched gaps between them.
struct SplitMatches<'r, 't> {
    finder: Matches<'r, 't>,
    text: &'t str,
    last: usize,
    matched: Option<Match<'t>>,
}

impl<'r, 't> SplitMatches<'r, 't> {
    #[inline]
    fn new(re: &'r Regex, text: &'t str) -> SplitMatches<'r, 't> {
        SplitMatches {
            finder: re.find_iter(text),
            text,
            last: 0,
            matched: None,
        }
    }
}

#[derive(Debug)]
pub(crate) enum SplitState<'t> {
    Unmatched(&'t str),
    Matched(Match<'t>),
}

impl<'t> SplitState<'t> {
    #[inline]
    fn as_str(&self) -> &'t str {
        match self {
            SplitState::Unmatched(t) => t,
            SplitState::Matched(matched) => matched.as_str(),
        }
    }

    #[inline]
    pub fn is_matched(&self) -> bool {
        matches!(self, SplitState::Matched(_))
    }
}

impl<'t> Iterator for SplitMatches<'_, 't> {
    type Item = SplitState<'t>;

    fn next(&mut self) -> Option<SplitState<'t>> {
        if let Some(matched) = self.matched.take() {
            return Some(SplitState::Matched(matched));
        }
        match self.finder.next() {
            None => {
                if self.last >= self.text.len() {
                    None
                } else {
                    let s = &self.text[self.last..];
                    self.last = self.text.len();
                    Some(SplitState::Unmatched(s))
                }
            }
            Some(m) => {
                if self.last == m.start() {
                    self.last = m.end();
                    Some(SplitState::Matched(m))
                } else {
                    let unmatched = &self.text[self.last..m.start()];
                    self.last = m.end();
                    self.matched = Some(m);
                    Some(SplitState::Unmatched(unmatched))
                }
            }
        }
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizeMode {
    /// Default mode
    Default,
    /// Search mode
    Search,
}

/// A Token
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Token<'a> {
    /// Word of the token
    pub word: &'a str,
    /// Unicode start position of the token
    pub start: usize,
    /// Unicode end position of the token
    pub end: usize,
}

/// A tagged word
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Tag<'a> {
    /// Word
    pub word: &'a str,
    /// Word tag
    pub tag: &'a str,
}

#[derive(Debug, Clone)]
struct Record {
    freq: usize,
    tag: String,
}

impl Record {
    #[inline(always)]
    fn new(freq: usize, tag: String) -> Self {
        Self { freq, tag }
    }
}

/// Jieba segmentation
#[derive(Debug, Clone)]
pub struct Jieba {
    records: Vec<Record>,
    cedar: Cedar,
    total: usize,
}

#[cfg(feature = "default-dict")]
impl Default for Jieba {
    fn default() -> Self {
        Jieba::new()
    }
}

impl Jieba {
    /// Create a new instance with an empty dict
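    ///
    /// # Examples
    ///
    /// ```
    /// use jieba_rs::Jieba;
    ///
    /// // An empty instance has no entries until a dict is loaded.
    /// let jieba = Jieba::empty();
    /// assert!(!jieba.has_word("中国"));
    /// ```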
    pub fn empty() -> Self {
        Jieba {
            records: Vec::new(),
            cedar: Cedar::new(),
            total: 0,
        }
    }

    /// Create a new instance with the embedded dict
    ///
    /// Requires `default-dict` feature to be enabled.
    #[cfg(feature = "default-dict")]
    pub fn new() -> Self {
        let mut instance = Self::empty();
        instance.load_default_dict();
        instance
    }

    /// Create a new instance with dict
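    ///
    /// # Examples
    ///
    /// A minimal sketch: build an instance from an in-memory dict whose lines
    /// are `word freq [tag]`:
    ///
    /// ```
    /// use jieba_rs::Jieba;
    /// use std::io::BufReader;
    ///
    /// let dict = "我们 100\n中出 10000 v";
    /// let jieba = Jieba::with_dict(&mut BufReader::new(dict.as_bytes())).unwrap();
    /// assert!(jieba.has_word("中出"));
    /// ```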
    pub fn with_dict<R: BufRead>(dict: &mut R) -> Result<Self, Error> {
        let mut instance = Self::empty();
        instance.load_dict(dict)?;
        Ok(instance)
    }

    /// Loads the default dictionary into the instance.
    ///
    /// This method reads the default dictionary from a predefined byte slice (`DEFAULT_DICT`)
    /// and loads it into the current instance using the `load_dict` method.
    ///
    /// # Arguments
    ///
    /// * `&mut self` - Mutable reference to the current instance.
    ///
    /// Requires `default-dict` feature to be enabled.
    ///
    /// # Examples
    ///
    /// ```
    /// use jieba_rs::Jieba;
    ///
    /// let mut instance = Jieba::empty();
    /// instance.load_default_dict(); // Loads the default dictionary into the instance
    /// assert!(instance.has_word("我们"), "The word '我们' should be in the dictionary after loading the default dictionary");
    /// ```
    #[cfg(feature = "default-dict")]
    pub fn load_default_dict(&mut self) {
        use std::io::BufReader;

        let mut default_dict = BufReader::new(DEFAULT_DICT.as_bytes());
        self.load_dict(&mut default_dict).unwrap();
    }

    /// Clears all data
    ///
    /// This method performs the following actions:
    /// 1. Clears the `records` list, removing all entries.
    /// 2. Resets `cedar` to a new instance of `Cedar`.
    /// 3. Sets `total` to 0, resetting the count.
    ///
    /// # Arguments
    ///
    /// * `&mut self` - Mutable reference to the current instance.
    ///
    /// # Examples
    ///
    /// ```
    /// use jieba_rs::Jieba;
    ///
    /// let mut instance = Jieba::new();
    /// assert!(instance.has_word("我们"), "The word '我们' should be in the dictionary after loading the default dictionary");
    /// instance.clear(); // clear all dict data
    /// assert!(!instance.has_word("我们"), "The word '我们' should not be in the dictionary after clearing the dictionary");
    /// ```
    pub fn clear(&mut self) {
        self.records.clear();
        self.cedar = Cedar::new();
        self.total = 0;
    }

    /// Add a word to the dict, returning its `freq`
    ///
    /// `freq`: if `None`, a frequency will be computed by [suggest_freq](#method.suggest_freq)
    ///
    /// `tag`: if `None`, defaults to `""`
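    ///
    /// # Examples
    ///
    /// A minimal sketch of registering a custom word on top of the default dict:
    ///
    /// ```
    /// use jieba_rs::Jieba;
    ///
    /// let mut jieba = Jieba::new();
    /// jieba.add_word("中出", Some(10000), Some("v"));
    /// assert!(jieba.has_word("中出"));
    /// ```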
    pub fn add_word(&mut self, word: &str, freq: Option<usize>, tag: Option<&str>) -> usize {
        if word.is_empty() {
            return 0;
        }
        let freq = freq.unwrap_or_else(|| self.suggest_freq(word));
        let tag = tag.unwrap_or("");

        match self.cedar.exact_match_search(word) {
            Some((word_id, _, _)) => {
                let old_freq = self.records[word_id as usize].freq;
                self.records[word_id as usize].freq = freq;

                self.total += freq;
                self.total -= old_freq;
            }
            None => {
                let word_id = self.records.len() as i32;
                self.records.push(Record::new(freq, String::from(tag)));

                self.cedar.update(word, word_id);
                self.total += freq;
            }
        };

        freq
    }

    /// Checks if a word exists in the dictionary.
    ///
    /// # Arguments
    ///
    /// * `word` - The word to check.
    ///
    /// # Returns
    ///
    /// * `bool` - Whether the word exists in the dictionary.
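    ///
    /// # Examples
    ///
    /// ```
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// assert!(jieba.has_word("中国"));
    /// assert!(!jieba.has_word("不存在的词"));
    /// ```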
    pub fn has_word(&self, word: &str) -> bool {
        self.cedar.exact_match_search(word).is_some()
    }

    /// Loads a dictionary by adding entries to the existing dictionary rather than resetting it.
    ///
    /// This function reads from a `BufRead` source, parsing each line as a dictionary entry. Each entry
    /// is expected to contain a word, its frequency, and optionally a tag.
    ///
    /// # Type Parameters
    ///
    /// * `R`: A type that implements the `BufRead` trait, used for reading lines from the dictionary.
    ///
    /// # Arguments
    ///
    /// * `dict` - A mutable reference to a `BufRead` source containing the dictionary entries.
    ///
    /// # Returns
    ///
    /// * `Result<(), Error>` - Returns `Ok(())` if the dictionary is successfully loaded; otherwise,
    ///   returns an error describing what went wrong.
    ///
    /// # Errors
    ///
    /// This function will return an error if:
    /// * There is an issue reading from the provided `BufRead` source.
    /// * A line in the dictionary file contains invalid frequency data (not a valid integer).
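    ///
    /// # Examples
    ///
    /// Entries are whitespace-separated: word, frequency, optional tag:
    ///
    /// ```
    /// use jieba_rs::Jieba;
    /// use std::io::BufReader;
    ///
    /// let mut jieba = Jieba::new();
    /// let userdict = "中出 10000 v";
    /// jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
    /// assert!(jieba.has_word("中出"));
    /// ```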
    pub fn load_dict<R: BufRead>(&mut self, dict: &mut R) -> Result<(), Error> {
        let mut buf = String::new();
        self.total = 0;

        let mut line_no = 0;
        while dict.read_line(&mut buf)? > 0 {
            {
                line_no += 1;
                let mut iter = buf.split_whitespace();
                if let Some(word) = iter.next() {
                    let freq = iter
                        .next()
                        .map(|x| {
                            x.parse::<usize>().map_err(|e| {
                                Error::InvalidDictEntry(format!(
                                    "line {} `{}` frequency {} is not a valid integer: {}",
                                    line_no, buf, x, e
                                ))
                            })
                        })
                        .unwrap_or(Ok(0))?;
                    let tag = iter.next().unwrap_or("");

                    match self.cedar.exact_match_search(word) {
                        Some((word_id, _, _)) => {
                            self.records[word_id as usize].freq = freq;
                        }
                        None => {
                            let word_id = self.records.len() as i32;
                            self.records.push(Record::new(freq, String::from(tag)));
                            self.cedar.update(word, word_id);
                        }
                    };
                }
            }
            buf.clear();
        }
        self.total = self.records.iter().map(|n| n.freq).sum();

        Ok(())
    }

    fn get_word_freq(&self, word: &str, default: usize) -> usize {
        match self.cedar.exact_match_search(word) {
            Some((word_id, _, _)) => self.records[word_id as usize].freq,
            _ => default,
        }
    }

    /// Suggest word frequency to force the characters in a word to be joined or split.
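    ///
    /// # Examples
    ///
    /// The expected value below comes from this crate's test suite and depends
    /// on the embedded dictionary:
    ///
    /// ```
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// // "中出" must be given a frequency of at least 348 to be kept as one word.
    /// assert_eq!(jieba.suggest_freq("中出"), 348);
    /// ```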
    pub fn suggest_freq(&self, segment: &str) -> usize {
        let logtotal = (self.total as f64).ln();
        let logfreq = self.cut(segment, false).iter().fold(0f64, |freq, word| {
            freq + (self.get_word_freq(word, 1) as f64).ln() - logtotal
        });
        std::cmp::max((logfreq + logtotal).exp() as usize + 1, self.get_word_freq(segment, 1))
    }

    /// Computes the most probable segmentation route: for each byte offset,
    /// `route[i]` holds the best log-probability and the byte end of the word
    /// chosen at that offset, computed right to left over the DAG.
    #[allow(clippy::ptr_arg)]
    fn calc(&self, sentence: &str, dag: &StaticSparseDAG, route: &mut Vec<(f64, usize)>) {
        let str_len = sentence.len();

        if str_len + 1 > route.len() {
            route.resize(str_len + 1, (0.0, 0));
        }

        let logtotal = (self.total as f64).ln();
        let mut prev_byte_start = str_len;
        let curr = sentence.char_indices().map(|x| x.0).rev();
        for byte_start in curr {
            let pair = dag
                .iter_edges(byte_start)
                .map(|byte_end| {
                    let wfrag = &sentence[byte_start..byte_end];

                    let freq = if let Some((word_id, _, _)) = self.cedar.exact_match_search(wfrag) {
                        self.records[word_id as usize].freq
                    } else {
                        1
                    };

                    ((freq as f64).ln() - logtotal + route[byte_end].0, byte_end)
                })
                .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Equal));

            if let Some(p) = pair {
                route[byte_start] = p;
            } else {
                let byte_end = prev_byte_start;
                let freq = 1;
                route[byte_start] = ((freq as f64).ln() - logtotal + route[byte_end].0, byte_end);
            }

            prev_byte_start = byte_start;
        }
    }

    /// Builds the word DAG of `sentence`: for every char offset, records the
    /// byte ends of all dictionary words starting at that offset.
    fn dag(&self, sentence: &str, dag: &mut StaticSparseDAG) {
        for (byte_start, _) in sentence.char_indices().peekable() {
            dag.start(byte_start);
            let haystack = &sentence[byte_start..];

            for (_, end_index) in self.cedar.common_prefix_iter(haystack) {
                dag.insert(end_index + byte_start + 1);
            }

            dag.commit();
        }
    }

    fn cut_all_internal<'a>(&self, sentence: &'a str, words: &mut Vec<&'a str>) {
        let str_len = sentence.len();
        let mut dag = StaticSparseDAG::with_size_hint(sentence.len());
        self.dag(sentence, &mut dag);

        let curr = sentence.char_indices().map(|x| x.0);
        for byte_start in curr {
            for byte_end in dag.iter_edges(byte_start) {
                let word = if byte_end == str_len {
                    &sentence[byte_start..]
                } else {
                    &sentence[byte_start..byte_end]
                };

                words.push(word)
            }
        }
    }

    fn cut_dag_no_hmm<'a>(
        &self,
        sentence: &'a str,
        words: &mut Vec<&'a str>,
        route: &mut Vec<(f64, usize)>,
        dag: &mut StaticSparseDAG,
    ) {
        self.dag(sentence, dag);
        self.calc(sentence, dag, route);
        let mut x = 0;
        let mut left: Option<usize> = None;

        while x < sentence.len() {
            let y = route[x].1;
            let l_str = &sentence[x..y];

            if l_str.chars().count() == 1 && l_str.chars().all(|ch| ch.is_ascii_alphanumeric()) {
                if left.is_none() {
                    left = Some(x);
                }
            } else {
                if let Some(byte_start) = left {
                    let word = &sentence[byte_start..x];
                    words.push(word);
                    left = None;
                }

                words.push(l_str);
            }
            x = y;
        }

        if let Some(byte_start) = left {
            let word = &sentence[byte_start..];
            words.push(word);
        }

        dag.clear();
        route.clear();
    }

    #[allow(non_snake_case, clippy::too_many_arguments)]
    fn cut_dag_hmm<'a>(
        &self,
        sentence: &'a str,
        words: &mut Vec<&'a str>,
        route: &mut Vec<(f64, usize)>,
        dag: &mut StaticSparseDAG,
        hmm_context: &mut hmm::HmmContext,
    ) {
        self.dag(sentence, dag);
        self.calc(sentence, dag, route);
        let mut x = 0;
        let mut left: Option<usize> = None;

        while x < sentence.len() {
            let y = route[x].1;

            if sentence[x..y].chars().count() == 1 {
                if left.is_none() {
                    left = Some(x);
                }
            } else {
                if let Some(byte_start) = left {
                    let byte_end = x;
                    let word = &sentence[byte_start..byte_end];
                    if word.chars().count() == 1 {
                        words.push(word);
                    } else if self.cedar.exact_match_search(word).is_none() {
                        hmm::cut_with_allocated_memory(word, words, hmm_context);
                    } else {
                        let mut word_indices = word.char_indices().map(|x| x.0).peekable();
                        while let Some(byte_start) = word_indices.next() {
                            if let Some(byte_end) = word_indices.peek() {
                                words.push(&word[byte_start..*byte_end]);
                            } else {
                                words.push(&word[byte_start..]);
                            }
                        }
                    }
                    left = None;
                }
                let word = &sentence[x..y];
                words.push(word);
            }
            x = y;
        }

        if let Some(byte_start) = left {
            let word = &sentence[byte_start..];

            if word.chars().count() == 1 {
                words.push(word);
            } else if self.cedar.exact_match_search(word).is_none() {
                hmm::cut(word, words);
            } else {
                let mut word_indices = word.char_indices().map(|x| x.0).peekable();
                while let Some(byte_start) = word_indices.next() {
                    if let Some(byte_end) = word_indices.peek() {
                        words.push(&word[byte_start..*byte_end]);
                    } else {
                        words.push(&word[byte_start..]);
                    }
                }
            }
        }

        dag.clear();
        route.clear();
    }

    #[allow(non_snake_case)]
    fn cut_internal<'a>(&self, sentence: &'a str, cut_all: bool, hmm: bool) -> Vec<&'a str> {
        let re_han = if cut_all { &RE_HAN_CUT_ALL } else { &RE_HAN_DEFAULT };
        let re_skip = if cut_all { &RE_SKIP_CUT_ALL } else { &RE_SKIP_DEFAULT };

        re_han.with(|re_han| {
            re_skip.with(|re_skip| {
                let heuristic_capacity = sentence.len() / 2;
                let mut words = Vec::with_capacity(heuristic_capacity);

                let splitter = SplitMatches::new(re_han, sentence);
                let mut route = Vec::with_capacity(heuristic_capacity);
                let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);

                for state in splitter {
                    match state {
                        SplitState::Matched(_) => {
                            let block = state.as_str();
                            assert!(!block.is_empty());

                            if cut_all {
                                self.cut_all_internal(block, &mut words);
                            } else if hmm {
                                HMM_CONTEXT.with(|ctx| {
                                    let mut hmm_context = ctx.borrow_mut();
                                    self.cut_dag_hmm(block, &mut words, &mut route, &mut dag, &mut hmm_context);
                                });
                            } else {
                                self.cut_dag_no_hmm(block, &mut words, &mut route, &mut dag);
                            }
                        }
                        SplitState::Unmatched(_) => {
                            let block = state.as_str();
                            assert!(!block.is_empty());

                            let skip_splitter = SplitMatches::new(re_skip, block);
                            for skip_state in skip_splitter {
                                let word = skip_state.as_str();
                                if word.is_empty() {
                                    continue;
                                }
                                if cut_all || skip_state.is_matched() {
                                    words.push(word);
                                } else {
                                    let mut word_indices = word.char_indices().map(|x| x.0).peekable();
                                    while let Some(byte_start) = word_indices.next() {
                                        if let Some(byte_end) = word_indices.peek() {
                                            words.push(&word[byte_start..*byte_end]);
                                        } else {
                                            words.push(&word[byte_start..]);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                words
            })
        })
    }

    /// Cut the input text
    ///
    /// ## Params
    ///
    /// `sentence`: input text
    ///
    /// `hmm`: enable HMM or not
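    ///
    /// # Examples
    ///
    /// ```
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// let words = jieba.cut("我来到北京清华大学", true);
    /// assert_eq!(words, vec!["我", "来到", "北京", "清华大学"]);
    /// ```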
    pub fn cut<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<&'a str> {
        self.cut_internal(sentence, false, hmm)
    }

    /// Cut the input text, return all possible words
    ///
    /// ## Params
    ///
    /// `sentence`: input text
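    ///
    /// # Examples
    ///
    /// Every dictionary word found in the DAG is emitted, overlaps included:
    ///
    /// ```
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// let words = jieba.cut_all("abc网球拍卖会def");
    /// assert_eq!(
    ///     words,
    ///     vec!["abc", "网", "网球", "网球拍", "球", "球拍", "拍", "拍卖", "拍卖会", "卖", "会", "def"]
    /// );
    /// ```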
    pub fn cut_all<'a>(&self, sentence: &'a str) -> Vec<&'a str> {
        self.cut_internal(sentence, true, false)
    }

    /// Cut the input text in search mode
    ///
    /// ## Params
    ///
    /// `sentence`: input text
    ///
    /// `hmm`: enable HMM or not
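    ///
    /// # Examples
    ///
    /// Besides the words of the default cut, all 2-gram and 3-gram dictionary
    /// hits inside longer words are emitted:
    ///
    /// ```
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// let words = jieba.cut_for_search("南京市长江大桥", true);
    /// assert_eq!(words, vec!["南京", "京市", "南京市", "长江", "大桥", "长江大桥"]);
    /// ```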
    pub fn cut_for_search<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<&'a str> {
        let words = self.cut(sentence, hmm);
        let mut new_words = Vec::with_capacity(words.len());
        for word in words {
            let char_indices: Vec<usize> = word.char_indices().map(|x| x.0).collect();
            let char_count = char_indices.len();
            if char_count > 2 {
                for i in 0..char_count - 1 {
                    let byte_start = char_indices[i];
                    let gram2 = if i + 2 < char_count {
                        &word[byte_start..char_indices[i + 2]]
                    } else {
                        &word[byte_start..]
                    };
                    if self.cedar.exact_match_search(gram2).is_some() {
                        new_words.push(gram2);
                    }
                }
            }
            if char_count > 3 {
                for i in 0..char_count - 2 {
                    let byte_start = char_indices[i];
                    let gram3 = if i + 3 < char_count {
                        &word[byte_start..char_indices[i + 3]]
                    } else {
                        &word[byte_start..]
                    };
                    if self.cedar.exact_match_search(gram3).is_some() {
                        new_words.push(gram3);
                    }
                }
            }
            new_words.push(word);
        }
        new_words
    }

    /// Tokenize
    ///
    /// ## Params
    ///
    /// `sentence`: input text
    ///
    /// `mode`: tokenize mode
    ///
    /// `hmm`: enable HMM or not
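    ///
    /// # Examples
    ///
    /// Token offsets are in Unicode characters, not bytes:
    ///
    /// ```
    /// use jieba_rs::{Jieba, Token, TokenizeMode};
    ///
    /// let jieba = Jieba::new();
    /// let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Default, false);
    /// assert_eq!(
    ///     tokens,
    ///     vec![
    ///         Token { word: "南京市", start: 0, end: 3 },
    ///         Token { word: "长江大桥", start: 3, end: 7 }
    ///     ]
    /// );
    /// ```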
    pub fn tokenize<'a>(&self, sentence: &'a str, mode: TokenizeMode, hmm: bool) -> Vec<Token<'a>> {
        let words = self.cut(sentence, hmm);
        let mut tokens = Vec::with_capacity(words.len());
        let mut start = 0;
        match mode {
            TokenizeMode::Default => {
                for word in words {
                    let width = word.chars().count();
                    tokens.push(Token {
                        word,
                        start,
                        end: start + width,
                    });
                    start += width;
                }
            }
            TokenizeMode::Search => {
                for word in words {
                    let width = word.chars().count();
                    if width > 2 {
                        let char_indices: Vec<usize> = word.char_indices().map(|x| x.0).collect();
                        for i in 0..width - 1 {
                            let byte_start = char_indices[i];
                            let gram2 = if i + 2 < width {
                                &word[byte_start..char_indices[i + 2]]
                            } else {
                                &word[byte_start..]
                            };
                            if self.cedar.exact_match_search(gram2).is_some() {
                                tokens.push(Token {
                                    word: gram2,
                                    start: start + i,
                                    end: start + i + 2,
                                });
                            }
                        }
                        if width > 3 {
                            for i in 0..width - 2 {
                                let byte_start = char_indices[i];
                                let gram3 = if i + 3 < width {
                                    &word[byte_start..char_indices[i + 3]]
                                } else {
                                    &word[byte_start..]
                                };
                                if self.cedar.exact_match_search(gram3).is_some() {
                                    tokens.push(Token {
                                        word: gram3,
                                        start: start + i,
                                        end: start + i + 3,
                                    });
                                }
                            }
                        }
                    }
                    tokens.push(Token {
                        word,
                        start,
                        end: start + width,
                    });
                    start += width;
                }
            }
        }
        tokens
    }

    /// Tag the input text
    ///
    /// ## Params
    ///
    /// `sentence`: input text
    ///
    /// `hmm`: enable HMM or not
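    ///
    /// # Examples
    ///
    /// Words found in the dictionary carry their dictionary tag; unknown words
    /// fall back to `"x"`, `"m"`, or `"eng"` based on their characters:
    ///
    /// ```
    /// use jieba_rs::{Jieba, Tag};
    ///
    /// let jieba = Jieba::new();
    /// let tags = jieba.tag("我是拖拉机学院手扶拖拉机专业的", true);
    /// assert_eq!(tags[0], Tag { word: "我", tag: "r" });
    /// assert_eq!(tags[2], Tag { word: "拖拉机", tag: "n" });
    /// ```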
    pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag<'a>> {
        let words = self.cut(sentence, hmm);
        words
            .into_iter()
            .map(|word| {
                if let Some((word_id, _, _)) = self.cedar.exact_match_search(word) {
                    let t = &self.records[word_id as usize].tag;
                    return Tag { word, tag: t };
                }
                let mut eng = 0;
                let mut m = 0;
                for chr in word.chars() {
                    if chr.is_ascii_alphanumeric() {
                        eng += 1;
                        if chr.is_ascii_digit() {
                            m += 1;
                        }
                    }
                }
                let tag = if eng == 0 {
                    "x"
                } else if eng == m {
                    "m"
                } else {
                    "eng"
                };
                Tag { word, tag }
            })
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use super::{Jieba, RE_HAN_DEFAULT, SplitMatches, SplitState, Tag, Token, TokenizeMode};
    use std::io::BufReader;

    #[test]
    fn test_init_with_default_dict() {
        let _ = Jieba::new();
    }

    #[test]
    fn test_has_word() {
        let jieba = Jieba::new();
        assert!(jieba.has_word("中国"));
        assert!(jieba.has_word("开源"));
        assert!(!jieba.has_word("不存在的词"));
    }

    #[test]
    fn test_split_matches() {
        RE_HAN_DEFAULT.with(|re_han| {
            let splitter = SplitMatches::new(
                re_han,
893                "👪 PS: 我觉得开源有一个好处,就是能够敦促自己不断改进 👪,避免敞帚自珍",
            );
            for state in splitter {
                match state {
                    SplitState::Matched(_) => {
                        let block = state.as_str();
                        assert!(!block.is_empty());
                    }
                    SplitState::Unmatched(_) => {
                        let block = state.as_str();
                        assert!(!block.is_empty());
                    }
                }
            }
        });
    }

    #[test]
    fn test_split_matches_against_unicode_sip() {
        RE_HAN_DEFAULT.with(|re_han| {
            let splitter = SplitMatches::new(re_han, "讥䶯䶰䶱䶲䶳䶴䶵𦡦");

            let result: Vec<&str> = splitter.map(|x| x.as_str()).collect();
            assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]);
        });
    }

    #[test]
    fn test_cut_all() {
        let jieba = Jieba::new();
        let words = jieba.cut_all("abc网球拍卖会def");
        assert_eq!(
            words,
            vec![
                "abc",
                "网",
                "网球",
                "网球拍",
                "球",
                "球拍",
                "拍",
                "拍卖",
                "拍卖会",
                "卖",
                "会",
                "def"
            ]
        );

        // The behavior of cut_all in the de-facto Python implementation is loosely defined,
        // and its answer "我, 来到, 北京, 清华, 清华大学, 华大, 大学" looks odd: it drops some
        // single-character words even though they are DAG candidates. For example, it
        // includes "华大" but not "清" or "学". We emit every candidate instead.
945        let words = jieba.cut_all("我来到北京清华大学");
946        assert_eq!(
947            words,
948            vec![
949                "我",
950                "来",
951                "来到",
952                "到",
953                "北",
954                "北京",
955                "京",
956                "清",
957                "清华",
958                "清华大学",
959                "华",
960                "华大",
961                "大",
962                "大学",
963                "学"
964            ]
965        );
966    }
967
968    #[test]
969    fn test_cut_no_hmm() {
970        let jieba = Jieba::new();
971        let words = jieba.cut("abc网球拍卖会def", false);
972        assert_eq!(words, vec!["abc", "网球", "拍卖会", "def"]);
973    }
974
975    #[test]
976    fn test_cut_no_hmm1() {
977        let jieba = Jieba::new();
978        let words = jieba.cut("abc网球拍卖会def!!?\r\n\t", false);
979        assert_eq!(
980            words,
981            vec!["abc", "网球", "拍卖会", "def", "!", "!", "?", "\r\n", "\t"]
982        );
983    }
984
985    #[test]
986    fn test_cut_with_hmm() {
987        let jieba = Jieba::new();
988        let words = jieba.cut("我们中出了一个叛徒", false);
989        assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]);
990        let words = jieba.cut("我们中出了一个叛徒", true);
991        assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒"]);
992        let words = jieba.cut("我们中出了一个叛徒👪", true);
993        assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒", "👪"]);
994
995        let words = jieba.cut("我来到北京清华大学", true);
996        assert_eq!(words, vec!["我", "来到", "北京", "清华大学"]);
997
998        let words = jieba.cut("他来到了网易杭研大厦", true);
999        assert_eq!(words, vec!["他", "来到", "了", "网易", "杭研", "大厦"]);
1000    }
1001
1002    #[test]
1003    fn test_cut_weicheng() {
1004        static WEICHENG_TXT: &str = include_str!("../examples/weicheng/src/weicheng.txt");
1005        let jieba = Jieba::new();
1006        for line in WEICHENG_TXT.split('\n') {
1007            let _ = jieba.cut(line, true);
1008        }
1009    }
1010
1011    #[test]
1012    fn test_cut_for_search() {
1013        let jieba = Jieba::new();
1014        let words = jieba.cut_for_search("南京市长江大桥", true);
1015        assert_eq!(words, vec!["南京", "京市", "南京市", "长江", "大桥", "长江大桥"]);
1016
1017        let words = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true);
1018
        // The Python implementation silently filters out ",", but we include it in the
        // output to let library users decide on their own filtering strategy.
        assert_eq!(
            words,
            vec![
                "小明",
                "硕士",
                "毕业",
                "于",
                "中国",
                "科学",
                "学院",
                "科学院",
                "中国科学院",
                "计算",
                "计算所",
                ",",
                "后",
                "在",
                "日本",
                "京都",
                "大学",
                "日本京都大学",
                "深造"
            ]
        );
    }

    #[test]
    fn test_tag() {
        let jieba = Jieba::new();
        let tags = jieba.tag(
            "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。",
            true,
        );
        assert_eq!(
            tags,
            vec![
                Tag { word: "我", tag: "r" },
                Tag { word: "是", tag: "v" },
                Tag {
                    word: "拖拉机",
                    tag: "n"
                },
                Tag {
                    word: "学院", tag: "n"
                },
                Tag {
                    word: "手扶拖拉机",
                    tag: "n"
                },
                Tag {
                    word: "专业", tag: "n"
                },
                Tag { word: "的", tag: "uj" },
                Tag { word: "。", tag: "x" },
                Tag {
                    word: "不用", tag: "v"
                },
                Tag {
                    word: "多久", tag: "m"
                },
                Tag { word: ",", tag: "x" },
                Tag { word: "我", tag: "r" },
                Tag { word: "就", tag: "d" },
                Tag { word: "会", tag: "v" },
                Tag {
                    word: "升职", tag: "v"
                },
                Tag {
                    word: "加薪",
                    tag: "nr"
                },
                Tag { word: ",", tag: "x" },
                Tag {
                    word: "当上", tag: "t"
                },
                Tag {
                    word: "CEO",
                    tag: "eng"
                },
                Tag { word: ",", tag: "x" },
                Tag {
                    word: "走上", tag: "v"
                },
                Tag {
                    word: "人生", tag: "n"
                },
                Tag {
                    word: "巅峰", tag: "n"
                },
                Tag { word: "。", tag: "x" }
            ]
        );

        let tags = jieba.tag("今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。", true);
        assert_eq!(
            tags,
            vec![
                Tag {
                    word: "今天", tag: "t"
                },
                Tag {
                    word: "纽约",
                    tag: "ns"
                },
                Tag { word: "的", tag: "uj" },
                Tag {
                    word: "天气", tag: "n"
                },
                Tag {
                    word: "真好", tag: "d"
                },
                Tag { word: "啊", tag: "zg" },
                Tag { word: ",", tag: "x" },
                Tag {
                    word: "京华",
                    tag: "nz"
                },
                Tag {
                    word: "大酒店",
                    tag: "n"
                },
                Tag { word: "的", tag: "uj" },
                Tag {
                    word: "张尧", tag: "x"
                }, // XXX: missing in dict
                Tag {
                    word: "经理", tag: "n"
                },
                Tag { word: "吃", tag: "v" },
                Tag { word: "了", tag: "ul" },
                Tag {
                    word: "一只", tag: "m"
                },
                Tag {
                    word: "北京烤鸭",
                    tag: "n"
                },
                Tag { word: "。", tag: "x" }
            ]
        );
    }

    #[test]
    fn test_tokenize() {
        let jieba = Jieba::new();
        let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "南京市",
                    start: 0,
                    end: 3
                },
                Token {
                    word: "长江大桥",
                    start: 3,
                    end: 7
                }
            ]
        );

        let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Search, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "南京",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "京市",
                    start: 1,
                    end: 3
                },
                Token {
                    word: "南京市",
                    start: 0,
                    end: 3
                },
                Token {
                    word: "长江",
                    start: 3,
                    end: 5
                },
                Token {
                    word: "大桥",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "长江大桥",
                    start: 3,
                    end: 7
                }
            ]
        );

        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "中",
                    start: 2,
                    end: 3
                },
                Token {
                    word: "出",
                    start: 3,
                    end: 4
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9
                }
            ]
        );
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "中出",
                    start: 2,
                    end: 4
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9
                }
            ]
        );

        let tokens = jieba.tokenize("永和服装饰品有限公司", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "永和",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "服装",
                    start: 2,
                    end: 4
                },
                Token {
                    word: "饰品",
                    start: 4,
                    end: 6
                },
                Token {
                    word: "有限公司",
                    start: 6,
                    end: 10
                }
            ]
        );
    }

    #[test]
    fn test_userdict() {
        let mut jieba = Jieba::new();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "中",
                    start: 2,
                    end: 3
                },
                Token {
                    word: "出",
                    start: 3,
                    end: 4
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9
                }
            ]
        );
        let userdict = "中出 10000";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "中出",
                    start: 2,
                    end: 4
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9
                }
            ]
        );
    }

    #[test]
    fn test_userdict_hmm() {
        let mut jieba = Jieba::new();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "中出",
                    start: 2,
                    end: 4
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9
                }
            ]
        );
        let userdict = "出了 10000";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "中",
                    start: 2,
                    end: 3
                },
                Token {
                    word: "出了",
                    start: 3,
                    end: 5
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9
                }
            ]
        );
    }

    #[test]
    fn test_userdict_error() {
        let mut jieba = Jieba::empty();
        let userdict = "出了 not_a_int";
        let ret = jieba.load_dict(&mut BufReader::new(userdict.as_bytes()));
        assert!(ret.is_err());
    }

    #[test]
    fn test_suggest_freq() {
        // NOTE: The following behaviors are aligned with the original Jieba

        let mut jieba = Jieba::new();
        // These values were calculated by the original Jieba
        assert_eq!(jieba.suggest_freq("中出"), 348);
        assert_eq!(jieba.suggest_freq("出了"), 1263);

        // The freq of "中出" in dict.txt was 3; it becomes 300 after loading the user dict
        let userdict = "中出 300";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        // But 300 is still less than the calculated freq of 348
        assert_eq!(jieba.suggest_freq("中出"), 348);

        let userdict = "中出 500";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        // Now it is significant enough
        assert_eq!(jieba.suggest_freq("中出"), 500);
    }

    #[test]
    fn test_custom_lower_freq() {
        let mut jieba = Jieba::new();

        jieba.add_word("测试", Some(2445), None);
        jieba.add_word("测试", Some(10), None);
        let words = jieba.cut("测试", false);
        assert_eq!(words, vec!["测试"]);
    }

    #[test]
    fn test_cut_dag_no_hmm_against_string_with_sip() {
        let mut jieba = Jieba::empty();

        // add fake words into the dictionary
1503        jieba.add_word("䶴䶵𦡦", Some(1000), None);
1504        jieba.add_word("讥䶯䶰䶱䶲䶳", Some(1000), None);
1505
1506        let words = jieba.cut("讥䶯䶰䶱䶲䶳䶴䶵𦡦", false);
1507        assert_eq!(words, vec!["讥䶯䶰䶱䶲䶳", "䶴䶵𦡦"]);
1508    }
1509
1510    #[test]
    fn test_add_custom_word_with_underscore() {
        let mut jieba = Jieba::empty();
        jieba.add_word("田-女士", Some(42), Some("n"));
        let words = jieba.cut("市民田-女士急匆匆", false);
        assert_eq!(words, vec!["市", "民", "田-女士", "急", "匆", "匆"]);
    }
}