use include_flate::flate;

use std::cmp::Ordering;
use std::collections::HashMap;
use std::io::BufRead;

use cedarwood::Cedar;
use regex::{Match, Matches, Regex};

pub(crate) type FxHashMap<K, V> = HashMap<K, V, rustc_hash::FxBuildHasher>;

pub use crate::errors::Error;
#[cfg(feature = "textrank")]
pub use crate::keywords::textrank::TextRank;
#[cfg(feature = "tfidf")]
pub use crate::keywords::tfidf::TfIdf;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
pub use crate::keywords::{DEFAULT_STOP_WORDS, Keyword, KeywordExtract, KeywordExtractConfig};

mod errors;
mod hmm;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
mod keywords;
mod sparse_dag;

#[cfg(feature = "default-dict")]
flate!(static DEFAULT_DICT: str from "src/data/dict.txt");

use sparse_dag::StaticSparseDAG;

thread_local! {
    static RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%\-]+)").unwrap();
    static RE_SKIP_DEFAULT: Regex = Regex::new(r"(\r\n|\s)").unwrap();
    static RE_HAN_CUT_ALL: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}]+)").unwrap();
    static RE_SKIP_CUT_ALL: Regex = Regex::new(r"[^a-zA-Z0-9+#\n]").unwrap();
    static HMM_CONTEXT: std::cell::RefCell<hmm::HmmContext> = std::cell::RefCell::new(hmm::HmmContext::default());
}

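/// Splits text by the matches of a regex, yielding both the matched and the
/// unmatched segments in their original order.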
struct SplitMatches<'r, 't> {
    finder: Matches<'r, 't>,
    text: &'t str,
    last: usize,
    matched: Option<Match<'t>>,
}

impl<'r, 't> SplitMatches<'r, 't> {
    #[inline]
    fn new(re: &'r Regex, text: &'t str) -> SplitMatches<'r, 't> {
        SplitMatches {
            finder: re.find_iter(text),
            text,
            last: 0,
            matched: None,
        }
    }
}

#[derive(Debug)]
pub(crate) enum SplitState<'t> {
    Unmatched(&'t str),
    Matched(Match<'t>),
}

impl<'t> SplitState<'t> {
    #[inline]
    fn as_str(&self) -> &'t str {
        match self {
            SplitState::Unmatched(t) => t,
            SplitState::Matched(matched) => matched.as_str(),
        }
    }

    #[inline]
    pub fn is_matched(&self) -> bool {
        matches!(self, SplitState::Matched(_))
    }
}

impl<'t> Iterator for SplitMatches<'_, 't> {
    type Item = SplitState<'t>;

    fn next(&mut self) -> Option<SplitState<'t>> {
        if let Some(matched) = self.matched.take() {
            return Some(SplitState::Matched(matched));
        }
        match self.finder.next() {
            None => {
                if self.last >= self.text.len() {
                    None
                } else {
                    let s = &self.text[self.last..];
                    self.last = self.text.len();
                    Some(SplitState::Unmatched(s))
                }
            }
            Some(m) => {
                if self.last == m.start() {
                    self.last = m.end();
                    Some(SplitState::Matched(m))
                } else {
                    let unmatched = &self.text[self.last..m.start()];
                    self.last = m.end();
                    self.matched = Some(m);
                    Some(SplitState::Unmatched(unmatched))
                }
            }
        }
    }
}

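/// Tokenization mode: `Default` emits each word once, while `Search` also
/// emits the overlapping 2- and 3-character sub-words found in the dictionary.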
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizeMode {
    Default,
    Search,
}

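/// A token produced by [`Jieba::tokenize`]. `start` and `end` are offsets
/// measured in chars, not bytes.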
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Token<'a> {
    pub word: &'a str,
    pub start: usize,
    pub end: usize,
}

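/// A word paired with its part-of-speech tag, as returned by [`Jieba::tag`].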
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Tag<'a> {
    pub word: &'a str,
    pub tag: &'a str,
}

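/// A dictionary entry: the word frequency and its part-of-speech tag.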
#[derive(Debug, Clone)]
struct Record {
    freq: usize,
    tag: String,
}

impl Record {
    #[inline(always)]
    fn new(freq: usize, tag: String) -> Self {
        Self { freq, tag }
    }
}

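/// The Jieba segmenter: dictionary records backed by a cedar trie, plus the
/// total frequency used for probability estimates.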
#[derive(Debug, Clone)]
pub struct Jieba {
    records: Vec<Record>,
    cedar: Cedar,
    total: usize,
}

#[cfg(feature = "default-dict")]
impl Default for Jieba {
    fn default() -> Self {
        Jieba::new()
    }
}

impl Jieba {
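    /// Create a new instance with an empty dictionary.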
    pub fn empty() -> Self {
        Jieba {
            records: Vec::new(),
            cedar: Cedar::new(),
            total: 0,
        }
    }

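    /// Create a new instance with the embedded default dictionary.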
    #[cfg(feature = "default-dict")]
    pub fn new() -> Self {
        let mut instance = Self::empty();
        instance.load_default_dict();
        instance
    }

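    /// Create a new instance from a dictionary reader. Returns an error if
    /// the dictionary contains an invalid entry.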
    pub fn with_dict<R: BufRead>(dict: &mut R) -> Result<Self, Error> {
        let mut instance = Self::empty();
        instance.load_dict(dict)?;
        Ok(instance)
    }

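    /// Load the embedded default dictionary into this instance.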
    #[cfg(feature = "default-dict")]
    pub fn load_default_dict(&mut self) {
        use std::io::BufReader;

        let mut default_dict = BufReader::new(DEFAULT_DICT.as_bytes());
        self.load_dict(&mut default_dict).unwrap();
    }

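    /// Reset the instance to an empty dictionary.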
    pub fn clear(&mut self) {
        self.records.clear();
        self.cedar = Cedar::new();
        self.total = 0;
    }

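    /// Add a word to the dictionary. If `freq` is `None`, a frequency high
    /// enough for the word to survive segmentation is chosen via
    /// [`Jieba::suggest_freq`]. Returns the frequency that was recorded.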
    pub fn add_word(&mut self, word: &str, freq: Option<usize>, tag: Option<&str>) -> usize {
        if word.is_empty() {
            return 0;
        }
        let freq = freq.unwrap_or_else(|| self.suggest_freq(word));
        let tag = tag.unwrap_or("");

        match self.cedar.exact_match_search(word) {
            Some((word_id, _, _)) => {
                let old_freq = self.records[word_id as usize].freq;
                self.records[word_id as usize].freq = freq;

                self.total += freq;
                self.total -= old_freq;
            }
            None => {
                let word_id = self.records.len() as i32;
                self.records.push(Record::new(freq, String::from(tag)));

                self.cedar.update(word, word_id);
                self.total += freq;
            }
        };

        freq
    }

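    /// Return `true` if `word` exists in the dictionary.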
    pub fn has_word(&self, word: &str) -> bool {
        self.cedar.exact_match_search(word).is_some()
    }

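    /// Load entries from a dictionary in the jieba text format: one entry per
    /// line, `word [frequency [tag]]`, separated by whitespace. Entries for
    /// words already in the dictionary overwrite the existing frequency.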
    pub fn load_dict<R: BufRead>(&mut self, dict: &mut R) -> Result<(), Error> {
        let mut buf = String::new();
        self.total = 0;

        let mut line_no = 0;
        while dict.read_line(&mut buf)? > 0 {
            {
                line_no += 1;
                let mut iter = buf.split_whitespace();
                if let Some(word) = iter.next() {
                    let freq = iter
                        .next()
                        .map(|x| {
                            x.parse::<usize>().map_err(|e| {
                                Error::InvalidDictEntry(format!(
                                    "line {} `{}` frequency {} is not a valid integer: {}",
                                    line_no, buf, x, e
                                ))
                            })
                        })
                        .unwrap_or(Ok(0))?;
                    let tag = iter.next().unwrap_or("");

                    match self.cedar.exact_match_search(word) {
                        Some((word_id, _, _)) => {
                            self.records[word_id as usize].freq = freq;
                        }
                        None => {
                            let word_id = self.records.len() as i32;
                            self.records.push(Record::new(freq, String::from(tag)));
                            self.cedar.update(word, word_id);
                        }
                    };
                }
            }
            buf.clear();
        }
        self.total = self.records.iter().map(|n| n.freq).sum();

        Ok(())
    }

    fn get_word_freq(&self, word: &str, default: usize) -> usize {
        match self.cedar.exact_match_search(word) {
            Some((word_id, _, _)) => self.records[word_id as usize].freq,
            _ => default,
        }
    }

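    /// Suggest the frequency that `segment` needs for it to be cut as a
    /// single word.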
    pub fn suggest_freq(&self, segment: &str) -> usize {
        let logtotal = (self.total as f64).ln();
        let logfreq = self.cut(segment, false).iter().fold(0f64, |freq, word| {
            freq + (self.get_word_freq(word, 1) as f64).ln() - logtotal
        });
        std::cmp::max((logfreq + logtotal).exp() as usize + 1, self.get_word_freq(segment, 1))
    }

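    // Dynamic programming over the word DAG, scanning char boundaries in
    // reverse: route[i] holds the best (log probability, next boundary) pair
    // for the suffix starting at byte offset i, scoring each candidate word
    // by ln(freq) - ln(total).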
    #[allow(clippy::ptr_arg)]
    fn calc(&self, sentence: &str, dag: &StaticSparseDAG, route: &mut Vec<(f64, usize)>) {
        let str_len = sentence.len();

        if str_len + 1 > route.len() {
            route.resize(str_len + 1, (0.0, 0));
        }

        let logtotal = (self.total as f64).ln();
        let mut prev_byte_start = str_len;
        let curr = sentence.char_indices().map(|x| x.0).rev();
        for byte_start in curr {
            let pair = dag
                .iter_edges(byte_start)
                .map(|byte_end| {
                    let wfrag = &sentence[byte_start..byte_end];

                    let freq = if let Some((word_id, _, _)) = self.cedar.exact_match_search(wfrag) {
                        self.records[word_id as usize].freq
                    } else {
                        1
                    };

                    ((freq as f64).ln() - logtotal + route[byte_end].0, byte_end)
                })
                .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Equal));

            if let Some(p) = pair {
                route[byte_start] = p;
            } else {
                let byte_end = prev_byte_start;
                let freq = 1;
                route[byte_start] = ((freq as f64).ln() - logtotal + route[byte_end].0, byte_end);
            }

            prev_byte_start = byte_start;
        }
    }

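    // Build the word DAG: for every char start, record the end offset of each
    // dictionary word that is a prefix of the remaining text.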
    fn dag(&self, sentence: &str, dag: &mut StaticSparseDAG) {
        for (byte_start, _) in sentence.char_indices() {
            dag.start(byte_start);
            let haystack = &sentence[byte_start..];

            for (_, end_index) in self.cedar.common_prefix_iter(haystack) {
                dag.insert(end_index + byte_start + 1);
            }

            dag.commit();
        }
    }

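    // Full mode: emit every dictionary word found in the DAG, overlaps included.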
    fn cut_all_internal<'a>(&self, sentence: &'a str, words: &mut Vec<&'a str>) {
        let str_len = sentence.len();
        let mut dag = StaticSparseDAG::with_size_hint(sentence.len());
        self.dag(sentence, &mut dag);

        let curr = sentence.char_indices().map(|x| x.0);
        for byte_start in curr {
            for byte_end in dag.iter_edges(byte_start) {
                let word = if byte_end == str_len {
                    &sentence[byte_start..]
                } else {
                    &sentence[byte_start..byte_end]
                };

                words.push(word)
            }
        }
    }

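    // Cut one block along the best DAG route without the HMM; consecutive
    // single ASCII alphanumeric chars are merged back into one word.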
    fn cut_dag_no_hmm<'a>(
        &self,
        sentence: &'a str,
        words: &mut Vec<&'a str>,
        route: &mut Vec<(f64, usize)>,
        dag: &mut StaticSparseDAG,
    ) {
        self.dag(sentence, dag);
        self.calc(sentence, dag, route);
        let mut x = 0;
        let mut left: Option<usize> = None;

        while x < sentence.len() {
            let y = route[x].1;
            let l_str = &sentence[x..y];

            if l_str.chars().count() == 1 && l_str.chars().all(|ch| ch.is_ascii_alphanumeric()) {
                if left.is_none() {
                    left = Some(x);
                }
            } else {
                if let Some(byte_start) = left {
                    let word = &sentence[byte_start..x];
                    words.push(word);
                    left = None;
                }

                words.push(l_str);
            }
            x = y;
        }

        if let Some(byte_start) = left {
            let word = &sentence[byte_start..];
            words.push(word);
        }

        dag.clear();
        route.clear();
    }

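    // Cut one block along the best DAG route; runs of single chars are handed
    // to the HMM to recover words that are missing from the dictionary.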
    #[allow(non_snake_case, clippy::too_many_arguments)]
    fn cut_dag_hmm<'a>(
        &self,
        sentence: &'a str,
        words: &mut Vec<&'a str>,
        route: &mut Vec<(f64, usize)>,
        dag: &mut StaticSparseDAG,
        hmm_context: &mut hmm::HmmContext,
    ) {
        self.dag(sentence, dag);
        self.calc(sentence, dag, route);
        let mut x = 0;
        let mut left: Option<usize> = None;

        while x < sentence.len() {
            let y = route[x].1;

            if sentence[x..y].chars().count() == 1 {
                if left.is_none() {
                    left = Some(x);
                }
            } else {
                if let Some(byte_start) = left {
                    let byte_end = x;
                    let word = &sentence[byte_start..byte_end];
                    if word.chars().count() == 1 {
                        words.push(word);
                    } else if self.cedar.exact_match_search(word).is_none() {
                        hmm::cut_with_allocated_memory(word, words, hmm_context);
                    } else {
                        let mut word_indices = word.char_indices().map(|x| x.0).peekable();
                        while let Some(byte_start) = word_indices.next() {
                            if let Some(byte_end) = word_indices.peek() {
                                words.push(&word[byte_start..*byte_end]);
                            } else {
                                words.push(&word[byte_start..]);
                            }
                        }
                    }
                    left = None;
                }
                let word = &sentence[x..y];
                words.push(word);
            }
            x = y;
        }

        if let Some(byte_start) = left {
            let word = &sentence[byte_start..];

            if word.chars().count() == 1 {
                words.push(word);
            } else if self.cedar.exact_match_search(word).is_none() {
                hmm::cut(word, words);
            } else {
                let mut word_indices = word.char_indices().map(|x| x.0).peekable();
                while let Some(byte_start) = word_indices.next() {
                    if let Some(byte_end) = word_indices.peek() {
                        words.push(&word[byte_start..*byte_end]);
                    } else {
                        words.push(&word[byte_start..]);
                    }
                }
            }
        }

        dag.clear();
        route.clear();
    }

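    // Split the input into Han and non-Han blocks, then cut each Han block
    // with the requested strategy and each non-Han block on skip patterns.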
    #[allow(non_snake_case)]
    fn cut_internal<'a>(&self, sentence: &'a str, cut_all: bool, hmm: bool) -> Vec<&'a str> {
        let re_han = if cut_all { &RE_HAN_CUT_ALL } else { &RE_HAN_DEFAULT };
        let re_skip = if cut_all { &RE_SKIP_CUT_ALL } else { &RE_SKIP_DEFAULT };

        re_han.with(|re_han| {
            re_skip.with(|re_skip| {
                let heuristic_capacity = sentence.len() / 2;
                let mut words = Vec::with_capacity(heuristic_capacity);

                let splitter = SplitMatches::new(re_han, sentence);
                let mut route = Vec::with_capacity(heuristic_capacity);
                let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);

                for state in splitter {
                    match state {
                        SplitState::Matched(_) => {
                            let block = state.as_str();
                            assert!(!block.is_empty());

                            if cut_all {
                                self.cut_all_internal(block, &mut words);
                            } else if hmm {
                                HMM_CONTEXT.with(|ctx| {
                                    let mut hmm_context = ctx.borrow_mut();
                                    self.cut_dag_hmm(block, &mut words, &mut route, &mut dag, &mut hmm_context);
                                });
                            } else {
                                self.cut_dag_no_hmm(block, &mut words, &mut route, &mut dag);
                            }
                        }
                        SplitState::Unmatched(_) => {
                            let block = state.as_str();
                            assert!(!block.is_empty());

                            let skip_splitter = SplitMatches::new(re_skip, block);
                            for skip_state in skip_splitter {
                                let word = skip_state.as_str();
                                if word.is_empty() {
                                    continue;
                                }
                                if cut_all || skip_state.is_matched() {
                                    words.push(word);
                                } else {
                                    let mut word_indices = word.char_indices().map(|x| x.0).peekable();
                                    while let Some(byte_start) = word_indices.next() {
                                        if let Some(byte_end) = word_indices.peek() {
                                            words.push(&word[byte_start..*byte_end]);
                                        } else {
                                            words.push(&word[byte_start..]);
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                words
            })
        })
    }

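    /// Cut a sentence into words. When `hmm` is `true`, words missing from
    /// the dictionary are recognized with the HMM model.
    ///
    /// ```rust
    /// // Doc-test sketch; assumes the crate is published as `jieba_rs` and
    /// // that the `default-dict` feature is enabled (expected output taken
    /// // from this crate's own tests).
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// assert_eq!(jieba.cut("我们中出了一个叛徒", false), vec!["我们", "中", "出", "了", "一个", "叛徒"]);
    /// assert_eq!(jieba.cut("我们中出了一个叛徒", true), vec!["我们", "中出", "了", "一个", "叛徒"]);
    /// ```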
    pub fn cut<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<&'a str> {
        self.cut_internal(sentence, false, hmm)
    }

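    /// Cut a sentence in full mode, returning every dictionary word found,
    /// overlaps included.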
    pub fn cut_all<'a>(&self, sentence: &'a str) -> Vec<&'a str> {
        self.cut_internal(sentence, true, false)
    }

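    /// Cut a sentence for search-engine indexing: in addition to the words
    /// from [`Jieba::cut`], emits the 2- and 3-char sub-words that exist in
    /// the dictionary.
    ///
    /// ```rust
    /// // Doc-test sketch; assumes the crate is published as `jieba_rs` and
    /// // that the `default-dict` feature is enabled (expected output taken
    /// // from this crate's own tests).
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// assert_eq!(
    ///     jieba.cut_for_search("南京市长江大桥", true),
    ///     vec!["南京", "京市", "南京市", "长江", "大桥", "长江大桥"]
    /// );
    /// ```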
    pub fn cut_for_search<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<&'a str> {
        let words = self.cut(sentence, hmm);
        let mut new_words = Vec::with_capacity(words.len());
        for word in words {
            let char_indices: Vec<usize> = word.char_indices().map(|x| x.0).collect();
            let char_count = char_indices.len();
            if char_count > 2 {
                for i in 0..char_count - 1 {
                    let byte_start = char_indices[i];
                    let gram2 = if i + 2 < char_count {
                        &word[byte_start..char_indices[i + 2]]
                    } else {
                        &word[byte_start..]
                    };
                    if self.cedar.exact_match_search(gram2).is_some() {
                        new_words.push(gram2);
                    }
                }
            }
            if char_count > 3 {
                for i in 0..char_count - 2 {
                    let byte_start = char_indices[i];
                    let gram3 = if i + 3 < char_count {
                        &word[byte_start..char_indices[i + 3]]
                    } else {
                        &word[byte_start..]
                    };
                    if self.cedar.exact_match_search(gram3).is_some() {
                        new_words.push(gram3);
                    }
                }
            }
            new_words.push(word);
        }
        new_words
    }

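    /// Tokenize a sentence. The `start` and `end` offsets of each [`Token`]
    /// are measured in chars, not bytes.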
    pub fn tokenize<'a>(&self, sentence: &'a str, mode: TokenizeMode, hmm: bool) -> Vec<Token<'a>> {
        let words = self.cut(sentence, hmm);
        let mut tokens = Vec::with_capacity(words.len());
        let mut start = 0;
        match mode {
            TokenizeMode::Default => {
                for word in words {
                    let width = word.chars().count();
                    tokens.push(Token {
                        word,
                        start,
                        end: start + width,
                    });
                    start += width;
                }
            }
            TokenizeMode::Search => {
                for word in words {
                    let width = word.chars().count();
                    if width > 2 {
                        let char_indices: Vec<usize> = word.char_indices().map(|x| x.0).collect();
                        for i in 0..width - 1 {
                            let byte_start = char_indices[i];
                            let gram2 = if i + 2 < width {
                                &word[byte_start..char_indices[i + 2]]
                            } else {
                                &word[byte_start..]
                            };
                            if self.cedar.exact_match_search(gram2).is_some() {
                                tokens.push(Token {
                                    word: gram2,
                                    start: start + i,
                                    end: start + i + 2,
                                });
                            }
                        }
                        if width > 3 {
                            for i in 0..width - 2 {
                                let byte_start = char_indices[i];
                                let gram3 = if i + 3 < width {
                                    &word[byte_start..char_indices[i + 3]]
                                } else {
                                    &word[byte_start..]
                                };
                                if self.cedar.exact_match_search(gram3).is_some() {
                                    tokens.push(Token {
                                        word: gram3,
                                        start: start + i,
                                        end: start + i + 3,
                                    });
                                }
                            }
                        }
                    }
                    tokens.push(Token {
                        word,
                        start,
                        end: start + width,
                    });
                    start += width;
                }
            }
        }
        tokens
    }

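    /// Cut a sentence and attach a part-of-speech tag to each word. Words not
    /// in the dictionary fall back to `"x"` (no ASCII alphanumerics), `"m"`
    /// (all ASCII alphanumerics are digits), or `"eng"` (otherwise).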
    pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag<'a>> {
        let words = self.cut(sentence, hmm);
        words
            .into_iter()
            .map(|word| {
                if let Some((word_id, _, _)) = self.cedar.exact_match_search(word) {
                    let t = &self.records[word_id as usize].tag;
                    return Tag { word, tag: t };
                }
                let mut eng = 0;
                let mut m = 0;
                for chr in word.chars() {
                    if chr.is_ascii_alphanumeric() {
                        eng += 1;
                        if chr.is_ascii_digit() {
                            m += 1;
                        }
                    }
                }
                let tag = if eng == 0 {
                    "x"
                } else if eng == m {
                    "m"
                } else {
                    "eng"
                };
                Tag { word, tag }
            })
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use super::{Jieba, RE_HAN_DEFAULT, SplitMatches, SplitState, Tag, Token, TokenizeMode};
    use std::io::BufReader;

    #[test]
    fn test_init_with_default_dict() {
        let _ = Jieba::new();
    }

    #[test]
    fn test_has_word() {
        let jieba = Jieba::new();
        assert!(jieba.has_word("中国"));
        assert!(jieba.has_word("开源"));
        assert!(!jieba.has_word("不存在的词"));
    }

    #[test]
    fn test_split_matches() {
        RE_HAN_DEFAULT.with(|re_han| {
            let splitter = SplitMatches::new(
                re_han,
                "👪 PS: 我觉得开源有一个好处,就是能够敦促自己不断改进 👪,避免敞帚自珍",
            );
            for state in splitter {
                match state {
                    SplitState::Matched(_) => {
                        let block = state.as_str();
                        assert!(!block.is_empty());
                    }
                    SplitState::Unmatched(_) => {
                        let block = state.as_str();
                        assert!(!block.is_empty());
                    }
                }
            }
        });
    }

    #[test]
    fn test_split_matches_against_unicode_sip() {
        RE_HAN_DEFAULT.with(|re_han| {
            let splitter = SplitMatches::new(re_han, "讥䶯䶰䶱䶲䶳䶴䶵𦡦");

            let result: Vec<&str> = splitter.map(|x| x.as_str()).collect();
            assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]);
        });
    }

    #[test]
    fn test_cut_all() {
        let jieba = Jieba::new();
        let words = jieba.cut_all("abc网球拍卖会def");
        assert_eq!(
            words,
            vec![
                "abc",
                "网",
                "网球",
                "网球拍",
                "球",
                "球拍",
                "拍",
                "拍卖",
                "拍卖会",
                "卖",
                "会",
                "def"
            ]
        );

        let words = jieba.cut_all("我来到北京清华大学");
        assert_eq!(
            words,
            vec![
                "我",
                "来",
                "来到",
                "到",
                "北",
                "北京",
                "京",
                "清",
                "清华",
                "清华大学",
                "华",
                "华大",
                "大",
                "大学",
                "学"
            ]
        );
    }

    #[test]
    fn test_cut_no_hmm() {
        let jieba = Jieba::new();
        let words = jieba.cut("abc网球拍卖会def", false);
        assert_eq!(words, vec!["abc", "网球", "拍卖会", "def"]);
    }

    #[test]
    fn test_cut_no_hmm1() {
        let jieba = Jieba::new();
        let words = jieba.cut("abc网球拍卖会def!!?\r\n\t", false);
        assert_eq!(
            words,
            vec!["abc", "网球", "拍卖会", "def", "!", "!", "?", "\r\n", "\t"]
        );
    }

    #[test]
    fn test_cut_with_hmm() {
        let jieba = Jieba::new();
        let words = jieba.cut("我们中出了一个叛徒", false);
        assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]);
        let words = jieba.cut("我们中出了一个叛徒", true);
        assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒"]);
        let words = jieba.cut("我们中出了一个叛徒👪", true);
        assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒", "👪"]);

        let words = jieba.cut("我来到北京清华大学", true);
        assert_eq!(words, vec!["我", "来到", "北京", "清华大学"]);

        let words = jieba.cut("他来到了网易杭研大厦", true);
        assert_eq!(words, vec!["他", "来到", "了", "网易", "杭研", "大厦"]);
    }

    #[test]
    fn test_cut_weicheng() {
        static WEICHENG_TXT: &str = include_str!("../examples/weicheng/src/weicheng.txt");
        let jieba = Jieba::new();
        for line in WEICHENG_TXT.split('\n') {
            let _ = jieba.cut(line, true);
        }
    }

    #[test]
    fn test_cut_for_search() {
        let jieba = Jieba::new();
        let words = jieba.cut_for_search("南京市长江大桥", true);
        assert_eq!(words, vec!["南京", "京市", "南京市", "长江", "大桥", "长江大桥"]);

        let words = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true);

        assert_eq!(
            words,
            vec![
                "小明",
                "硕士",
                "毕业",
                "于",
                "中国",
                "科学",
                "学院",
                "科学院",
                "中国科学院",
                "计算",
                "计算所",
                ",",
                "后",
                "在",
                "日本",
                "京都",
                "大学",
                "日本京都大学",
                "深造"
            ]
        );
    }

    #[test]
    fn test_tag() {
        let jieba = Jieba::new();
        let tags = jieba.tag(
            "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。",
            true,
        );
        assert_eq!(
            tags,
            vec![
                Tag { word: "我", tag: "r" },
                Tag { word: "是", tag: "v" },
                Tag { word: "拖拉机", tag: "n" },
                Tag { word: "学院", tag: "n" },
                Tag { word: "手扶拖拉机", tag: "n" },
                Tag { word: "专业", tag: "n" },
                Tag { word: "的", tag: "uj" },
                Tag { word: "。", tag: "x" },
                Tag { word: "不用", tag: "v" },
                Tag { word: "多久", tag: "m" },
                Tag { word: ",", tag: "x" },
                Tag { word: "我", tag: "r" },
                Tag { word: "就", tag: "d" },
                Tag { word: "会", tag: "v" },
                Tag { word: "升职", tag: "v" },
                Tag { word: "加薪", tag: "nr" },
                Tag { word: ",", tag: "x" },
                Tag { word: "当上", tag: "t" },
                Tag { word: "CEO", tag: "eng" },
                Tag { word: ",", tag: "x" },
                Tag { word: "走上", tag: "v" },
                Tag { word: "人生", tag: "n" },
                Tag { word: "巅峰", tag: "n" },
                Tag { word: "。", tag: "x" }
            ]
        );

        let tags = jieba.tag("今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。", true);
        assert_eq!(
            tags,
            vec![
                Tag { word: "今天", tag: "t" },
                Tag { word: "纽约", tag: "ns" },
                Tag { word: "的", tag: "uj" },
                Tag { word: "天气", tag: "n" },
                Tag { word: "真好", tag: "d" },
                Tag { word: "啊", tag: "zg" },
                Tag { word: ",", tag: "x" },
                Tag { word: "京华", tag: "nz" },
                Tag { word: "大酒店", tag: "n" },
                Tag { word: "的", tag: "uj" },
                Tag { word: "张尧", tag: "x" },
                Tag { word: "经理", tag: "n" },
                Tag { word: "吃", tag: "v" },
                Tag { word: "了", tag: "ul" },
                Tag { word: "一只", tag: "m" },
                Tag { word: "北京烤鸭", tag: "n" },
                Tag { word: "。", tag: "x" }
            ]
        );
    }

    #[test]
    fn test_tokenize() {
        let jieba = Jieba::new();
        let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "南京市",
                    start: 0,
                    end: 3
                },
                Token {
                    word: "长江大桥",
                    start: 3,
                    end: 7
                }
            ]
        );

        let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Search, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "南京",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "京市",
                    start: 1,
                    end: 3
                },
                Token {
                    word: "南京市",
                    start: 0,
                    end: 3
                },
                Token {
                    word: "长江",
                    start: 3,
                    end: 5
                },
                Token {
                    word: "大桥",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "长江大桥",
                    start: 3,
                    end: 7
                }
            ]
        );

        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "中",
                    start: 2,
                    end: 3
                },
                Token {
                    word: "出",
                    start: 3,
                    end: 4
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9
                }
            ]
        );
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "中出",
                    start: 2,
                    end: 4
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9
                }
            ]
        );

        let tokens = jieba.tokenize("永和服装饰品有限公司", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "永和",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "服装",
                    start: 2,
                    end: 4
                },
                Token {
                    word: "饰品",
                    start: 4,
                    end: 6
                },
                Token {
                    word: "有限公司",
                    start: 6,
                    end: 10
                }
            ]
        );
    }

    #[test]
    fn test_userdict() {
        let mut jieba = Jieba::new();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "中",
                    start: 2,
                    end: 3
                },
                Token {
                    word: "出",
                    start: 3,
                    end: 4
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9
                }
            ]
        );
        let userdict = "中出 10000";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "中出",
                    start: 2,
                    end: 4
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9
                }
            ]
        );
    }

    #[test]
    fn test_userdict_hmm() {
        let mut jieba = Jieba::new();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "中出",
                    start: 2,
                    end: 4
                },
                Token {
                    word: "了",
                    start: 4,
                    end: 5
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9
                }
            ]
        );
        let userdict = "出了 10000";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token {
                    word: "我们",
                    start: 0,
                    end: 2
                },
                Token {
                    word: "中",
                    start: 2,
                    end: 3
                },
                Token {
                    word: "出了",
                    start: 3,
                    end: 5
                },
                Token {
                    word: "一个",
                    start: 5,
                    end: 7
                },
                Token {
                    word: "叛徒",
                    start: 7,
                    end: 9
                }
            ]
        );
    }

    #[test]
    fn test_userdict_error() {
        let mut jieba = Jieba::empty();
        let userdict = "出了 not_a_int";
        let ret = jieba.load_dict(&mut BufReader::new(userdict.as_bytes()));
        assert!(ret.is_err());
    }

    #[test]
    fn test_suggest_freq() {
        let mut jieba = Jieba::new();
        assert_eq!(jieba.suggest_freq("中出"), 348);
        assert_eq!(jieba.suggest_freq("出了"), 1263);

        let userdict = "中出 300";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        assert_eq!(jieba.suggest_freq("中出"), 348);

        let userdict = "中出 500";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        assert_eq!(jieba.suggest_freq("中出"), 500)
    }

    #[test]
    fn test_custom_lower_freq() {
        let mut jieba = Jieba::new();

        jieba.add_word("测试", Some(2445), None);
        jieba.add_word("测试", Some(10), None);
        let words = jieba.cut("测试", false);
        assert_eq!(words, vec!["测试"]);
    }

    #[test]
    fn test_cut_dag_no_hmm_against_string_with_sip() {
        let mut jieba = Jieba::empty();

        jieba.add_word("䶴䶵𦡦", Some(1000), None);
        jieba.add_word("讥䶯䶰䶱䶲䶳", Some(1000), None);

        let words = jieba.cut("讥䶯䶰䶱䶲䶳䶴䶵𦡦", false);
        assert_eq!(words, vec!["讥䶯䶰䶱䶲䶳", "䶴䶵𦡦"]);
    }

    #[test]
    fn test_add_custom_word_with_underscore() {
        let mut jieba = Jieba::empty();
        jieba.add_word("田-女士", Some(42), Some("n"));
        let words = jieba.cut("市民田-女士急匆匆", false);
        assert_eq!(words, vec!["市", "民", "田-女士", "急", "匆", "匆"]);
    }
}