use lazy_static::lazy_static;

use std::cmp::Ordering;
use std::io::BufRead;

use cedarwood::Cedar;
use hashbrown::HashMap;
use regex::{Match, Matches, Regex};

pub(crate) type FxHashMap<K, V> = HashMap<K, V, fxhash::FxBuildHasher>;

pub use crate::errors::Error;
#[cfg(feature = "textrank")]
pub use crate::keywords::textrank::TextRank;
#[cfg(feature = "tfidf")]
pub use crate::keywords::tfidf::TFIDF;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
pub use crate::keywords::{Keyword, KeywordExtract};

mod errors;
mod hmm;
#[cfg(any(feature = "tfidf", feature = "textrank"))]
mod keywords;
mod sparse_dag;

#[cfg(feature = "default-dict")]
static DEFAULT_DICT: &str = include_str!("data/dict.txt");

use sparse_dag::StaticSparseDAG;

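// Regexes used to pre-split input text: the RE_HAN_* patterns match the runs
// of characters that should be segmented (Han characters, plus ASCII
// alphanumerics and a few connector symbols in the default pattern), while the
// RE_SKIP_* patterns describe the separators between such runs.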
lazy_static! {
    static ref RE_HAN_DEFAULT: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}a-zA-Z0-9+#&\._%\-]+)").unwrap();
    static ref RE_SKIP_DEAFULT: Regex = Regex::new(r"(\r\n|\s)").unwrap();
    static ref RE_HAN_CUT_ALL: Regex = Regex::new(r"([\u{3400}-\u{4DBF}\u{4E00}-\u{9FFF}\u{F900}-\u{FAFF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{2F800}-\u{2FA1F}]+)").unwrap();
    static ref RE_SKIP_CUT_ALL: Regex = Regex::new(r"[^a-zA-Z0-9+#\n]").unwrap();
}

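/// An iterator over a string that yields both the regex matches and the
/// unmatched gaps between them, so no part of the input is dropped.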
struct SplitMatches<'r, 't> {
    finder: Matches<'r, 't>,
    text: &'t str,
    last: usize,
    matched: Option<Match<'t>>,
}

impl<'r, 't> SplitMatches<'r, 't> {
    #[inline]
    fn new(re: &'r Regex, text: &'t str) -> SplitMatches<'r, 't> {
        SplitMatches {
            finder: re.find_iter(text),
            text,
            last: 0,
            matched: None,
        }
    }
}

#[derive(Debug)]
pub(crate) enum SplitState<'t> {
    Unmatched(&'t str),
    Matched(Match<'t>),
}

impl<'t> SplitState<'t> {
    #[inline]
    fn into_str(self) -> &'t str {
        match self {
            SplitState::Unmatched(t) => t,
            SplitState::Matched(matched) => matched.as_str(),
        }
    }
}

impl<'r, 't> Iterator for SplitMatches<'r, 't> {
    type Item = SplitState<'t>;

    fn next(&mut self) -> Option<SplitState<'t>> {
        if let Some(matched) = self.matched.take() {
            return Some(SplitState::Matched(matched));
        }
        match self.finder.next() {
            None => {
                if self.last >= self.text.len() {
                    None
                } else {
                    let s = &self.text[self.last..];
                    self.last = self.text.len();
                    Some(SplitState::Unmatched(s))
                }
            }
            Some(m) => {
                if self.last == m.start() {
                    self.last = m.end();
                    Some(SplitState::Matched(m))
                } else {
                    let unmatched = &self.text[self.last..m.start()];
                    self.last = m.end();
                    self.matched = Some(m);
                    Some(SplitState::Unmatched(unmatched))
                }
            }
        }
    }
}

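/// Tokenization mode: `Default` emits each word once, while `Search` also
/// emits the in-dictionary sub-words of longer words, giving the finer
/// granularity useful for search engines.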
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizeMode {
    Default,
    Search,
}

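/// A token produced by [`Jieba::tokenize`]: the word together with its start
/// and end offsets, counted in characters rather than bytes.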
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Token<'a> {
    pub word: &'a str,
    pub start: usize,
    pub end: usize,
}

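/// A word together with its part-of-speech tag, as returned by [`Jieba::tag`].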
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Tag<'a> {
    pub word: &'a str,
    pub tag: &'a str,
}

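/// A dictionary entry: the word's frequency and part-of-speech tag.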
#[derive(Debug, Clone)]
struct Record {
    freq: usize,
    tag: String,
}

impl Record {
    #[inline(always)]
    fn new(freq: usize, tag: String) -> Self {
        Self { freq, tag }
    }
}

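/// The Jieba segmenter.
///
/// The dictionary is held in a cedar double-array trie, with each word's
/// frequency and part-of-speech tag stored in `records`; `total` is the sum of
/// all frequencies and normalizes the log-probabilities used while cutting.
///
/// Illustrative usage (a sketch, assuming the crate is imported as `jieba_rs`
/// and the `default-dict` feature is enabled; the expected output is taken
/// from the tests below):
///
/// ```rust,ignore
/// use jieba_rs::Jieba;
///
/// let jieba = Jieba::new();
/// let words = jieba.cut("我们中出了一个叛徒", false);
/// assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]);
/// ```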
#[derive(Debug, Clone)]
pub struct Jieba {
    records: Vec<Record>,
    cedar: Cedar,
    total: usize,
    longest_word_len: usize,
    re_han: Regex,
    re_skip: Regex,
    re_han_cut: Regex,
    re_skip_cut: Regex,
}

#[cfg(feature = "default-dict")]
impl Default for Jieba {
    fn default() -> Self {
        Jieba::new()
    }
}

impl Jieba {
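    /// Creates a new instance with an empty dictionary.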
    pub fn empty() -> Self {
        Jieba {
            records: Vec::new(),
            cedar: Cedar::new(),
            total: 0,
            longest_word_len: 0,
            re_han: RE_HAN_DEFAULT.clone(),
            re_skip: RE_SKIP_DEAFULT.clone(),
            re_han_cut: RE_HAN_CUT_ALL.clone(),
            re_skip_cut: RE_SKIP_CUT_ALL.clone(),
        }
    }

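    /// Creates a new instance with the embedded default dictionary (requires
    /// the `default-dict` feature).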
    #[cfg(feature = "default-dict")]
    pub fn new() -> Self {
        use std::io::BufReader;

        let mut instance = Self::empty();
        let mut default_dict = BufReader::new(DEFAULT_DICT.as_bytes());
        instance.load_dict(&mut default_dict).unwrap();
        instance
    }

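    /// Overrides the regex that selects the runs of text to be segmented in
    /// default mode.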
    pub fn set_re_han(&mut self, re: Regex) {
        self.re_han = re;
    }

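    /// Overrides the regex that selects the runs of text to be segmented in
    /// cut-all mode.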
    pub fn set_re_han_cut(&mut self, re: Regex) {
        self.re_han_cut = re;
    }

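    /// Overrides the regex applied to the text between segmented runs in
    /// default mode.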
    pub fn set_re_skip(&mut self, re: Regex) {
        self.re_skip = re;
    }

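    /// Overrides the regex applied to the text between segmented runs in
    /// cut-all mode.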
    pub fn set_re_skip_cut(&mut self, re: Regex) {
        self.re_skip_cut = re;
    }

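    /// Creates a new instance from a dictionary read from `dict`, one entry
    /// per line in the form `word [frequency [tag]]`.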
    pub fn with_dict<R: BufRead>(dict: &mut R) -> Result<Self, Error> {
        let mut instance = Self::empty();
        instance.load_dict(dict)?;
        Ok(instance)
    }

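    /// Adds a word to the dictionary.
    ///
    /// If `freq` is `None`, a frequency is chosen via [`Jieba::suggest_freq`];
    /// if `tag` is `None`, the empty tag is used. Returns the frequency that
    /// was stored.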
    pub fn add_word(&mut self, word: &str, freq: Option<usize>, tag: Option<&str>) -> usize {
        let freq = freq.unwrap_or_else(|| self.suggest_freq(word));
        let tag = tag.unwrap_or("");

        match self.cedar.exact_match_search(word) {
            Some((word_id, _, _)) => {
                let old_freq = self.records[word_id as usize].freq;
                self.records[word_id as usize].freq = freq;

                self.total += freq;
                self.total -= old_freq;
            }
            None => {
                self.records.push(Record::new(freq, String::from(tag)));
                let word_id = (self.records.len() - 1) as i32;

                self.cedar.update(word, word_id);
                self.total += freq;
            }
        };

        let curr_word_len = word.chars().count();
        if self.longest_word_len < curr_word_len {
            self.longest_word_len = curr_word_len;
        }

        freq
    }

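    /// Loads a dictionary from a reader, merging its entries into the current
    /// dictionary.
    ///
    /// Each non-empty line has the form `word [frequency [tag]]`; a missing
    /// frequency defaults to 0, and an unparseable frequency yields
    /// [`Error::InvalidDictEntry`]. For words that already exist, the
    /// frequency is overwritten.
    ///
    /// A minimal sketch of loading a user dictionary (entry taken from the
    /// tests below; assumes the crate path `jieba_rs`):
    ///
    /// ```rust,ignore
    /// use std::io::BufReader;
    /// use jieba_rs::Jieba;
    ///
    /// let mut jieba = Jieba::new();
    /// // One entry: the word "中出" with frequency 10000 and no tag.
    /// jieba.load_dict(&mut BufReader::new("中出 10000".as_bytes())).unwrap();
    /// ```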
    pub fn load_dict<R: BufRead>(&mut self, dict: &mut R) -> Result<(), Error> {
        let mut buf = String::new();
        self.total = 0;
        self.longest_word_len = 0;

        let mut line_no = 0;
        while dict.read_line(&mut buf)? > 0 {
            {
                line_no += 1;
                let mut iter = buf.trim().split_whitespace();
                if let Some(word) = iter.next() {
                    let freq = iter
                        .next()
                        .map(|x| {
                            x.parse::<usize>().map_err(|e| {
                                Error::InvalidDictEntry(format!(
                                    "line {} `{}` frequency {} is not a valid integer: {}",
                                    line_no, buf, x, e
                                ))
                            })
                        })
                        .unwrap_or(Ok(0))?;
                    let tag = iter.next().unwrap_or("");

                    let curr_word_len = word.chars().count();
                    if self.longest_word_len < curr_word_len {
                        self.longest_word_len = curr_word_len;
                    }

                    match self.cedar.exact_match_search(word) {
                        Some((word_id, _, _)) => {
                            self.records[word_id as usize].freq = freq;
                        }
                        None => {
                            self.records.push(Record::new(freq, String::from(tag)));
                            let word_id = (self.records.len() - 1) as i32;
                            self.cedar.update(word, word_id);
                        }
                    };
                }
            }
            buf.clear();
        }
        self.total = self.records.iter().map(|n| n.freq).sum();

        Ok(())
    }

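    /// Looks up the frequency of `word` in the dictionary, falling back to
    /// `default` if the word is absent.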
    fn get_word_freq(&self, word: &str, default: usize) -> usize {
        match self.cedar.exact_match_search(word) {
            Some((word_id, _, _)) => self.records[word_id as usize].freq,
            _ => default,
        }
    }

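    /// Suggests a frequency for `segment` that is high enough for it to be
    /// kept as one word by the default cut.
    ///
    /// The value is derived from the words `segment` currently splits into:
    /// `max(total * Π(freq_i / total) + 1, current_freq)`, evaluated in log
    /// space below.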
    pub fn suggest_freq(&self, segment: &str) -> usize {
        let logtotal = (self.total as f64).ln();
        let logfreq = self.cut(segment, false).iter().fold(0f64, |freq, word| {
            freq + (self.get_word_freq(word, 1) as f64).ln() - logtotal
        });
        std::cmp::max((logfreq + logtotal).exp() as usize + 1, self.get_word_freq(segment, 1))
    }

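    /// Dynamic programming over the word DAG: scanning the sentence from the
    /// last character back to the first, `route[i]` is filled with the best
    /// `(log probability, end offset)` for a segmentation starting at byte
    /// offset `i`, where each candidate word contributes `ln(freq) - ln(total)`.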
    #[allow(clippy::ptr_arg)]
    fn calc(&self, sentence: &str, dag: &StaticSparseDAG, route: &mut Vec<(f64, usize)>) {
        let str_len = sentence.len();

        if str_len + 1 > route.len() {
            route.resize(str_len + 1, (0.0, 0));
        }

        let logtotal = (self.total as f64).ln();
        let mut prev_byte_start = str_len;
        let curr = sentence.char_indices().map(|x| x.0).rev();
        for byte_start in curr {
            let pair = dag
                .iter_edges(byte_start)
                .map(|byte_end| {
                    let wfrag = if byte_end == str_len {
                        &sentence[byte_start..]
                    } else {
                        &sentence[byte_start..byte_end]
                    };

                    let freq = if let Some((word_id, _, _)) = self.cedar.exact_match_search(wfrag) {
                        self.records[word_id as usize].freq
                    } else {
                        1
                    };

                    ((freq as f64).ln() - logtotal + route[byte_end].0, byte_end)
                })
                .max_by(|x, y| x.partial_cmp(y).unwrap_or(Ordering::Equal));

            if let Some(p) = pair {
                route[byte_start] = p;
            } else {
                let byte_end = prev_byte_start;
                let freq = 1;
                route[byte_start] = ((freq as f64).ln() - logtotal + route[byte_end].0, byte_end);
            }

            prev_byte_start = byte_start;
        }
    }

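    /// Builds the word DAG: for every character offset, records the end
    /// offsets of all dictionary words starting there, found with a
    /// common-prefix search in the cedar trie.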
    fn dag(&self, sentence: &str, dag: &mut StaticSparseDAG) {
        for (byte_start, _) in sentence.char_indices().peekable() {
            dag.start(byte_start);
            let haystack = &sentence[byte_start..];

            for (_, end_index) in self.cedar.common_prefix_iter(haystack) {
                dag.insert(end_index + byte_start + 1);
            }

            dag.commit();
        }
    }

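    /// Full mode: enumerates every edge of the word DAG, i.e. every dictionary
    /// word occurring at every position, overlaps included.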
    fn cut_all_internal<'a>(&self, sentence: &'a str, words: &mut Vec<&'a str>) {
        let str_len = sentence.len();
        let mut dag = StaticSparseDAG::with_size_hint(sentence.len());
        self.dag(sentence, &mut dag);

        let curr = sentence.char_indices().map(|x| x.0);
        for byte_start in curr {
            for byte_end in dag.iter_edges(byte_start) {
                let word = if byte_end == str_len {
                    &sentence[byte_start..]
                } else {
                    &sentence[byte_start..byte_end]
                };

                words.push(word)
            }
        }
    }

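    /// Default cut without the HMM: follows the best route computed by `calc`,
    /// buffering consecutive single ASCII alphanumeric characters so that runs
    /// such as "abc" are emitted as one word.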
    fn cut_dag_no_hmm<'a>(
        &self,
        sentence: &'a str,
        words: &mut Vec<&'a str>,
        route: &mut Vec<(f64, usize)>,
        dag: &mut StaticSparseDAG,
    ) {
        self.dag(sentence, dag);
        self.calc(sentence, dag, route);
        let mut x = 0;
        let mut left: Option<usize> = None;

        while x < sentence.len() {
            let y = route[x].1;
            let l_str = if y < sentence.len() {
                &sentence[x..y]
            } else {
                &sentence[x..]
            };

            if l_str.chars().count() == 1 && l_str.chars().all(|ch| ch.is_ascii_alphanumeric()) {
                if left.is_none() {
                    left = Some(x);
                }
            } else {
                if let Some(byte_start) = left {
                    let word = &sentence[byte_start..x];
                    words.push(word);
                    left = None;
                }

                let word = if y < sentence.len() {
                    &sentence[x..y]
                } else {
                    &sentence[x..]
                };

                words.push(word);
            }
            x = y;
        }

        if let Some(byte_start) = left {
            let word = &sentence[byte_start..];
            words.push(word);
        }

        dag.clear();
        route.clear();
    }

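    /// Default cut with the HMM: as in `cut_dag_no_hmm`, but a buffered run of
    /// single characters that is not itself a dictionary word is re-segmented
    /// with the Viterbi HMM so that unknown words can be recognized.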
    #[allow(non_snake_case, clippy::too_many_arguments)]
    fn cut_dag_hmm<'a>(
        &self,
        sentence: &'a str,
        words: &mut Vec<&'a str>,
        route: &mut Vec<(f64, usize)>,
        dag: &mut StaticSparseDAG,
        V: &mut Vec<f64>,
        prev: &mut Vec<Option<hmm::Status>>,
        path: &mut Vec<hmm::Status>,
    ) {
        self.dag(sentence, dag);
        self.calc(sentence, dag, route);
        let mut x = 0;
        let mut left: Option<usize> = None;

        while x < sentence.len() {
            let y = route[x].1;

            if sentence[x..y].chars().count() == 1 {
                if left.is_none() {
                    left = Some(x);
                }
            } else {
                if let Some(byte_start) = left {
                    let byte_end = x;
                    let word = if byte_end < sentence.len() {
                        &sentence[byte_start..byte_end]
                    } else {
                        &sentence[byte_start..]
                    };

                    if word.chars().count() == 1 {
                        words.push(word);
                    } else if self.cedar.exact_match_search(word).is_none() {
                        hmm::cut_with_allocated_memory(word, words, V, prev, path);
                    } else {
                        let mut word_indices = word.char_indices().map(|x| x.0).peekable();
                        while let Some(byte_start) = word_indices.next() {
                            if let Some(byte_end) = word_indices.peek() {
                                words.push(&word[byte_start..*byte_end]);
                            } else {
                                words.push(&word[byte_start..]);
                            }
                        }
                    }
                    left = None;
                }
                let word = if y < sentence.len() {
                    &sentence[x..y]
                } else {
                    &sentence[x..]
                };
                words.push(word);
            }
            x = y;
        }

        if let Some(byte_start) = left {
            let word = &sentence[byte_start..];

            if word.chars().count() == 1 {
                words.push(word);
            } else if self.cedar.exact_match_search(word).is_none() {
                hmm::cut(word, words);
            } else {
                let mut word_indices = word.char_indices().map(|x| x.0).peekable();
                while let Some(byte_start) = word_indices.next() {
                    if let Some(byte_end) = word_indices.peek() {
                        words.push(&word[byte_start..*byte_end]);
                    } else {
                        words.push(&word[byte_start..]);
                    }
                }
            }
        }

        dag.clear();
        route.clear();
    }

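    /// Shared driver for all cut modes: the input is split into runs matched
    /// by `re_han`, which go through the DAG (and optionally HMM) machinery,
    /// and the gaps between them, which are handled with `re_skip`. The
    /// scratch buffers are reused across blocks; `R` is the number of HMM
    /// states and `C` the character count, which size the Viterbi buffers.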
    #[allow(non_snake_case)]
    fn cut_internal<'a>(&self, sentence: &'a str, cut_all: bool, hmm: bool) -> Vec<&'a str> {
        let heuristic_capacity = sentence.len() / 2;
        let mut words = Vec::with_capacity(heuristic_capacity);
        let re_han: &Regex = if cut_all { &self.re_han_cut } else { &self.re_han };
        let re_skip: &Regex = if cut_all { &self.re_skip_cut } else { &self.re_skip };
        let splitter = SplitMatches::new(&re_han, sentence);
        let mut route = Vec::with_capacity(heuristic_capacity);
        let mut dag = StaticSparseDAG::with_size_hint(heuristic_capacity);

        let R = 4;
        let C = sentence.chars().count();
        let mut V = if hmm { vec![0.0; R * C] } else { Vec::new() };
        let mut prev: Vec<Option<hmm::Status>> = if hmm { vec![None; R * C] } else { Vec::new() };
        let mut path: Vec<hmm::Status> = if hmm { vec![hmm::Status::B; C] } else { Vec::new() };

        for state in splitter {
            match state {
                SplitState::Matched(_) => {
                    let block = state.into_str();
                    assert!(!block.is_empty());

                    if cut_all {
                        self.cut_all_internal(block, &mut words);
                    } else if hmm {
                        self.cut_dag_hmm(block, &mut words, &mut route, &mut dag, &mut V, &mut prev, &mut path);
                    } else {
                        self.cut_dag_no_hmm(block, &mut words, &mut route, &mut dag);
                    }
                }
                SplitState::Unmatched(_) => {
                    let block = state.into_str();
                    assert!(!block.is_empty());

                    let skip_splitter = SplitMatches::new(&re_skip, block);
                    for skip_state in skip_splitter {
                        let word = skip_state.into_str();
                        if word.is_empty() {
                            continue;
                        }
                        if cut_all || re_skip.is_match(word) {
                            words.push(word);
                        } else {
                            let mut word_indices = word.char_indices().map(|x| x.0).peekable();
                            while let Some(byte_start) = word_indices.next() {
                                if let Some(byte_end) = word_indices.peek() {
                                    words.push(&word[byte_start..*byte_end]);
                                } else {
                                    words.push(&word[byte_start..]);
                                }
                            }
                        }
                    }
                }
            }
        }
        words
    }

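    /// Cuts a sentence into words.
    ///
    /// When `hmm` is true, runs of characters that are not covered by the
    /// dictionary are re-segmented with the HMM Viterbi segmenter, so unknown
    /// words can be recognized.
    ///
    /// Illustrative example (expected values taken from the tests below;
    /// assumes the crate path `jieba_rs` and the `default-dict` feature):
    ///
    /// ```rust,ignore
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// assert_eq!(jieba.cut("我们中出了一个叛徒", false), vec!["我们", "中", "出", "了", "一个", "叛徒"]);
    /// assert_eq!(jieba.cut("我们中出了一个叛徒", true), vec!["我们", "中出", "了", "一个", "叛徒"]);
    /// ```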
    pub fn cut<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<&'a str> {
        self.cut_internal(sentence, false, hmm)
    }

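    /// Cuts a sentence in full mode, returning every dictionary word found
    /// anywhere in it, overlaps included.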
    pub fn cut_all<'a>(&self, sentence: &'a str) -> Vec<&'a str> {
        self.cut_internal(sentence, true, false)
    }

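    /// Cuts a sentence for search-engine indexing: like [`Jieba::cut`], but
    /// longer words additionally yield their in-dictionary 2- and 3-character
    /// sub-words.
    ///
    /// Example (expected value taken from the tests below; assumes the crate
    /// path `jieba_rs`):
    ///
    /// ```rust,ignore
    /// use jieba_rs::Jieba;
    ///
    /// let jieba = Jieba::new();
    /// assert_eq!(
    ///     jieba.cut_for_search("南京市长江大桥", true),
    ///     vec!["南京", "京市", "南京市", "长江", "大桥", "长江大桥"]
    /// );
    /// ```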
    pub fn cut_for_search<'a>(&self, sentence: &'a str, hmm: bool) -> Vec<&'a str> {
        let words = self.cut(sentence, hmm);
        let mut new_words = Vec::with_capacity(words.len());
        for word in words {
            let char_indices: Vec<usize> = word.char_indices().map(|x| x.0).collect();
            let char_count = char_indices.len();
            if char_count > 2 {
                for i in 0..char_count - 1 {
                    let byte_start = char_indices[i];
                    let gram2 = if i + 2 < char_count {
                        &word[byte_start..char_indices[i + 2]]
                    } else {
                        &word[byte_start..]
                    };
                    if self.cedar.exact_match_search(gram2).is_some() {
                        new_words.push(gram2);
                    }
                }
            }
            if char_count > 3 {
                for i in 0..char_count - 2 {
                    let byte_start = char_indices[i];
                    let gram3 = if i + 3 < char_count {
                        &word[byte_start..char_indices[i + 3]]
                    } else {
                        &word[byte_start..]
                    };
                    if self.cedar.exact_match_search(gram3).is_some() {
                        new_words.push(gram3);
                    }
                }
            }
            new_words.push(word);
        }
        new_words
    }

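    /// Tokenizes a sentence, returning each word with its start and end
    /// offsets measured in characters. In [`TokenizeMode::Search`], words
    /// longer than two characters also produce their in-dictionary 2- and
    /// 3-character sub-tokens.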
    pub fn tokenize<'a>(&self, sentence: &'a str, mode: TokenizeMode, hmm: bool) -> Vec<Token<'a>> {
        let words = self.cut(sentence, hmm);
        let mut tokens = Vec::with_capacity(words.len());
        let mut start = 0;
        match mode {
            TokenizeMode::Default => {
                for word in words {
                    let width = word.chars().count();
                    tokens.push(Token {
                        word,
                        start,
                        end: start + width,
                    });
                    start += width;
                }
            }
            TokenizeMode::Search => {
                for word in words {
                    let width = word.chars().count();
                    if width > 2 {
                        let char_indices: Vec<usize> = word.char_indices().map(|x| x.0).collect();
                        for i in 0..width - 1 {
                            let byte_start = char_indices[i];
                            let gram2 = if i + 2 < width {
                                &word[byte_start..char_indices[i + 2]]
                            } else {
                                &word[byte_start..]
                            };
                            if self.cedar.exact_match_search(gram2).is_some() {
                                tokens.push(Token {
                                    word: gram2,
                                    start: start + i,
                                    end: start + i + 2,
                                });
                            }
                        }
                        if width > 3 {
                            for i in 0..width - 2 {
                                let byte_start = char_indices[i];
                                let gram3 = if i + 3 < width {
                                    &word[byte_start..char_indices[i + 3]]
                                } else {
                                    &word[byte_start..]
                                };
                                if self.cedar.exact_match_search(gram3).is_some() {
                                    tokens.push(Token {
                                        word: gram3,
                                        start: start + i,
                                        end: start + i + 3,
                                    });
                                }
                            }
                        }
                    }
                    tokens.push(Token {
                        word,
                        start,
                        end: start + width,
                    });
                    start += width;
                }
            }
        }
        tokens
    }

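    /// Cuts a sentence and attaches a part-of-speech tag to every word.
    ///
    /// Dictionary words get their stored tag. Unknown words are tagged
    /// heuristically: "x" if they contain no ASCII alphanumerics, "m" if every
    /// ASCII alphanumeric they contain is a digit, and "eng" otherwise.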
    pub fn tag<'a>(&'a self, sentence: &'a str, hmm: bool) -> Vec<Tag> {
        let words = self.cut(sentence, hmm);
        words
            .into_iter()
            .map(|word| {
                if let Some((word_id, _, _)) = self.cedar.exact_match_search(word) {
                    let t = &self.records[word_id as usize].tag;
                    return Tag { word, tag: t };
                }
                let mut eng = 0;
                let mut m = 0;
                for chr in word.chars() {
                    if chr.is_ascii_alphanumeric() {
                        eng += 1;
                        if chr.is_ascii_digit() {
                            m += 1;
                        }
                    }
                }
                let tag = if eng == 0 {
                    "x"
                } else if eng == m {
                    "m"
                } else {
                    "eng"
                };
                Tag { word, tag }
            })
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use super::{Jieba, SplitMatches, SplitState, Tag, Token, TokenizeMode, RE_HAN_DEFAULT};
    use std::io::BufReader;

    #[test]
    fn test_init_with_default_dict() {
        let _ = Jieba::new();
    }

    #[test]
    fn test_split_matches() {
        let re_han = &*RE_HAN_DEFAULT;
        let splitter = SplitMatches::new(
            &re_han,
            "👪 PS: 我觉得开源有一个好处,就是能够敦促自己不断改进 👪,避免敝帚自珍",
        );
        for state in splitter {
            match state {
                SplitState::Matched(_) => {
                    let block = state.into_str();
                    assert!(!block.is_empty());
                }
                SplitState::Unmatched(_) => {
                    let block = state.into_str();
                    assert!(!block.is_empty());
                }
            }
        }
    }

    #[test]
    fn test_split_matches_against_unicode_sip() {
        let re_han = &*RE_HAN_DEFAULT;
        let splitter = SplitMatches::new(&re_han, "讥䶯䶰䶱䶲䶳䶴䶵𦡦");

        let result: Vec<&str> = splitter.map(|x| x.into_str()).collect();
        assert_eq!(result, vec!["讥䶯䶰䶱䶲䶳䶴䶵𦡦"]);
    }

    #[test]
    fn test_cut_all() {
        let jieba = Jieba::new();
        let words = jieba.cut_all("abc网球拍卖会def");
        assert_eq!(
            words,
            vec!["abc", "网", "网球", "网球拍", "球", "球拍", "拍", "拍卖", "拍卖会", "卖", "会", "def"]
        );

        let words = jieba.cut_all("我来到北京清华大学");
        assert_eq!(
            words,
            vec![
                "我", "来", "来到", "到", "北", "北京", "京", "清", "清华", "清华大学", "华", "华大", "大",
                "大学", "学"
            ]
        );
    }

    #[test]
    fn test_cut_no_hmm() {
        let jieba = Jieba::new();
        let words = jieba.cut("abc网球拍卖会def", false);
        assert_eq!(words, vec!["abc", "网球", "拍卖会", "def"]);
    }

    #[test]
    fn test_cut_with_hmm() {
        let jieba = Jieba::new();
        let words = jieba.cut("我们中出了一个叛徒", false);
        assert_eq!(words, vec!["我们", "中", "出", "了", "一个", "叛徒"]);
        let words = jieba.cut("我们中出了一个叛徒", true);
        assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒"]);
        let words = jieba.cut("我们中出了一个叛徒👪", true);
        assert_eq!(words, vec!["我们", "中出", "了", "一个", "叛徒", "👪"]);

        let words = jieba.cut("我来到北京清华大学", true);
        assert_eq!(words, vec!["我", "来到", "北京", "清华大学"]);

        let words = jieba.cut("他来到了网易杭研大厦", true);
        assert_eq!(words, vec!["他", "来到", "了", "网易", "杭研", "大厦"]);
    }

    #[test]
    fn test_cut_weicheng() {
        static WEICHENG_TXT: &str = include_str!("../examples/weicheng/src/weicheng.txt");
        let jieba = Jieba::new();
        for line in WEICHENG_TXT.split('\n') {
            let _ = jieba.cut(line, true);
        }
    }

    #[test]
    fn test_cut_for_search() {
        let jieba = Jieba::new();
        let words = jieba.cut_for_search("南京市长江大桥", true);
        assert_eq!(words, vec!["南京", "京市", "南京市", "长江", "大桥", "长江大桥"]);

        let words = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true);

        assert_eq!(
            words,
            vec![
                "小明", "硕士", "毕业", "于", "中国", "科学", "学院", "科学院", "中国科学院", "计算", "计算所",
                ",", "后", "在", "日本", "京都", "大学", "日本京都大学", "深造"
            ]
        );
    }

    #[test]
    fn test_tag() {
        let jieba = Jieba::new();
        let tags = jieba.tag(
            "我是拖拉机学院手扶拖拉机专业的。不用多久,我就会升职加薪,当上CEO,走上人生巅峰。",
            true,
        );
        assert_eq!(
            tags,
            vec![
                Tag { word: "我", tag: "r" },
                Tag { word: "是", tag: "v" },
                Tag { word: "拖拉机", tag: "n" },
                Tag { word: "学院", tag: "n" },
                Tag { word: "手扶拖拉机", tag: "n" },
                Tag { word: "专业", tag: "n" },
                Tag { word: "的", tag: "uj" },
                Tag { word: "。", tag: "x" },
                Tag { word: "不用", tag: "v" },
                Tag { word: "多久", tag: "m" },
                Tag { word: ",", tag: "x" },
                Tag { word: "我", tag: "r" },
                Tag { word: "就", tag: "d" },
                Tag { word: "会", tag: "v" },
                Tag { word: "升职", tag: "v" },
                Tag { word: "加薪", tag: "nr" },
                Tag { word: ",", tag: "x" },
                Tag { word: "当上", tag: "t" },
                Tag { word: "CEO", tag: "eng" },
                Tag { word: ",", tag: "x" },
                Tag { word: "走上", tag: "v" },
                Tag { word: "人生", tag: "n" },
                Tag { word: "巅峰", tag: "n" },
                Tag { word: "。", tag: "x" },
            ]
        );

        let tags = jieba.tag("今天纽约的天气真好啊,京华大酒店的张尧经理吃了一只北京烤鸭。", true);
        assert_eq!(
            tags,
            vec![
                Tag { word: "今天", tag: "t" },
                Tag { word: "纽约", tag: "ns" },
                Tag { word: "的", tag: "uj" },
                Tag { word: "天气", tag: "n" },
                Tag { word: "真好", tag: "d" },
                Tag { word: "啊", tag: "zg" },
                Tag { word: ",", tag: "x" },
                Tag { word: "京华", tag: "nz" },
                Tag { word: "大酒店", tag: "n" },
                Tag { word: "的", tag: "uj" },
                Tag { word: "张尧", tag: "x" },
                Tag { word: "经理", tag: "n" },
                Tag { word: "吃", tag: "v" },
                Tag { word: "了", tag: "ul" },
                Tag { word: "一只", tag: "m" },
                Tag { word: "北京烤鸭", tag: "n" },
                Tag { word: "。", tag: "x" },
            ]
        );
    }

    #[test]
    fn test_tokenize() {
        let jieba = Jieba::new();
        let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token { word: "南京市", start: 0, end: 3 },
                Token { word: "长江大桥", start: 3, end: 7 },
            ]
        );

        let tokens = jieba.tokenize("南京市长江大桥", TokenizeMode::Search, false);
        assert_eq!(
            tokens,
            vec![
                Token { word: "南京", start: 0, end: 2 },
                Token { word: "京市", start: 1, end: 3 },
                Token { word: "南京市", start: 0, end: 3 },
                Token { word: "长江", start: 3, end: 5 },
                Token { word: "大桥", start: 5, end: 7 },
                Token { word: "长江大桥", start: 3, end: 7 },
            ]
        );

        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token { word: "我们", start: 0, end: 2 },
                Token { word: "中", start: 2, end: 3 },
                Token { word: "出", start: 3, end: 4 },
                Token { word: "了", start: 4, end: 5 },
                Token { word: "一个", start: 5, end: 7 },
                Token { word: "叛徒", start: 7, end: 9 },
            ]
        );
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token { word: "我们", start: 0, end: 2 },
                Token { word: "中出", start: 2, end: 4 },
                Token { word: "了", start: 4, end: 5 },
                Token { word: "一个", start: 5, end: 7 },
                Token { word: "叛徒", start: 7, end: 9 },
            ]
        );

        let tokens = jieba.tokenize("永和服装饰品有限公司", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token { word: "永和", start: 0, end: 2 },
                Token { word: "服装", start: 2, end: 4 },
                Token { word: "饰品", start: 4, end: 6 },
                Token { word: "有限公司", start: 6, end: 10 },
            ]
        );
    }

    #[test]
    fn test_userdict() {
        let mut jieba = Jieba::new();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token { word: "我们", start: 0, end: 2 },
                Token { word: "中", start: 2, end: 3 },
                Token { word: "出", start: 3, end: 4 },
                Token { word: "了", start: 4, end: 5 },
                Token { word: "一个", start: 5, end: 7 },
                Token { word: "叛徒", start: 7, end: 9 },
            ]
        );
        let userdict = "中出 10000";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, false);
        assert_eq!(
            tokens,
            vec![
                Token { word: "我们", start: 0, end: 2 },
                Token { word: "中出", start: 2, end: 4 },
                Token { word: "了", start: 4, end: 5 },
                Token { word: "一个", start: 5, end: 7 },
                Token { word: "叛徒", start: 7, end: 9 },
            ]
        );
    }

    #[test]
    fn test_userdict_hmm() {
        let mut jieba = Jieba::new();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token { word: "我们", start: 0, end: 2 },
                Token { word: "中出", start: 2, end: 4 },
                Token { word: "了", start: 4, end: 5 },
                Token { word: "一个", start: 5, end: 7 },
                Token { word: "叛徒", start: 7, end: 9 },
            ]
        );
        let userdict = "出了 10000";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        let tokens = jieba.tokenize("我们中出了一个叛徒", TokenizeMode::Default, true);
        assert_eq!(
            tokens,
            vec![
                Token { word: "我们", start: 0, end: 2 },
                Token { word: "中", start: 2, end: 3 },
                Token { word: "出了", start: 3, end: 5 },
                Token { word: "一个", start: 5, end: 7 },
                Token { word: "叛徒", start: 7, end: 9 },
            ]
        );
    }

    #[test]
    fn test_userdict_error() {
        let mut jieba = Jieba::empty();
        let userdict = "出了 not_a_int";
        let ret = jieba.load_dict(&mut BufReader::new(userdict.as_bytes()));
        assert!(ret.is_err());
    }

    #[test]
    fn test_suggest_freq() {
        let mut jieba = Jieba::new();
        // Frequencies suggested from the default dictionary alone.
        assert_eq!(jieba.suggest_freq("中出"), 348);
        assert_eq!(jieba.suggest_freq("出了"), 1263);

        // Loading a user frequency below the computed suggestion does not lower it.
        let userdict = "中出 300";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        assert_eq!(jieba.suggest_freq("中出"), 348);

        // A user frequency above the computed suggestion is returned as-is.
        let userdict = "中出 500";
        jieba.load_dict(&mut BufReader::new(userdict.as_bytes())).unwrap();
        assert_eq!(jieba.suggest_freq("中出"), 500);
    }

    #[test]
    fn test_custom_lower_freq() {
        let mut jieba = Jieba::new();

        jieba.add_word("测试", Some(2445), None);
        jieba.add_word("测试", Some(10), None);
        let words = jieba.cut("测试", false);
        assert_eq!(words, vec!["测试"]);
    }

    #[test]
    fn test_cut_dag_no_hmm_against_string_with_sip() {
        let mut jieba = Jieba::empty();

        jieba.add_word("䶴䶵𦡦", Some(1000), None);
        jieba.add_word("讥䶯䶰䶱䶲䶳", Some(1000), None);

        let words = jieba.cut("讥䶯䶰䶱䶲䶳䶴䶵𦡦", false);
        assert_eq!(words, vec!["讥䶯䶰䶱䶲䶳", "䶴䶵𦡦"]);
    }

    #[test]
    fn test_add_custom_word_with_underscore() {
        let mut jieba = Jieba::empty();
        jieba.add_word("田-女士", Some(42), Some("n"));
        let words = jieba.cut("市民田-女士急匆匆", false);
        assert_eq!(words, vec!["市", "民", "田-女士", "急", "匆", "匆"]);
    }
}