jieba_rs/keywords/mod.rs
1use crate::Jieba;
2use derive_builder::Builder;
3use std::collections::BTreeSet;
4use std::sync::LazyLock;
5
6#[cfg(feature = "textrank")]
7pub mod textrank;
8#[cfg(feature = "tfidf")]
9pub mod tfidf;
10
/// Stop words that seed a keyword-extraction configuration when the caller
/// has not supplied a set of their own.
///
/// The set is built on first access and contains common lowercase English
/// function words. Each word is listed exactly once.
pub static DEFAULT_STOP_WORDS: LazyLock<BTreeSet<String>> = LazyLock::new(|| {
    [
        "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are", "by", "be", "as", "on", "with",
        "can", "if", "from", "which", "you", "it", "this", "then", "at", "have", "all", "not", "one", "has", "or",
    ]
    .iter()
    .copied()
    .map(String::from)
    .collect::<BTreeSet<String>>()
});
22
/// A keyword paired with its numeric weight.
#[derive(Clone, Debug, PartialEq)]
pub struct Keyword {
    /// The keyword text.
    pub keyword: String,
    /// The weight associated with the keyword.
    pub weight: f64,
}
29
30/// Creates a KeywordExtractConfig state that contains filter criteria as
31/// well as segmentation configuration for use by keyword extraction
32/// implementations.
33///
34/// Use KeywordExtractConfigBuilder to change the defaults.
35///
36/// # Examples
37/// ```
38/// use jieba_rs::KeywordExtractConfig;
39///
40/// let mut config = KeywordExtractConfig::default();
41/// assert!(config.stop_words().contains("the"));
42/// assert!(!config.stop_words().contains("FakeWord"));
43/// assert!(!config.use_hmm());
44/// assert_eq!(2, config.min_keyword_length());
45///
46/// let built_default = KeywordExtractConfig::builder().build().unwrap();
47/// assert_eq!(config, built_default);
48///
49/// let changed = KeywordExtractConfig::builder()
50/// .add_stop_word("FakeWord".to_string())
51/// .remove_stop_word("the")
52/// .use_hmm(true)
53/// .min_keyword_length(10)
54/// .build().unwrap();
55///
56/// assert!(!changed.stop_words().contains("the"));
57/// assert!(changed.stop_words().contains("FakeWord"));
58/// assert!(changed.use_hmm());
59/// assert_eq!(10, changed.min_keyword_length());
60/// ```
61#[derive(Builder, Debug, Clone, PartialEq)]
62pub struct KeywordExtractConfig {
63 #[builder(default = "self.default_stop_words()?", setter(custom))]
64 stop_words: BTreeSet<String>,
65
66 #[builder(default = "2")]
67 #[doc = r"Any segments less than this length will not be considered a Keyword"]
68 min_keyword_length: usize,
69
70 #[builder(default = "false")]
71 #[doc = r"If true, fall back to hmm model if segment cannot be found in the dictionary"]
72 use_hmm: bool,
73}
74
75impl KeywordExtractConfig {
76 pub fn builder() -> KeywordExtractConfigBuilder {
77 KeywordExtractConfigBuilder::default()
78 }
79
80 /// Get current set of stop words.
81 pub fn stop_words(&self) -> &BTreeSet<String> {
82 &self.stop_words
83 }
84
85 /// True if hmm is used during segmentation in `extract_tags`.
86 pub fn use_hmm(&self) -> bool {
87 self.use_hmm
88 }
89
90 /// Gets the minimum number of Unicode Scalar Values required per keyword.
91 pub fn min_keyword_length(&self) -> usize {
92 self.min_keyword_length
93 }
94
95 #[inline]
96 pub(crate) fn filter(&self, s: &str) -> bool {
97 s.chars().count() >= self.min_keyword_length() && !self.stop_words.contains(&s.to_lowercase())
98 }
99}
100
101impl KeywordExtractConfigBuilder {
102 fn default_stop_words(&self) -> Result<BTreeSet<String>, KeywordExtractConfigBuilderError> {
103 Ok(DEFAULT_STOP_WORDS.clone())
104 }
105
106 /// Add a new stop word.
107 ///
108 /// # Examples
109 /// ```
110 /// use jieba_rs::KeywordExtractConfig;
111 /// use std::collections::BTreeSet;
112 ///
113 /// let populates_default = KeywordExtractConfig::builder()
114 /// .add_stop_word("FakeWord".to_string())
115 /// .build().unwrap();
116 ///
117 /// assert!(populates_default.stop_words().contains("the"));
118 /// assert!(populates_default.stop_words().contains("FakeWord"));
119 ///
120 /// let multiple_adds_stack = KeywordExtractConfig::builder()
121 /// .add_stop_word("FakeWord".to_string())
122 /// .add_stop_word("MoarFakeWord".to_string())
123 /// .build().unwrap();
124 ///
125 /// assert!(multiple_adds_stack.stop_words().contains("the"));
126 /// assert!(multiple_adds_stack.stop_words().contains("FakeWord"));
127 /// assert!(multiple_adds_stack.stop_words().contains("MoarFakeWord"));
128 ///
129 /// let no_default_if_set = KeywordExtractConfig::builder()
130 /// .set_stop_words(BTreeSet::from(["boo".to_string()]))
131 /// .add_stop_word("FakeWord".to_string())
132 /// .build().unwrap();
133 ///
134 /// assert!(!no_default_if_set.stop_words().contains("the"));
135 /// assert!(no_default_if_set.stop_words().contains("boo"));
136 /// assert!(no_default_if_set.stop_words().contains("FakeWord"));
137 /// ```
138 pub fn add_stop_word(&mut self, word: String) -> &mut Self {
139 if self.stop_words.is_none() {
140 self.stop_words = Some(self.default_stop_words().unwrap());
141 }
142 self.stop_words.as_mut().unwrap().insert(word);
143 self
144 }
145
146 /// Remove an existing stop word.
147 ///
148 /// # Examples
149 /// ```
150 /// use jieba_rs::KeywordExtractConfig;
151 /// use std::collections::BTreeSet;
152 ///
153 /// let populates_default = KeywordExtractConfig::builder()
154 /// .remove_stop_word("the")
155 /// .build().unwrap();
156 ///
157 /// assert!(!populates_default.stop_words().contains("the"));
158 /// assert!(populates_default.stop_words().contains("of"));
159 ///
160 /// let no_default_if_set = KeywordExtractConfig::builder()
161 /// .set_stop_words(BTreeSet::from(["boo".to_string()]))
162 /// // Removing non-existant word is okay.
163 /// .remove_stop_word("the".to_string())
164 /// .build().unwrap();
165 ///
166 /// assert!(!no_default_if_set.stop_words().contains("the"));
167 /// assert!(!no_default_if_set.stop_words().contains("of"));
168 /// assert!(no_default_if_set.stop_words().contains("boo"));
169 /// ```
170 pub fn remove_stop_word(&mut self, word: impl AsRef<str>) -> &mut Self {
171 if self.stop_words.is_none() {
172 self.stop_words = Some(self.default_stop_words().unwrap());
173 }
174 self.stop_words.as_mut().unwrap().remove(word.as_ref());
175 self
176 }
177
178 /// Replace all stop words with new stop words set.
179 ///
180 /// # Examples
181 /// ```
182 /// use jieba_rs::KeywordExtractConfig;
183 /// use std::collections::BTreeSet;
184 ///
185 /// let no_default_if_set = KeywordExtractConfig::builder()
186 /// .set_stop_words(BTreeSet::from(["boo".to_string()]))
187 /// .build().unwrap();
188 ///
189 /// assert!(!no_default_if_set.stop_words().contains("the"));
190 /// assert!(no_default_if_set.stop_words().contains("boo"));
191 ///
192 /// let overwrites = KeywordExtractConfig::builder()
193 /// .add_stop_word("FakeWord".to_string())
194 /// .set_stop_words(BTreeSet::from(["boo".to_string()]))
195 /// .build().unwrap();
196 ///
197 /// assert!(!no_default_if_set.stop_words().contains("FakeWord"));
198 /// assert!(no_default_if_set.stop_words().contains("boo"));
199 /// ```
200 pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) -> &mut Self {
201 self.stop_words = Some(stop_words);
202 self
203 }
204}
205
206impl Default for KeywordExtractConfig {
207 fn default() -> KeywordExtractConfig {
208 KeywordExtractConfigBuilder::default().build().unwrap()
209 }
210}
211
212/// Extracts keywords from a given sentence with the Jieba instance.
213pub trait KeywordExtract {
214 fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec<String>) -> Vec<Keyword>;
215}