jieba_rs/keywords/
mod.rs

1use crate::Jieba;
2use derive_builder::Builder;
3use std::collections::BTreeSet;
4use std::sync::LazyLock;
5
6#[cfg(feature = "textrank")]
7pub mod textrank;
8#[cfg(feature = "tfidf")]
9pub mod tfidf;
10
/// Stop words used by [`KeywordExtractConfig`] when no custom set is supplied.
///
/// All entries are lowercase English words. The set is built once, on first
/// access, and then shared for the rest of the program.
pub static DEFAULT_STOP_WORDS: LazyLock<BTreeSet<String>> = LazyLock::new(|| {
    // Each word is listed exactly once; the set would silently collapse
    // duplicates anyway, but a clean list keeps the entry count obvious.
    [
        "the", "of", "is", "and", "to", "in", "that", "we", "for", "an", "are", "by", "be", "as", "on", "with",
        "can", "if", "from", "which", "you", "it", "this", "then", "at", "have", "all", "not", "one", "has", "or",
    ]
    .iter()
    .map(|word| (*word).to_owned())
    .collect()
});
22
/// A keyword paired with the weight assigned to it.
///
/// Returned by [`KeywordExtract::extract_keywords`]. How the weight is
/// computed is up to the extractor that produced the value.
#[derive(Clone, Debug, PartialEq)]
pub struct Keyword {
    /// Text of the keyword.
    pub keyword: String,
    /// Weight attached to `keyword`.
    pub weight: f64,
}
29
30/// Creates a KeywordExtractConfig state that contains filter criteria as
31/// well as segmentation configuration for use by keyword extraction
32/// implementations.
33///
34/// Use KeywordExtractConfigBuilder to change the defaults.
35///
36/// # Examples
37/// ```
38///    use jieba_rs::KeywordExtractConfig;
39///
40///    let mut config = KeywordExtractConfig::default();
41///    assert!(config.stop_words().contains("the"));
42///    assert!(!config.stop_words().contains("FakeWord"));
43///    assert!(!config.use_hmm());
44///    assert_eq!(2, config.min_keyword_length());
45///
46///    let built_default = KeywordExtractConfig::builder().build().unwrap();
47///    assert_eq!(config, built_default);
48///
49///    let changed = KeywordExtractConfig::builder()
50///        .add_stop_word("FakeWord".to_string())
51///        .remove_stop_word("the")
52///        .use_hmm(true)
53///        .min_keyword_length(10)
54///        .build().unwrap();
55///
56///    assert!(!changed.stop_words().contains("the"));
57///    assert!(changed.stop_words().contains("FakeWord"));
58///    assert!(changed.use_hmm());
59///    assert_eq!(10, changed.min_keyword_length());
60/// ```
61#[derive(Builder, Debug, Clone, PartialEq)]
62pub struct KeywordExtractConfig {
63    #[builder(default = "self.default_stop_words()?", setter(custom))]
64    stop_words: BTreeSet<String>,
65
66    #[builder(default = "2")]
67    #[doc = r"Any segments less than this length will not be considered a Keyword"]
68    min_keyword_length: usize,
69
70    #[builder(default = "false")]
71    #[doc = r"If true, fall back to hmm model if segment cannot be found in the dictionary"]
72    use_hmm: bool,
73}
74
75impl KeywordExtractConfig {
76    pub fn builder() -> KeywordExtractConfigBuilder {
77        KeywordExtractConfigBuilder::default()
78    }
79
80    /// Get current set of stop words.
81    pub fn stop_words(&self) -> &BTreeSet<String> {
82        &self.stop_words
83    }
84
85    /// True if hmm is used during segmentation in `extract_tags`.
86    pub fn use_hmm(&self) -> bool {
87        self.use_hmm
88    }
89
90    /// Gets the minimum number of Unicode Scalar Values required per keyword.
91    pub fn min_keyword_length(&self) -> usize {
92        self.min_keyword_length
93    }
94
95    #[inline]
96    pub(crate) fn filter(&self, s: &str) -> bool {
97        s.chars().count() >= self.min_keyword_length() && !self.stop_words.contains(&s.to_lowercase())
98    }
99}
100
101impl KeywordExtractConfigBuilder {
102    fn default_stop_words(&self) -> Result<BTreeSet<String>, KeywordExtractConfigBuilderError> {
103        Ok(DEFAULT_STOP_WORDS.clone())
104    }
105
106    /// Add a new stop word.
107    ///
108    /// # Examples
109    /// ```
110    ///    use jieba_rs::KeywordExtractConfig;
111    ///    use std::collections::BTreeSet;
112    ///
113    ///    let populates_default = KeywordExtractConfig::builder()
114    ///        .add_stop_word("FakeWord".to_string())
115    ///        .build().unwrap();
116    ///
117    ///    assert!(populates_default.stop_words().contains("the"));
118    ///    assert!(populates_default.stop_words().contains("FakeWord"));
119    ///
120    ///    let multiple_adds_stack = KeywordExtractConfig::builder()
121    ///        .add_stop_word("FakeWord".to_string())
122    ///        .add_stop_word("MoarFakeWord".to_string())
123    ///        .build().unwrap();
124    ///
125    ///    assert!(multiple_adds_stack.stop_words().contains("the"));
126    ///    assert!(multiple_adds_stack.stop_words().contains("FakeWord"));
127    ///    assert!(multiple_adds_stack.stop_words().contains("MoarFakeWord"));
128    ///
129    ///    let no_default_if_set = KeywordExtractConfig::builder()
130    ///        .set_stop_words(BTreeSet::from(["boo".to_string()]))
131    ///        .add_stop_word("FakeWord".to_string())
132    ///        .build().unwrap();
133    ///
134    ///    assert!(!no_default_if_set.stop_words().contains("the"));
135    ///    assert!(no_default_if_set.stop_words().contains("boo"));
136    ///    assert!(no_default_if_set.stop_words().contains("FakeWord"));
137    /// ```
138    pub fn add_stop_word(&mut self, word: String) -> &mut Self {
139        if self.stop_words.is_none() {
140            self.stop_words = Some(self.default_stop_words().unwrap());
141        }
142        self.stop_words.as_mut().unwrap().insert(word);
143        self
144    }
145
146    /// Remove an existing stop word.
147    ///
148    /// # Examples
149    /// ```
150    ///    use jieba_rs::KeywordExtractConfig;
151    ///    use std::collections::BTreeSet;
152    ///
153    ///    let populates_default = KeywordExtractConfig::builder()
154    ///        .remove_stop_word("the")
155    ///        .build().unwrap();
156    ///
157    ///    assert!(!populates_default.stop_words().contains("the"));
158    ///    assert!(populates_default.stop_words().contains("of"));
159    ///
160    ///    let no_default_if_set = KeywordExtractConfig::builder()
161    ///        .set_stop_words(BTreeSet::from(["boo".to_string()]))
162    ///         // Removing non-existant word is okay.
163    ///        .remove_stop_word("the".to_string())
164    ///        .build().unwrap();
165    ///
166    ///    assert!(!no_default_if_set.stop_words().contains("the"));
167    ///    assert!(!no_default_if_set.stop_words().contains("of"));
168    ///    assert!(no_default_if_set.stop_words().contains("boo"));
169    /// ```
170    pub fn remove_stop_word(&mut self, word: impl AsRef<str>) -> &mut Self {
171        if self.stop_words.is_none() {
172            self.stop_words = Some(self.default_stop_words().unwrap());
173        }
174        self.stop_words.as_mut().unwrap().remove(word.as_ref());
175        self
176    }
177
178    /// Replace all stop words with new stop words set.
179    ///
180    /// # Examples
181    /// ```
182    ///    use jieba_rs::KeywordExtractConfig;
183    ///    use std::collections::BTreeSet;
184    ///
185    ///    let no_default_if_set = KeywordExtractConfig::builder()
186    ///        .set_stop_words(BTreeSet::from(["boo".to_string()]))
187    ///        .build().unwrap();
188    ///
189    ///    assert!(!no_default_if_set.stop_words().contains("the"));
190    ///    assert!(no_default_if_set.stop_words().contains("boo"));
191    ///
192    ///    let overwrites = KeywordExtractConfig::builder()
193    ///        .add_stop_word("FakeWord".to_string())
194    ///        .set_stop_words(BTreeSet::from(["boo".to_string()]))
195    ///        .build().unwrap();
196    ///
197    ///    assert!(!no_default_if_set.stop_words().contains("FakeWord"));
198    ///    assert!(no_default_if_set.stop_words().contains("boo"));
199    /// ```
200    pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) -> &mut Self {
201        self.stop_words = Some(stop_words);
202        self
203    }
204}
205
206impl Default for KeywordExtractConfig {
207    fn default() -> KeywordExtractConfig {
208        KeywordExtractConfigBuilder::default().build().unwrap()
209    }
210}
211
212/// Extracts keywords from a given sentence with the Jieba instance.
213pub trait KeywordExtract {
214    fn extract_keywords(&self, jieba: &Jieba, sentence: &str, top_k: usize, allowed_pos: Vec<String>) -> Vec<Keyword>;
215}