yake_rust/
stopwords.rs

1use std::ops::{Deref, DerefMut};
2
3use crate::LTerm;
4
5/// Contains words to be filtered out from the resulting set.
6///
7/// The list is used to mark potentially meaningless tokens and generally based on the _language_
8/// given as input.
9///
10/// Tokens with fewer than three characters are also considered a stopword.
11#[derive(Debug, Default, Clone, Eq, PartialEq)]
12pub struct StopWords {
13    set: hashbrown::HashSet<LTerm>,
14}
15
16impl StopWords {
17    /// Use the passed set of lowercased strings as stopwords.
18    pub fn custom(lowercased: std::collections::HashSet<LTerm>) -> Self {
19        Self::from(lowercased)
20    }
21
22    /// Load a predefined list of stopwords for the language given as argument.
23    ///
24    /// The argument is a [ISO 639 two-letter code](https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes).
25    /// See the [isolang](https://docs.rs/isolang/latest/isolang/index.html) crate.
26    pub fn predefined(lang_iso_639_2: &str) -> Option<Self> {
27        // https://github.com/LIAAD/yake/tree/0fa58cceb465162b6bd0cab7ec967edeb907fbcc/yake/StopwordsList
28        // files were taken from the original repository, with extra modifications:
29        // - add extra line at the end
30        // - fix encoding, convert to utf8
31        // - switch from CRLF to LF
32        let file = match lang_iso_639_2 {
33            #[cfg(feature = "ar")]
34            "ar" => include_str!("stopwords/ar.txt"),
35            #[cfg(feature = "bg")]
36            "bg" => include_str!("stopwords/bg.txt"),
37            #[cfg(feature = "br")]
38            "br" => include_str!("stopwords/br.txt"),
39            #[cfg(feature = "cz")]
40            "cz" => include_str!("stopwords/cz.txt"),
41            #[cfg(feature = "da")]
42            "da" => include_str!("stopwords/da.txt"),
43            #[cfg(feature = "de")]
44            "de" => include_str!("stopwords/de.txt"),
45            #[cfg(feature = "el")]
46            "el" => include_str!("stopwords/el.txt"),
47            #[cfg(feature = "en")]
48            "en" => include_str!("stopwords/en.txt"),
49            #[cfg(feature = "es")]
50            "es" => include_str!("stopwords/es.txt"),
51            #[cfg(feature = "et")]
52            "et" => include_str!("stopwords/et.txt"),
53            #[cfg(feature = "fa")]
54            "fa" => include_str!("stopwords/fa.txt"),
55            #[cfg(feature = "fi")]
56            "fi" => include_str!("stopwords/fi.txt"),
57            #[cfg(feature = "fr")]
58            "fr" => include_str!("stopwords/fr.txt"),
59            #[cfg(feature = "hi")]
60            "hi" => include_str!("stopwords/hi.txt"),
61            #[cfg(feature = "hr")]
62            "hr" => include_str!("stopwords/hr.txt"),
63            #[cfg(feature = "hu")]
64            "hu" => include_str!("stopwords/hu.txt"),
65            #[cfg(feature = "hy")]
66            "hy" => include_str!("stopwords/hy.txt"),
67            #[cfg(feature = "id")]
68            "id" => include_str!("stopwords/id.txt"),
69            #[cfg(feature = "it")]
70            "it" => include_str!("stopwords/it.txt"),
71            #[cfg(feature = "ja")]
72            "ja" => include_str!("stopwords/ja.txt"),
73            #[cfg(feature = "lt")]
74            "lt" => include_str!("stopwords/lt.txt"),
75            #[cfg(feature = "lv")]
76            "lv" => include_str!("stopwords/lv.txt"),
77            #[cfg(feature = "nl")]
78            "nl" => include_str!("stopwords/nl.txt"),
79            #[cfg(feature = "no")]
80            "no" => include_str!("stopwords/no.txt"),
81            #[cfg(feature = "pl")]
82            "pl" => include_str!("stopwords/pl.txt"),
83            #[cfg(feature = "pt")]
84            "pt" => include_str!("stopwords/pt.txt"),
85            #[cfg(feature = "ro")]
86            "ro" => include_str!("stopwords/ro.txt"),
87            #[cfg(feature = "ru")]
88            "ru" => include_str!("stopwords/ru.txt"),
89            #[cfg(feature = "sk")]
90            "sk" => include_str!("stopwords/sk.txt"),
91            #[cfg(feature = "sl")]
92            "sl" => include_str!("stopwords/sl.txt"),
93            #[cfg(feature = "sv")]
94            "sv" => include_str!("stopwords/sv.txt"),
95            #[cfg(feature = "tr")]
96            "tr" => include_str!("stopwords/tr.txt"),
97            #[cfg(feature = "uk")]
98            "uk" => include_str!("stopwords/uk.txt"),
99            #[cfg(feature = "zh")]
100            "zh" => include_str!("stopwords/zh.txt"),
101            _ => return None,
102        };
103
104        Some(Self { set: file.lines().map(ToOwned::to_owned).collect() })
105    }
106}
107
108impl From<hashbrown::HashSet<LTerm>> for StopWords {
109    fn from(lowercased: hashbrown::HashSet<LTerm>) -> Self {
110        Self { set: lowercased.into_iter().collect() }
111    }
112}
113
114impl From<std::collections::HashSet<LTerm>> for StopWords {
115    fn from(lowercased: std::collections::HashSet<LTerm>) -> Self {
116        Self { set: lowercased.into_iter().collect() }
117    }
118}
119
120impl Deref for StopWords {
121    type Target = hashbrown::HashSet<LTerm>;
122
123    fn deref(&self) -> &Self::Target {
124        &self.set
125    }
126}
127
128impl<T> AsRef<T> for StopWords
129where
130    T: ?Sized,
131    <StopWords as Deref>::Target: AsRef<T>,
132{
133    fn as_ref(&self) -> &T {
134        self.deref().as_ref()
135    }
136}
137
138impl DerefMut for StopWords {
139    fn deref_mut(&mut self) -> &mut Self::Target {
140        &mut self.set
141    }
142}