/// Stems an English word with the Porter stemming algorithm.
///
/// Words of three or more bytes are lower-cased and then run through the
/// stemming steps. Words shorter than three bytes are returned unchanged,
/// keeping their original case.
///
/// # Errors
///
/// Returns `Err` if `word` contains any non-ASCII character, or if the
/// stemmed bytes are not valid UTF-8.
pub fn porter_stem(word: &str) -> Result<String, &'static str> {
    if !word.is_ascii() {
        return Err("Porter Stemmer only accepts ASCII words");
    }
    if word.len() < 3 {
        return Ok(word.to_string());
    }

    // Hand the lower-cased bytes straight to the stemmer; no intermediate copy.
    let mut data = PorterData::new(word.to_ascii_lowercase().into_bytes());
    data.step_1ab();
    // The remaining steps read the byte before `k`, so they are skipped when
    // step 1ab has already reduced the word to a single byte.
    if data.k > 0 {
        data.step_1c();
        data.step_2();
        data.step_3();
        data.step_4();
        data.step_5();
    }

    // `k` is the index of the last byte that still belongs to the word;
    // everything after it is leftover from removed suffixes.
    data.b.truncate(data.k + 1);
    String::from_utf8(data.b).map_err(|_| "Error in decoding final word")
}
/// Working state for one run of the Porter stemming algorithm.
///
/// The word is edited in place: suffixes are removed by lowering `k` and
/// replaced by overwriting bytes after the stem.
struct PorterData {
    /// Bytes of the word. Only `b[..=k]` is the current word; vowel detection
    /// recognises lowercase letters only.
    b: Vec<u8>,
    /// Number of leading bytes of `b` that form the stem, i.e. everything in
    /// front of the suffix found by the last successful `ends_with`.
    /// Zero means the suffix was the whole word.
    stem_len: usize,
    /// Index of the last byte of the current word.
    k: usize,
}

impl PorterData {
    /// Wraps `b` as the word to stem. `b` must not be empty.
    fn new(b: Vec<u8>) -> PorterData {
        debug_assert!(!b.is_empty(), "PorterData needs at least one byte");
        let k = b.len() - 1;
        PorterData { b, stem_len: 0, k }
    }

    /// True when bytes `i - 1` and `i` are the same consonant.
    fn double_consonant(&self, i: usize) -> bool {
        i > 0 && self.b[i] == self.b[i - 1] && self.is_consonant(i)
    }

    /// Tests whether the current word `b[..=k]` ends with `suf`.
    ///
    /// On a match, `stem_len` is set to the number of bytes in front of the
    /// suffix (possibly zero). On a miss, `stem_len` is left untouched.
    fn ends_with(&mut self, suf: &str) -> bool {
        let word = &self.b[..=self.k];
        if word.ends_with(suf.as_bytes()) {
            self.stem_len = word.len() - suf.len();
            true
        } else {
            false
        }
    }

    /// True when byte `i` is a consonant. `y` counts as a consonant at the
    /// start of the word or directly after a vowel.
    fn is_consonant(&self, i: usize) -> bool {
        match self.b[i] {
            b'a' | b'e' | b'i' | b'o' | b'u' => false,
            b'y' => i == 0 || !self.is_consonant(i - 1),
            _ => true,
        }
    }

    /// True when bytes `i - 2 ..= i` are consonant, vowel, consonant and the
    /// final consonant is not `w`, `x` or `y`.
    fn is_cvc(&self, i: usize) -> bool {
        if i < 2
            || !self.is_consonant(i)
            || self.is_consonant(i - 1)
            || !self.is_consonant(i - 2)
        {
            return false;
        }
        let ch = self.b[i];
        ch != b'w' && ch != b'x' && ch != b'y'
    }

    /// Looks up the rule list stored under `letter` and applies the first
    /// rule whose suffix matches the current word.
    ///
    /// Only the first matching suffix is considered, even when its stem is
    /// too short for the replacement to be made; the tables list longer
    /// suffixes before the shorter ones they contain.
    fn make_replacements(&mut self, letter: u8, replacements: &[(u8, &[(&str, &str)])]) {
        let rules = match replacements.iter().find(|&&(key, _)| key == letter) {
            Some(&(_, rules)) => rules,
            None => return,
        };
        for &(suffix, replacement) in rules {
            if self.ends_with(suffix) {
                self.r_set_to(replacement);
                break;
            }
        }
    }

    /// Replaces the matched suffix with `suf`, but only when the stem
    /// contains at least one vowel-run followed by a consonant-run.
    fn r_set_to(&mut self, suf: &str) {
        if self.seq_count() > 0 {
            self.set_to(suf);
        }
    }

    /// Counts how many times a run of vowels is followed by a run of
    /// consonants inside the stem `b[..stem_len]`.
    fn seq_count(&self) -> usize {
        let end = self.stem_len;
        let mut count = 0;
        let mut i = 0;
        // Leading consonants do not contribute.
        while i < end && self.is_consonant(i) {
            i += 1;
        }
        while i < end {
            while i < end && !self.is_consonant(i) {
                i += 1;
            }
            // A consonant was reached after a vowel run.
            if i < end {
                count += 1;
            }
            while i < end && self.is_consonant(i) {
                i += 1;
            }
        }
        count
    }

    /// Overwrites the bytes right after the stem with `suf` and moves `k` to
    /// the last byte written. The stem and `suf` must not both be empty.
    fn set_to(&mut self, suf: &str) {
        let start = self.stem_len;
        let end = start + suf.len();
        debug_assert!(end > 0, "set_to would leave an empty word");
        self.b[start..end].copy_from_slice(suf.as_bytes());
        self.k = end - 1;
    }

    /// True when the stem `b[..stem_len]` contains a vowel.
    fn vowel_in_stem(&self) -> bool {
        (0..self.stem_len).any(|i| !self.is_consonant(i))
    }

    /// Step 1a/1b: strips plural endings and "ed" / "ing", then repairs the
    /// stem (e.g. "caresses" -> "caress", "hopping" -> "hop",
    /// "filing" -> "file"). Expects a word of at least three bytes.
    fn step_1ab(&mut self) {
        if self.b[self.k] == b's' {
            if self.ends_with("sses") {
                self.k -= 2;
            } else if self.ends_with("ies") {
                self.set_to("i");
            } else if self.b[self.k - 1] != b's' {
                self.k -= 1;
            }
        }
        if self.ends_with("eed") {
            if self.seq_count() > 0 {
                self.k -= 1;
            }
        } else if (self.ends_with("ed") || self.ends_with("ing")) && self.vowel_in_stem() {
            // A vowel in the stem guarantees `stem_len >= 1`.
            self.k = self.stem_len - 1;
            if self.ends_with("at") {
                self.set_to("ate");
            } else if self.ends_with("bl") {
                self.set_to("ble");
            } else if self.ends_with("iz") {
                self.set_to("ize");
            } else if self.double_consonant(self.k) {
                self.k -= 1;
                let ch = self.b[self.k];
                // Doubled l, s and z are kept.
                if ch == b'l' || ch == b's' || ch == b'z' {
                    self.k += 1;
                }
            } else if self.seq_count() == 1 && self.is_cvc(self.k) {
                // `stem_len` still marks the end of the "ed"/"ing" stem here,
                // so this appends an "e" right after it.
                self.set_to("e");
            }
        }
    }

    /// Step 1c: turns a final `y` into `i` when the stem has a vowel.
    fn step_1c(&mut self) {
        if self.ends_with("y") && self.vowel_in_stem() {
            self.b[self.k] = b'i';
        }
    }

    /// Step 2: applies `STEP_2_RULES`, keyed on the second-to-last byte.
    /// Requires `k > 0`.
    fn step_2(&mut self) {
        let key = self.b[self.k - 1];
        self.make_replacements(key, STEP_2_RULES);
    }

    /// Step 3: applies `STEP_3_RULES`, keyed on the last byte.
    fn step_3(&mut self) {
        let key = self.b[self.k];
        self.make_replacements(key, STEP_3_RULES);
    }

    /// Step 4: drops one of a fixed set of suffixes when the remaining stem
    /// has more than one vowel-run/consonant-run pair. Requires `k > 0`.
    fn step_4(&mut self) {
        // `||` short-circuits, so `stem_len` belongs to the first suffix that
        // matched; longer suffixes are listed before the ones they contain.
        let matched = match self.b[self.k - 1] {
            b'a' => self.ends_with("al"),
            b'c' => self.ends_with("ance") || self.ends_with("ence"),
            b'e' => self.ends_with("er"),
            b'i' => self.ends_with("ic"),
            b'l' => self.ends_with("able") || self.ends_with("ible"),
            b'n' => {
                self.ends_with("ant")
                    || self.ends_with("ement")
                    || self.ends_with("ment")
                    || self.ends_with("ent")
            }
            b'o' => {
                // "ion" only counts when the stem is non-empty and ends in
                // `s` or `t`.
                let ion = self.ends_with("ion") && self.stem_len > 0 && {
                    let prev = self.b[self.stem_len - 1];
                    prev == b's' || prev == b't'
                };
                ion || self.ends_with("ou")
            }
            b's' => self.ends_with("ism"),
            b't' => self.ends_with("ate") || self.ends_with("iti"),
            b'u' => self.ends_with("ous"),
            b'v' => self.ends_with("ive"),
            b'z' => self.ends_with("ize"),
            _ => false,
        };
        if matched && self.seq_count() > 1 {
            // A count above one implies a stem of several bytes.
            self.k = self.stem_len - 1;
        }
    }

    /// Step 5: removes a final `e` and reduces a final `ll` to `l`, each
    /// subject to a count measured over the whole word as it stood on entry.
    fn step_5(&mut self) {
        self.stem_len = self.k + 1;
        if self.b[self.k] == b'e' {
            let count = self.seq_count();
            if count > 1 || (count == 1 && !self.is_cvc(self.k - 1)) {
                self.k -= 1;
            }
        }
        // `stem_len` is deliberately not refreshed after the `e` removal.
        if self.b[self.k] == b'l' && self.double_consonant(self.k) && self.seq_count() > 1 {
            self.k -= 1;
        }
    }
}

/// Step 2 rules as `(key byte, [(suffix, replacement)])`, keyed on the
/// second-to-last byte of the word. Within a key, a suffix that contains
/// another is listed first.
const STEP_2_RULES: &[(u8, &[(&str, &str)])] = &[
    (b'a', &[("ational", "ate"), ("tional", "tion")]),
    (b'c', &[("enci", "ence"), ("anci", "ance")]),
    (b'e', &[("izer", "ize")]),
    (b'g', &[("logi", "log")]),
    (
        b'l',
        &[
            ("bli", "ble"),
            ("alli", "al"),
            ("entli", "ent"),
            ("eli", "e"),
            ("ousli", "ous"),
        ],
    ),
    (b'o', &[("ization", "ize"), ("ation", "ate"), ("ator", "ate")]),
    (
        b's',
        &[
            ("alism", "al"),
            ("iveness", "ive"),
            ("fulness", "ful"),
            ("ousness", "ous"),
        ],
    ),
    (b't', &[("aliti", "al"), ("iviti", "ive"), ("biliti", "ble")]),
];

/// Step 3 rules as `(key byte, [(suffix, replacement)])`, keyed on the last
/// byte of the word.
const STEP_3_RULES: &[(u8, &[(&str, &str)])] = &[
    (b'e', &[("icate", "ic"), ("ative", ""), ("alize", "al")]),
    (b'i', &[("iciti", "ic")]),
    (b'l', &[("ical", "ic"), ("ful", "")]),
    (b's', &[("ness", "")]),
];