gecliht 0.2.0 - Docs.rs

//! Stemming algorithms are used to remove common endings from words 
//! to return a 'stem'. 
//!
//! Stemmers are popular in data mining/information retrieval applications
//! as they can make it easier to match texts by reducing the number of 
//! different word forms.





/// Porter stemming is a popular stemming algorithm used for English.
///
/// For a complete description, see: <https://tartarus.org/martin/PorterStemmer/>
///
/// Algorithm published in Porter, 1980, "An algorithm for suffix stripping," 
/// _Program_, Vol. 14, no. 3, pp 130-137.
///
/// # Examples
/// 
/// ```
/// assert_eq!(gecliht::porter_stem("industrialization").unwrap(), "industri");
/// assert_eq!(gecliht::porter_stem("word").unwrap(), "word");
/// assert_eq!(gecliht::porter_stem("ভুল"), Err("Porter Stemmer only accepts ASCII words"));
/// ```
///
/// # Errors
///
/// An error is raised if the word does not consist entirely of ASCII characters.
/// This is because the Porter stemmer is designed for stemming English words. 
///
pub fn porter_stem(word: &str) -> Result<String, &'static str> {

    if !word.is_ascii() {
        return Err("Porter Stemmer only accepts ASCII words");
    }
    if word.len() < 3 { // do not process short words
        return Ok(word.to_string());
    }
    // create the data structure
    let bytes = &word.to_ascii_lowercase().into_bytes ();
    let mut data = PorterData::new(bytes.to_vec());

    data.step_1ab();
    if data.k > 0 {
        data.step_1c();
        data.step_2();
        data.step_3();
        data.step_4();
        data.step_5();
    }

    // check we have a valid string at the end
    if let Ok(word) = String::from_utf8(data.b[..=data.k].to_vec()) {
        Ok(word)
    } else {
        Err("Error in decoding final word")
    }
}

struct PorterData {
    b: Vec<u8>, // array of chars representing the word we are stemming
    j: usize,   // an alternate 'end point' in the array, used, e.g., after searching to mark where a suffix begins
    k: usize,   // the end of the array
}

impl PorterData {

    fn new (b: Vec<u8>) -> PorterData {
        let k = b.len()-1;
        PorterData {b, j : 0, k}
    }

    // checks if there is a double consonant at position i,
    // with i being the second consonant's index
    fn double_consonant(&self, i: usize) -> bool { 
        i > 0 && self.b[i] == self.b[i-1] && self.is_consonant(i)
    }

    // suf is a suffix, and this fn returns true if the suffix ends the 
    // current set of bytes - k holds the last index in bytes.
    // sets j to the end point, if the suffix was removed
    fn ends_with (&mut self, suf: &str) -> bool  {
        if suf.len() <= self.k && 
            &self.b[(self.k-suf.len()+1)..self.k+1] == suf.as_bytes() {
                self.j = self.k-suf.len();
                true
            } else {
                false
            }
    }

    // checks if there is a consonant at position i
    // -- note special handling of 'y', which is treated
    // as a consonant at start of word or after a vowel.
    fn is_consonant(&self, i: usize) -> bool {
        match self.b[i] {
            b'a' | b'e' | b'i' | b'o' | b'u' => false,
            b'y' => {
                i == 0 || !self.is_consonant(i-1)
            },
            _ => true
        }
    }

    // cvc is true if i-2, i-1, i has form consonant, vowel, consonant
    // (and last must not be w, x, or y).
    fn is_cvc(&self, i: usize) -> bool {
        if i < 2 || 
            !self.is_consonant(i) || 
                self.is_consonant(i-1) || 
                !self.is_consonant(i-2) {
                    return false;
                }
        let ch = self.b[i];
        ch != b'w' && ch != b'x' && ch != b'y'
    }

    // Given a set of letter=>[(suffix, replacement)] pairs and a letter to match,
    // applies replacements for the given letter
    fn make_replacements(&mut self, letter: u8, replacements: &[(u8, &[(&str, &str)])]) {
        for (key, rules) in replacements.iter() {
            if letter == *key {
                for (suffix, replacement) in rules.iter() {
                    if self.ends_with(suffix) {
                        self.r_set_to(replacement);
                    }
                }
                break;
            }
        }
    }

    // restricted version of set_to
    // -- only calls set_to if seq_count is greater than 0
    // to prevent words becoming too small when suffix changed
    fn r_set_to(&mut self, suf: &str) {
        if self.seq_count() > 0 {
            self.set_to(suf);
        }
    }

    // counts the number of vowel-consonant sequences between 0 and j
    fn seq_count(&self) -> usize {
        let mut count = 0;
        let mut i = 0;
        // find index of first vowel, by skipping consonants
        while i <= self.j && self.is_consonant(i) {
            i += 1;
        }
        // repeat CV search to get full count until all of range is seen
        while i <= self.j { 
            // find index of next consonant, by skipping non-consonants
            while i <= self.j && !self.is_consonant(i) {
                i += 1;
            }
            // VC pair found with more to come, so increment count
            if i <= self.j { 
                count += 1;
            }
            // find index of next vowel, by skipping consonants
            while i <= self.j && self.is_consonant(i) {
                i += 1;
            }
        }

        count
    }

    // sets positions j+1 to k to the suffix, 
    // adjusting k to j+suf.len() (the new end of the word)
    fn set_to(&mut self, suf: &str) {
        suf.as_bytes()
            .iter()
            .enumerate()
            .for_each(|(i, c)| self.b[self.j+i+1] = *c);

        self.k = self.j + suf.len();
    }

    // looks for a vowel in range 0 to j in bytes
    fn vowel_in_stem(&self) -> bool {
        (0..=self.j).any(|i| !self.is_consonant(i))
    }

    // removes plurals and -ed -ing endings
    fn step_1ab (&mut self) {
        if self.b[self.k] == b's' {
            if self.ends_with("sses") {
                self.k -= 2;
            } else if self.ends_with("ies") {
                self.set_to("i");
            } else if self.b[self.k-1] != b's' {
                self.k -= 1;
            }
        }
        if self.ends_with("eed") {
            if self.seq_count() > 0 {
                self.k -= 1;
            } 
        } else if (self.ends_with("ed") || self.ends_with("ing")) && self.vowel_in_stem() {
            self.k = self.j;
            if self.ends_with("at") {
                self.set_to("ate");
            } else if self.ends_with("bl") {
                self.set_to("ble");
            } else if self.ends_with("iz") {
                self.set_to("ize");
            } else if self.double_consonant(self.k) {
                self.k -= 1;
                let ch = self.b[self.k];
                if ch == b'l' || ch == b's' || ch == b'z' {
                    self.k += 1;
                }
            } else if self.seq_count() == 1 && self.is_cvc(self.k) {
                self.set_to("e");
            }
        }
    }

    // turns terminal 'y' to 'i' when there is another vowel in stem
    fn step_1c (&mut self) {
        if self.ends_with("y") && self.vowel_in_stem() {
            self.b[self.k] = b'i';
        }
    }

    // double suffixes are mapped to single (shorter) ones
    fn step_2 (&mut self) {
        self.make_replacements(self.b[self.k-1], &STEP_2_RULES);
    }

    // deals with -ic -full -ness
    fn step_3 (&mut self) {
        self.make_replacements(self.b[self.k], &STEP_3_RULES);
    }

    // removes -ant -ence when seq_count is 2
    fn step_4 (&mut self) {
        match self.b[self.k-1] { // penultimate letter
            b'a' => {
                if !self.ends_with("al") { return; }
            },
            b'c' => {
                if !self.ends_with("ance") && !self.ends_with("ence") { return; }
            },
            b'e' => {
                if !self.ends_with("er") { return; }
            },
            b'i' => {
                if !self.ends_with("ic") { return; }
            },
            b'l' => {
                if !self.ends_with("able") && !self.ends_with("ible") { return; }
            },
            b'n' => {
                if !self.ends_with("ant") && !self.ends_with("ement") && 
                    !self.ends_with("ment") && !self.ends_with("ent") { return; }
            },
            b'o' => {
                if self.ends_with("ion") && 
                    (self.b[self.j] == b's' || self.b[self.j] == b't') {
                        // break;
                    } else if !self.ends_with("ou") { 
                        return; 
                    }
            },
            b's' => {
                if !self.ends_with("ism") { return; }
            },
            b't' => {
                if !self.ends_with("ate") && !self.ends_with("iti") { return; }
            },
            b'u' => {
                if !self.ends_with("ous") { return; }
            },
            b'v' => {
                if !self.ends_with("ive") { return; }
            },
            b'z' => {
                if !self.ends_with("ize") { return; }
            },
            _ => return,
        }
        if self.seq_count() > 1 {
            self.k = self.j;
        }
    }

    // if seq_count > 1, removes a final -e and changes -ll to -l
    fn step_5 (&mut self) {
        self.j = self.k;
        // remove final -e
        if self.b[self.k] == b'e' {
            let count = self.seq_count();
            if count > 1 || (count == 1 && !self.is_cvc(self.k-1)) {
                self.k -= 1;
            }
        }
        // -ll to -l
        if self.b[self.k] == b'l' && self.double_consonant(self.k) && self.seq_count() > 1 {
            self.k -= 1;
        }
    }
}

const STEP_2_RULES: &'static [(u8, &'static [(&'static str, &'static str)])] = &[
    (b'a', &[("ational", "ate"), ("tional", "tion")]),
    (b'c', &[("enci", "ence"), ("anci", "ance")]),
    (b'e', &[("izer", "ize")]),
    (b'g', &[("logi", "log")]),
    (b'l', &[("bli", "ble"), ("alli", "al"), ("entli", "ent"), ("eli", "e"), ("ousli", "ous")]),
    (b'o', &[("ization", "ize"), ("ation", "ate"), ("ator", "ate")]),
    (b's', &[("alism", "al"), ("iveness", "ive"), ("fulness", "ful"), ("ousness", "ous")]),
    (b't', &[("aliti", "al"), ("iviti", "ive"), ("biliti", "ble")]),
];

const STEP_3_RULES: &'static [(u8, &'static [(&'static str, &'static str)])] = &[
    (b'e', &[("icate", "ic"), ("ative", ""), ("alize", "al")]),
    (b'i', &[("iciti", "ic")]),
    (b'l', &[("ical", "ic"), ("ful", "")]),
    (b's', &[("ness", "")]),
];