rust-stemmers 1.2.0

A rust implementation of some popular snowball stemming algorithms
Documentation
use std::borrow::Cow;
use snowball::Among;

#[derive(Debug, Clone)]
pub struct SnowballEnv<'a> {
    pub current: Cow<'a, str>,
    pub cursor: usize,
    pub limit: usize,
    pub limit_backward: usize,
    pub bra: usize,
    pub ket: usize,
}


impl<'a> SnowballEnv<'a> {
    pub fn create(value: &'a str) -> Self {
        let len = value.len();
        SnowballEnv {
            current: Cow::from(value),
            cursor: 0,
            limit: len,
            limit_backward: 0,
            bra: 0,
            ket: len,
        }
    }

    pub fn get_current(self) -> Cow<'a, str> {
        self.current
    }

    fn replace_s(&mut self, bra: usize, ket: usize, s: &str) -> i32 {
        let adjustment = s.len() as i32 - (ket as i32 - bra as i32);
        let mut result = String::with_capacity(self.current.len());
        {
            let (lhs, _) = self.current.split_at(bra);
            let (_, rhs) = self.current.split_at(ket);
            result.push_str(lhs);
            result.push_str(s);
            result.push_str(rhs);
        }
        // ... not very nice...
        let new_lim = self.limit as i32 + adjustment;
        self.limit = new_lim as usize;
        if self.cursor >= ket {
            let new_cur = self.cursor as i32 + adjustment;
            self.cursor = new_cur as usize;
        } else if self.cursor > bra {
            self.cursor = bra
        }
        self.current = Cow::from(result);
        adjustment
    }

    /// Check if s is after cursor.
    /// If so, move cursor to the end of s
    pub fn eq_s(&mut self, s: &str) -> bool {
        if self.cursor >= self.limit {
            return false;
        }
        if self.current[self.cursor..].starts_with(s) {
            self.cursor += s.len();
            while !self.current.is_char_boundary(self.cursor) {
                self.cursor += 1;
            }
            true
        } else {
            false
        }
    }

    /// Check if 's' is before cursor
    /// If so, move cursor to the beginning of s
    pub fn eq_s_b(&mut self, s: &str) -> bool {
        if (self.cursor as i32 - self.limit_backward as i32) < s.len() as i32 {
            false
            // Check if cursor -s.len is a char boundry. if not well... return false obv
        } else if !self.current.is_char_boundary(self.cursor - s.len()) ||
                  !self.current[self.cursor - s.len()..].starts_with(s) {
            false
        } else {
            self.cursor -= s.len();
            true
        }
    }

    /// Replace string between `bra` and `ket` with s
    pub fn slice_from(&mut self, s: &str) -> bool {
        let (bra, ket) = (self.bra, self.ket);
        self.replace_s(bra, ket, s);
        true
    }

    /// Move cursor to next charater
    pub fn next_char(&mut self) {
        self.cursor += 1;
        while !self.current.is_char_boundary(self.cursor) {
            self.cursor += 1;
        }
    }

    /// Move cursor to previous character
    pub fn previous_char(&mut self) {
        self.cursor -= 1;
        while !self.current.is_char_boundary(self.cursor) {
            self.cursor -= 1;
        }
    }

    pub fn byte_index_for_hop(&self, mut delta: i32) -> i32 {
        if delta > 0 {
            let mut res = self.cursor;
            while delta > 0 {
                res += 1;
                delta -= 1;
                while res <= self.current.len() && !self.current.is_char_boundary(res) {
                    res += 1;
                }
            }
            return res as i32;
        } else if delta < 0 {
            let mut res: i32 = self.cursor as i32;
            while delta < 0 {
                res -= 1;
                delta += 1;
                while res >= 0 && !self.current.is_char_boundary(res as usize) {
                    res -= 1;
                }
            }
            return res as i32;
        } else {
            return self.cursor as i32;
        }
    }

    // A grouping is represented by a minimum code point, a maximum code point,
    // and a bitfield of which code points in that range are in the grouping.
    // For example, in english.sbl, valid_LI is 'cdeghkmnrt'.
    // The minimum and maximum code points are 99 and 116,
    // so every time one of these grouping functions is called for g_valid_LI,
    // min must be 99 and max must be 116. There are 18 code points within that
    // range (inclusive) so the grouping is represented with 18 bits, plus 6 bits of padding:
    //
    // cdefghij klmnopqr st
    // 11101100 10110001 01000000
    //
    // The first bit is the least significant.
    // Those three bytes become &[0b00110111, 0b10001101, 0b00000010],
    // which is &[55, 141, 2], which is how g_valid_LI is defined in english.rs.
    /// Check if the char the cursor points to is in the grouping
    pub fn in_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
        if self.cursor >= self.limit {
            return false;
        }
        if let Some(chr) = self.current[self.cursor..].chars().next() {
            let mut ch = chr as u32; //codepoint as integer
            if ch > max || ch < min {
                return false;
            }
            ch -= min;
            if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
                return false;
            }
            self.next_char();
            return true;
        }
        return false;
    }

    pub fn in_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
        if self.cursor <= self.limit_backward {
            return false;
        }
        self.previous_char();
        if let Some(chr) = self.current[self.cursor..].chars().next() {
            let mut ch = chr as u32; //codepoint as integer
            self.next_char();
            if ch > max || ch < min {
                return false;
            }
            ch -= min;
            if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
                return false;
            }
            self.previous_char();
            return true;
        }
        return false;
    }

    pub fn out_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
        if self.cursor >= self.limit {
            return false;
        }
        if let Some(chr) = self.current[self.cursor..].chars().next() {
            let mut ch = chr as u32; //codepoint as integer
            if ch > max || ch < min {
                self.next_char();
                return true;
            }
            ch -= min;
            if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
                self.next_char();
                return true;
            }
        }
        return false;
    }

    pub fn out_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool {
        if self.cursor <= self.limit_backward {
            return false;
        }
        self.previous_char();
        if let Some(chr) = self.current[self.cursor..].chars().next() {
            let mut ch = chr as u32; //codepoint as integer
            self.next_char();
            if ch > max || ch < min {
                self.previous_char();
                return true;
            }
            ch -= min;
            if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 {
                self.previous_char();
                return true;
            }
        }
        return false;

    }


    /// Helper function that removes the string slice between `bra` and `ket`
    pub fn slice_del(&mut self) -> bool {
        self.slice_from("")
    }

    pub fn insert(&mut self, bra: usize, ket: usize, s: &str) {
        let adjustment = self.replace_s(bra, ket, s);
        if bra <= self.bra {
            self.bra = (self.bra as i32 + adjustment) as usize;
        }
        if bra <= self.ket {
            self.ket = (self.ket as i32 + adjustment) as usize;
        }
    }

    pub fn slice_to(&mut self) -> String {
        self.current[self.bra..self.ket].to_string()
    }

    pub fn find_among<T>(&mut self, amongs: &[Among<T>], context: &mut T) -> i32 {
        use std::cmp::min;
        let mut i: i32 = 0;
        let mut j: i32 = amongs.len() as i32;

        let c = self.cursor;
        let l = self.limit;

        let mut common_i = 0;
        let mut common_j = 0;

        let mut first_key_inspected = false;
        loop {
            let k = i + ((j - i) >> 1);
            let mut diff: i32 = 0;
            let mut common = min(common_i, common_j);
            let w = &amongs[k as usize];
            for lvar in common..w.0.len() {
                if c + common == l {
                    diff = -1;
                    break;
                }
                diff = self.current.as_bytes()[c + common] as i32 - w.0.as_bytes()[lvar] as i32;
                if diff != 0 {
                    break;
                }
                common += 1;
            }
            if diff < 0 {
                j = k;
                common_j = common;
            } else {
                i = k;
                common_i = common;
            }
            if j - i <= 1 {
                if i > 0 {
                    break;
                }
                if j == i {
                    break;
                }
                if first_key_inspected {
                    break;
                }
                first_key_inspected = true;
            }
        }

        loop {
            let w = &amongs[i as usize];
            if common_i >= w.0.len() {
                self.cursor = c + w.0.len();
                if let Some(ref method) = w.3 {
                    let res = method(self, context);
                    self.cursor = c + w.0.len();
                    if res {
                        return w.2;
                    }
                } else {
                    return w.2;
                }
            }
            i = w.1;
            if i < 0 {
                return 0;
            }
        }
    }

    pub fn find_among_b<T>(&mut self, amongs: &[Among<T>], context: &mut T) -> i32 {
        let mut i: i32 = 0;
        let mut j: i32 = amongs.len() as i32;

        let c = self.cursor;
        let lb = self.limit_backward;

        let mut common_i = 0;
        let mut common_j = 0;

        let mut first_key_inspected = false;

        loop {
            let k = i + ((j - i) >> 1);
            let mut diff: i32 = 0;
            let mut common = if common_i < common_j {
                common_i
            } else {
                common_j
            };
            let w = &amongs[k as usize];
            for lvar in (0..w.0.len() - common).rev() {
                if c - common == lb {
                    diff = -1;
                    break;
                }
                diff = self.current.as_bytes()[c - common - 1] as i32 - w.0.as_bytes()[lvar] as i32;
                if diff != 0 {
                    break;
                }
                common += 1;
            }
            if diff < 0 {
                j = k;
                common_j = common;
            } else {
                i = k;
                common_i = common;
            }
            if j - i <= 1 {
                if i > 0 {
                    break;
                }
                if j == i {
                    break;
                }
                if first_key_inspected {
                    break;
                }
                first_key_inspected = true;
            }
        }
        loop {
            let w = &amongs[i as usize];
            if common_i >= w.0.len() {
                self.cursor = c - w.0.len();
                if let Some(ref method) = w.3 {
                    let res = method(self, context);
                    self.cursor = c - w.0.len();
                    if res {
                        return w.2;
                    }
                } else {
                    return w.2;
                }
            }
            i = w.1;
            if i < 0 {
                return 0;
            }
        }
    }
}