wordcut-engine 1.2.2

Word segmentation/breaking library
Documentation
use crate::dict::Dict;
use crate::edge::{Edge, EdgeType};
use crate::text_range::TextRange;
use regex_automata::meta::Regex;
use std::iter::Peekable;

/// Interface for components that propose an [`Edge`] for the position described
/// by an [`EdgeBuildingContext`].
pub trait EdgeBuilder {
    /// Returns the edge this builder proposes for `context`, or `None` when it
    /// has nothing to propose.
    ///
    /// `path` holds previously built edges; the implementations in this file
    /// index into it to read the `w` and `unk` values of the new edge's source.
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge>;
}

/// Per-position state handed to every [`EdgeBuilder::build`] call.
#[derive(Debug)]
pub struct EdgeBuildingContext<'a> {
    /// The text being processed, as a slice of characters.
    pub text: &'a [char],
    /// Current position. Dictionary pointers created at this step start here,
    /// and rule-based ranges are compared against it.
    pub i: usize,
    /// Character fed to the dictionary for this step
    /// (presumably `text[i]` — TODO confirm against the caller).
    pub ch: char,
    /// Index into the edge path used as the source of an unknown-word edge.
    pub left_boundary: usize,
    /// Best edge found so far for this position, if any; when it is `Some`,
    /// `UnkEdgeBuilder` proposes nothing.
    pub best_edge: Option<Edge>,
}

/// Stateless edge builder for unknown words.
///
/// Its [`EdgeBuilder`] implementation proposes an `EdgeType::Unk` edge only
/// when the context carries no best edge yet.
#[derive(Debug, Default)]
pub struct UnkEdgeBuilder {}

impl UnkEdgeBuilder {
    /// Creates a new builder; equivalent to [`UnkEdgeBuilder::default`].
    pub fn new() -> UnkEdgeBuilder {
        UnkEdgeBuilder {}
    }
}

impl EdgeBuilder for UnkEdgeBuilder {
    /// Proposes an unknown-word edge, but only when no better edge exists yet.
    ///
    /// The edge starts at `context.left_boundary`; both its word count `w` and
    /// its unknown count `unk` are one more than those of the edge stored at
    /// that index in `path`.
    ///
    /// # Panics
    ///
    /// Panics if `context.left_boundary` is out of bounds for `path`.
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
        match context.best_edge {
            // Some other builder already produced an edge; stay out of the way.
            Some(_) => None,
            None => {
                let start = context.left_boundary;
                let prev = path[start];
                Some(Edge {
                    p: start,
                    etype: EdgeType::Unk,
                    unk: prev.unk + 1,
                    w: prev.w + 1,
                })
            }
        }
    }
}

/// Cursor tracking one in-progress dictionary match.
#[derive(Clone)]
pub struct Pointer {
    /// Current dictionary node; passed to `Dict::seek` and replaced by the
    /// returned child id after every successful step.
    pub node_id: usize,
    /// Index into the edge path where this match started; becomes the source
    /// (`p`) of any edge generated from this pointer.
    pub s: usize,
    /// Number of successful steps taken so far; also passed to `Dict::seek`.
    pub offset: usize,
    /// Final flag reported by the dictionary for the most recent step.
    pub is_final: bool,
}

impl Pointer {
    /// Tries to advance this pointer by `ch`.
    ///
    /// Looks up `(node_id, offset, ch)` in `dict`. On a hit the pointer moves
    /// to the child node, records that node's final flag, bumps `offset`, and
    /// `true` is returned. On a miss the pointer is left untouched and `false`
    /// is returned.
    fn update(&mut self, dict: &Dict, ch: char) -> bool {
        let key = (self.node_id as u32, self.offset as u32, ch);
        if let Some(&(child_id, is_final, _)) = dict.seek(&key) {
            self.node_id = child_id as usize;
            self.is_final = is_final;
            self.offset += 1;
            true
        } else {
            false
        }
    }

    /// Builds the dictionary edge that starts where this pointer started.
    ///
    /// The edge has one more word than the edge at `path[self.s]` and the same
    /// number of unknown words.
    fn gen_edge(&self, path: &[Edge]) -> Edge {
        let start = self.s;
        let prev = path[start];
        Edge {
            etype: EdgeType::Dict,
            p: start,
            w: prev.w + 1,
            unk: prev.unk,
        }
    }
}

/// Edge builder backed by a dictionary.
///
/// It keeps one [`Pointer`] per dictionary match that is still in progress.
pub struct DictEdgeBuilder<'a> {
    /// Dictionary consulted when advancing pointers.
    pub dict: &'a Dict,
    /// Matches that are still alive after the most recent update.
    pub pointers: Vec<Pointer>,
}

impl<'a> DictEdgeBuilder<'a> {
    /// Creates a builder over `dict` with no active pointers.
    ///
    /// Room for `MAX_SIZE` (255) pointers is reserved up front. This is only an
    /// initial capacity; the vector can still grow beyond it.
    pub fn new(dict: &Dict) -> DictEdgeBuilder<'_> {
        const MAX_SIZE: usize = 0xFF;
        DictEdgeBuilder {
            dict,
            pointers: Vec::with_capacity(MAX_SIZE),
        }
    }

    /// Starts a new dictionary match at the current position `context.i`,
    /// beginning at node 0 with nothing consumed yet.
    pub fn add_pointer(&mut self, context: &EdgeBuildingContext) {
        self.pointers.push(Pointer {
            node_id: 0,
            offset: 0,
            is_final: false,
            s: context.i,
        });
    }

    /// Advances every pointer by `context.ch` and drops the ones for which the
    /// dictionary has no matching transition.
    ///
    /// Surviving pointers keep their relative order.
    pub fn update_pointers(&mut self, context: &EdgeBuildingContext) {
        let dict = self.dict;
        let ch = context.ch;
        // `retain_mut` visits each pointer exactly once, in order, and compacts
        // the vector in place without cloning the survivors.
        self.pointers.retain_mut(|pointer| pointer.update(dict, ch));
    }

    /// Picks the preferred edge among the pointers whose `is_final` flag is set.
    ///
    /// Candidates are examined in slice order; an earlier candidate is replaced
    /// only when a later one reports `better_than` it. Returns `None` when no
    /// pointer is final.
    fn gen_edge(&self, pointers: &[Pointer], path: &[Edge]) -> Option<Edge> {
        let mut best_edge: Option<Edge> = None;
        for pointer in pointers.iter().filter(|pointer| pointer.is_final) {
            let edge = pointer.gen_edge(path);
            let improves = match &best_edge {
                None => true,
                Some(current) => edge.better_than(current),
            };
            if improves {
                best_edge = Some(edge);
            }
        }
        best_edge
    }
}

impl<'a> EdgeBuilder for DictEdgeBuilder<'a> {
    /// Proposes the best dictionary edge ending at the current position.
    ///
    /// The order of the three steps matters: a pointer starting at `context.i`
    /// is added first so that it is advanced by `context.ch` together with the
    /// older pointers, and only then are the surviving final pointers turned
    /// into candidate edges.
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
        self.add_pointer(context);
        self.update_pointers(context);
        self.gen_edge(&self.pointers, path)
    }
}

/// Edge builder driven by pre-computed regular-expression matches.
pub struct RuleBasedEdgeBuilder {
    // Match ranges (in character indices) that have not been passed yet,
    // in the order the regex produced them.
    range_peekable: Peekable<std::vec::IntoIter<TextRange>>,
}

impl RuleBasedEdgeBuilder {
    pub fn new(byte_to_char_idx_map: &[usize], text: &str, re: &Regex) -> Self {
        let mut ranges = vec![];
        for m in re.find_iter(text.as_bytes()) {
            let ms = m.start();
            let me = m.end();
            let s = byte_to_char_idx_map[ms];
            let e = byte_to_char_idx_map[me];
            ranges.push(TextRange { s, e });
        }
        RuleBasedEdgeBuilder {
            range_peekable: ranges.into_iter().peekable(),
        }
    }
}

impl EdgeBuilder for RuleBasedEdgeBuilder {
    /// Proposes a pattern edge when a regex match ends right after `context.i`.
    ///
    /// Ranges whose end is at or before the current position are discarded
    /// first. If the next remaining range ends at `context.i + 1`, an
    /// `EdgeType::Pat` edge from the range start is returned: one more word
    /// than the edge at `path[range.s]`, same unknown count. Otherwise, or when
    /// no ranges remain, `None` is returned.
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
        let pos = context.i;

        // Skip every range that has already been passed.
        while let Some(range) = self.range_peekable.peek() {
            if pos < range.e {
                break;
            }
            self.range_peekable.next();
        }

        let range = self.range_peekable.peek()?;
        if range.e != pos + 1 {
            return None;
        }

        let start = range.s;
        let prev = path[start];
        Some(Edge {
            etype: EdgeType::Pat,
            p: start,
            w: prev.w + 1,
            unk: prev.unk,
        })
    }
}