wordcut_engine/
edge_builders.rs

1use crate::dict::Dict;
2use crate::edge::{Edge, EdgeType};
3use crate::text_range::TextRange;
4use regex_automata::meta::Regex;
5use std::iter::Peekable;
6
7pub trait EdgeBuilder {
8    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge>;
9}
10
11#[derive(Debug)]
12pub struct EdgeBuildingContext<'a> {
13    pub text: &'a [char],
14    pub i: usize,
15    pub ch: char,
16    pub left_boundary: usize,
17    pub best_edge: Option<Edge>,
18}
19
20pub struct UnkEdgeBuilder {}
21
22impl UnkEdgeBuilder {
23    pub fn new() -> UnkEdgeBuilder {
24        UnkEdgeBuilder {}
25    }
26}
27
28impl EdgeBuilder for UnkEdgeBuilder {
29    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
30        if context.best_edge.is_some() {
31            return None;
32        }
33
34        let source = path[context.left_boundary];
35        Some(Edge {
36            p: context.left_boundary,
37            etype: EdgeType::Unk,
38            unk: source.unk + 1,
39            w: source.w + 1,
40        })
41    }
42}
43
44#[derive(Clone)]
45pub struct Pointer {
46    pub node_id: usize,
47    pub s: usize,
48    pub offset: usize,
49    pub is_final: bool,
50}
51
52impl Pointer {
53    fn update(&mut self, dict: &Dict, ch: char) -> bool {
54        match dict.seek(&(self.node_id as u32, self.offset as u32, ch)) {
55            None => false,
56            Some(&(child_id, is_final, _)) => {
57                self.node_id = child_id as usize;
58                self.is_final = is_final;
59                self.offset += 1;
60                true
61            }
62        }
63    }
64
65    fn gen_edge(&self, path: &[Edge]) -> Edge {
66        let source = path[self.s];
67        Edge {
68            etype: EdgeType::Dict,
69            p: self.s,
70            w: source.w + 1,
71            unk: source.unk,
72        }
73    }
74}
75
76pub struct DictEdgeBuilder<'a> {
77    pub dict: &'a Dict,
78    pub pointers: Vec<Pointer>,
79}
80
81impl<'a> DictEdgeBuilder<'a> {
82    pub fn new(dict: &Dict) -> DictEdgeBuilder {
83        const MAX_SIZE: usize = 0xFF;
84        DictEdgeBuilder {
85            dict,
86            pointers: Vec::with_capacity(MAX_SIZE),
87        }
88    }
89
90    pub fn add_pointer(&mut self, context: &EdgeBuildingContext) {
91        self.pointers.push(Pointer {
92            node_id: 0,
93            offset: 0,
94            is_final: false,
95            s: context.i,
96        });
97    }
98
99    pub fn update_pointers(&mut self, context: &EdgeBuildingContext) {
100        let mut j = 0;
101        for i in 0..self.pointers.len() {
102            let valid = self.pointers[i].update(self.dict, context.ch);
103            if valid {
104                if j < i {
105                    self.pointers[j] = self.pointers[i].clone()
106                }
107                j += 1
108            }
109        }
110        self.pointers.truncate(j);
111    }
112
113    fn gen_edge(&self, pointers: &[Pointer], path: &[Edge]) -> Option<Edge> {
114        let mut best_edge: Option<Edge> = None;
115        for pointer in pointers {
116            if pointer.is_final {
117                let edge = pointer.gen_edge(path);
118                if best_edge.is_none() {
119                    best_edge = Some(edge)
120                } else if edge.better_than(&best_edge.unwrap()) {
121                    best_edge = Some(edge)
122                }
123            }
124        }
125        best_edge
126    }
127}
128
129impl<'a> EdgeBuilder for DictEdgeBuilder<'a> {
130    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
131        self.add_pointer(context);
132        self.update_pointers(context);
133        self.gen_edge(&self.pointers, path)
134    }
135}
136
137pub struct RuleBasedEdgeBuilder {
138    range_peekable: Peekable<std::vec::IntoIter<TextRange>>,
139}
140
141impl RuleBasedEdgeBuilder {
142    pub fn new(byte_to_char_idx_map: &[usize], text: &str, re: &Regex) -> Self {
143        let mut ranges = vec![];
144        for m in re.find_iter(text.as_bytes()) {
145            let ms = m.start();
146            let me = m.end();
147            let s = byte_to_char_idx_map[ms];
148            let e = byte_to_char_idx_map[me];
149            ranges.push(TextRange { s, e });
150        }
151        RuleBasedEdgeBuilder {
152            range_peekable: ranges.into_iter().peekable(),
153        }
154    }
155}
156
157impl EdgeBuilder for RuleBasedEdgeBuilder {
158    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
159        loop {
160            if let Some(r) = self.range_peekable.peek() {
161                if context.i >= r.e {
162                    self.range_peekable.next();
163                } else {
164                    break;
165                }
166            } else {
167                return None;
168            }
169        }
170        if let Some(r) = self.range_peekable.peek() {
171            if r.e != context.i + 1 {
172                return None;
173            }
174            let source = path[r.s];
175            Some(Edge {
176                etype: EdgeType::Pat,
177                p: r.s,
178                w: source.w + 1,
179                unk: source.unk,
180            })
181        } else {
182            None
183        }
184    }
185}