wordcut_engine/
edge_builders.rs1use crate::dict::Dict;
2use crate::edge::{Edge, EdgeType};
3use crate::text_range::TextRange;
4use regex_automata::meta::Regex;
5use std::iter::Peekable;
6
7pub trait EdgeBuilder {
8 fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge>;
9}
10
11#[derive(Debug)]
12pub struct EdgeBuildingContext<'a> {
13 pub text: &'a [char],
14 pub i: usize,
15 pub ch: char,
16 pub left_boundary: usize,
17 pub best_edge: Option<Edge>,
18}
19
20pub struct UnkEdgeBuilder {}
21
22impl UnkEdgeBuilder {
23 pub fn new() -> UnkEdgeBuilder {
24 UnkEdgeBuilder {}
25 }
26}
27
28impl EdgeBuilder for UnkEdgeBuilder {
29 fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
30 if context.best_edge.is_some() {
31 return None;
32 }
33
34 let source = path[context.left_boundary];
35 Some(Edge {
36 p: context.left_boundary,
37 etype: EdgeType::Unk,
38 unk: source.unk + 1,
39 w: source.w + 1,
40 })
41 }
42}
43
44#[derive(Clone)]
45pub struct Pointer {
46 pub node_id: usize,
47 pub s: usize,
48 pub offset: usize,
49 pub is_final: bool,
50}
51
52impl Pointer {
53 fn update(&mut self, dict: &Dict, ch: char) -> bool {
54 match dict.seek(&(self.node_id as u32, self.offset as u32, ch)) {
55 None => false,
56 Some(&(child_id, is_final, _)) => {
57 self.node_id = child_id as usize;
58 self.is_final = is_final;
59 self.offset += 1;
60 true
61 }
62 }
63 }
64
65 fn gen_edge(&self, path: &[Edge]) -> Edge {
66 let source = path[self.s];
67 Edge {
68 etype: EdgeType::Dict,
69 p: self.s,
70 w: source.w + 1,
71 unk: source.unk,
72 }
73 }
74}
75
76pub struct DictEdgeBuilder<'a> {
77 pub dict: &'a Dict,
78 pub pointers: Vec<Pointer>,
79}
80
81impl<'a> DictEdgeBuilder<'a> {
82 pub fn new(dict: &Dict) -> DictEdgeBuilder {
83 const MAX_SIZE: usize = 0xFF;
84 DictEdgeBuilder {
85 dict,
86 pointers: Vec::with_capacity(MAX_SIZE),
87 }
88 }
89
90 pub fn add_pointer(&mut self, context: &EdgeBuildingContext) {
91 self.pointers.push(Pointer {
92 node_id: 0,
93 offset: 0,
94 is_final: false,
95 s: context.i,
96 });
97 }
98
99 pub fn update_pointers(&mut self, context: &EdgeBuildingContext) {
100 let mut j = 0;
101 for i in 0..self.pointers.len() {
102 let valid = self.pointers[i].update(self.dict, context.ch);
103 if valid {
104 if j < i {
105 self.pointers[j] = self.pointers[i].clone()
106 }
107 j += 1
108 }
109 }
110 self.pointers.truncate(j);
111 }
112
113 fn gen_edge(&self, pointers: &[Pointer], path: &[Edge]) -> Option<Edge> {
114 let mut best_edge: Option<Edge> = None;
115 for pointer in pointers {
116 if pointer.is_final {
117 let edge = pointer.gen_edge(path);
118 if best_edge.is_none() {
119 best_edge = Some(edge)
120 } else if edge.better_than(&best_edge.unwrap()) {
121 best_edge = Some(edge)
122 }
123 }
124 }
125 best_edge
126 }
127}
128
129impl<'a> EdgeBuilder for DictEdgeBuilder<'a> {
130 fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
131 self.add_pointer(context);
132 self.update_pointers(context);
133 self.gen_edge(&self.pointers, path)
134 }
135}
136
137pub struct RuleBasedEdgeBuilder {
138 range_peekable: Peekable<std::vec::IntoIter<TextRange>>,
139}
140
141impl RuleBasedEdgeBuilder {
142 pub fn new(byte_to_char_idx_map: &[usize], text: &str, re: &Regex) -> Self {
143 let mut ranges = vec![];
144 for m in re.find_iter(text.as_bytes()) {
145 let ms = m.start();
146 let me = m.end();
147 let s = byte_to_char_idx_map[ms];
148 let e = byte_to_char_idx_map[me];
149 ranges.push(TextRange { s, e });
150 }
151 RuleBasedEdgeBuilder {
152 range_peekable: ranges.into_iter().peekable(),
153 }
154 }
155}
156
157impl EdgeBuilder for RuleBasedEdgeBuilder {
158 fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
159 loop {
160 if let Some(r) = self.range_peekable.peek() {
161 if context.i >= r.e {
162 self.range_peekable.next();
163 } else {
164 break;
165 }
166 } else {
167 return None;
168 }
169 }
170 if let Some(r) = self.range_peekable.peek() {
171 if r.e != context.i + 1 {
172 return None;
173 }
174 let source = path[r.s];
175 Some(Edge {
176 etype: EdgeType::Pat,
177 p: r.s,
178 w: source.w + 1,
179 unk: source.unk,
180 })
181 } else {
182 None
183 }
184 }
185}