use crate::dict::Dict;
use crate::edge::{Edge, EdgeType};
use crate::text_range::TextRange;
use regex_automata::meta::Regex;
use std::iter::Peekable;
/// A strategy that proposes a candidate [`Edge`] for the position described
/// by an [`EdgeBuildingContext`].
pub trait EdgeBuilder {
/// Attempts to build an edge for `context`, using `path` to look up the
/// edge a candidate starts from.
///
/// Returns `None` when this builder has no candidate for the position.
/// Takes `&mut self` so implementations may carry state between calls.
fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge>;
}
/// Per-position input handed to every [`EdgeBuilder::build`] call.
#[derive(Debug)]
pub struct EdgeBuildingContext<'a> {
/// The text being processed, as a slice of chars.
pub text: &'a [char],
/// Current position; presumably the index of `ch` within `text`
/// (maintained by the caller — confirm there).
pub i: usize,
/// Character fed to the builders at this position; presumably `text[i]`.
pub ch: char,
/// Index into the path from which unknown-word edges start.
pub left_boundary: usize,
/// Presumably the best candidate found so far for this position;
/// `UnkEdgeBuilder` only yields an edge when this is `None`.
pub best_edge: Option<Edge>,
}
/// Stateless builder for "unknown word" edges.
///
/// Implements `Default` so it can be created wherever a default-constructible
/// builder is expected; `UnkEdgeBuilder::new()` remains available.
#[derive(Default)]
pub struct UnkEdgeBuilder {}

impl UnkEdgeBuilder {
    /// Creates a new `UnkEdgeBuilder`.
    pub fn new() -> UnkEdgeBuilder {
        UnkEdgeBuilder {}
    }
}
impl EdgeBuilder for UnkEdgeBuilder {
    /// Yields an `EdgeType::Unk` edge starting at `context.left_boundary`, but
    /// only when `context.best_edge` holds no candidate.
    ///
    /// The new edge extends the edge stored at `path[context.left_boundary]`,
    /// adding one to both its `unk` counter and its `w`.
    ///
    /// # Panics
    ///
    /// Panics if `context.left_boundary` is out of bounds for `path`.
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
        match context.best_edge {
            // A candidate already exists, so no unknown edge is offered.
            Some(_) => None,
            None => {
                let start = context.left_boundary;
                let prev = &path[start];
                Some(Edge {
                    p: start,
                    etype: EdgeType::Unk,
                    unk: prev.unk + 1,
                    w: prev.w + 1,
                })
            }
        }
    }
}
/// Cursor tracking one in-progress dictionary match.
#[derive(Clone)]
pub struct Pointer {
/// Identifier of the dictionary node the cursor currently sits on.
pub node_id: usize,
/// Position at which this match started; used as the index into the path
/// and as the `p` of edges generated from this pointer.
pub s: usize,
/// Number of characters consumed so far (incremented on each successful update).
pub offset: usize,
/// Whether the most recent successful dictionary lookup flagged the node as final.
pub is_final: bool,
}
impl Pointer {
    /// Tries to advance this pointer along `ch` in `dict`.
    ///
    /// Looks up `(node_id, offset, ch)` via `dict.seek`. On a hit the pointer
    /// moves to the returned child node, adopts its finality flag, bumps
    /// `offset` by one and returns `true`. On a miss the pointer is left
    /// untouched and `false` is returned.
    fn update(&mut self, dict: &Dict, ch: char) -> bool {
        // The lookup key carries node id and offset as `u32`, hence the casts.
        let key = (self.node_id as u32, self.offset as u32, ch);
        if let Some(&(child_id, is_final, _)) = dict.seek(&key) {
            self.node_id = child_id as usize;
            self.is_final = is_final;
            self.offset += 1;
            true
        } else {
            false
        }
    }

    /// Builds the `EdgeType::Dict` edge for the match tracked by this pointer.
    ///
    /// The edge starts at `self.s` and extends the edge stored at
    /// `path[self.s]`: `w` grows by one, `unk` is carried over unchanged.
    ///
    /// # Panics
    ///
    /// Panics if `self.s` is out of bounds for `path`.
    fn gen_edge(&self, path: &[Edge]) -> Edge {
        let start = self.s;
        let prev = &path[start];
        Edge {
            p: start,
            etype: EdgeType::Dict,
            unk: prev.unk,
            w: prev.w + 1,
        }
    }
}
/// Edge builder that proposes edges for words found in a [`Dict`], tracking
/// every in-progress match with a [`Pointer`].
pub struct DictEdgeBuilder<'a> {
/// Dictionary queried when advancing the pointers.
pub dict: &'a Dict,
/// Pointers for the dictionary matches currently in progress.
pub pointers: Vec<Pointer>,
}
impl<'a> DictEdgeBuilder<'a> {
    /// Creates a builder over `dict` with no active pointers.
    pub fn new(dict: &Dict) -> DictEdgeBuilder<'_> {
        // Initial capacity only; the vector still grows if more pointers
        // are alive at the same time.
        const MAX_SIZE: usize = 0xFF;
        DictEdgeBuilder {
            dict,
            pointers: Vec::with_capacity(MAX_SIZE),
        }
    }

    /// Opens a fresh pointer anchored at `context.i`, starting from node 0
    /// with offset 0 (presumably the dictionary root) and not yet final.
    pub fn add_pointer(&mut self, context: &EdgeBuildingContext) {
        self.pointers.push(Pointer {
            node_id: 0,
            offset: 0,
            is_final: false,
            s: context.i,
        });
    }

    /// Feeds `context.ch` to every pointer and drops the ones that cannot
    /// advance; survivors keep their relative order.
    pub fn update_pointers(&mut self, context: &EdgeBuildingContext) {
        // Copy the shared reference and the char out first so the closure
        // does not need to borrow `self` while `self.pointers` is mutated.
        let dict = self.dict;
        let ch = context.ch;
        // `retain_mut` advances each pointer exactly once, in order, and
        // compacts the vector in place without cloning any pointer.
        self.pointers.retain_mut(|pointer| pointer.update(dict, ch));
    }

    /// Returns the best edge among those generated by the final pointers in
    /// `pointers`, or `None` when no pointer is final.
    ///
    /// A later edge replaces the current best only if `better_than` says so,
    /// so the earliest pointer wins ties.
    fn gen_edge(&self, pointers: &[Pointer], path: &[Edge]) -> Option<Edge> {
        let mut best_edge: Option<Edge> = None;
        for pointer in pointers.iter().filter(|p| p.is_final) {
            let edge = pointer.gen_edge(path);
            let replace = match best_edge.as_ref() {
                None => true,
                Some(current) => edge.better_than(current),
            };
            if replace {
                best_edge = Some(edge);
            }
        }
        best_edge
    }
}
impl EdgeBuilder for DictEdgeBuilder<'_> {
    /// Proposes the best dictionary edge for the position in `context`.
    ///
    /// Returns `None` when no tracked pointer is final after consuming
    /// `context.ch`.
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
        // Order matters: the pointer opened for this position is added first
        // so that it also consumes `context.ch` in the update step.
        self.add_pointer(context);
        self.update_pointers(context);
        let survivors = self.pointers.as_slice();
        self.gen_edge(survivors, path)
    }
}
/// Edge builder driven by match ranges precomputed from a regex.
pub struct RuleBasedEdgeBuilder {
/// Remaining match ranges in the order the regex reported them; peekable
/// so a range is only consumed once the current position has passed it.
range_peekable: Peekable<std::vec::IntoIter<TextRange>>,
}
impl RuleBasedEdgeBuilder {
pub fn new(byte_to_char_idx_map: &[usize], text: &str, re: &Regex) -> Self {
let mut ranges = vec![];
for m in re.find_iter(text.as_bytes()) {
let ms = m.start();
let me = m.end();
let s = byte_to_char_idx_map[ms];
let e = byte_to_char_idx_map[me];
ranges.push(TextRange { s, e });
}
RuleBasedEdgeBuilder {
range_peekable: ranges.into_iter().peekable(),
}
}
}
impl EdgeBuilder for RuleBasedEdgeBuilder {
    /// Yields an `EdgeType::Pat` edge when the next pending range ends exactly
    /// one past `context.i`.
    ///
    /// Ranges whose end is at or before `context.i` are discarded first. The
    /// edge starts at the range's `s` and extends the edge at `path[s]`:
    /// `w` grows by one, `unk` is carried over unchanged. Returns `None` when
    /// no range is left or the next one does not end at `context.i + 1`.
    ///
    /// # Panics
    ///
    /// Panics if the range start is out of bounds for `path`.
    fn build(&mut self, context: &EdgeBuildingContext, path: &[Edge]) -> Option<Edge> {
        let pos = context.i;

        // Discard ranges the current position has already moved past.
        while self.range_peekable.next_if(|r| pos >= r.e).is_some() {}

        let range = self.range_peekable.peek()?;
        if range.e != pos + 1 {
            return None;
        }

        let start = range.s;
        let prev = &path[start];
        Some(Edge {
            p: start,
            etype: EdgeType::Pat,
            unk: prev.unk,
            w: prev.w + 1,
        })
    }
}