mod visualize;
pub use visualize::{DotBuilder, DotConfig, NodeShape, RankDir};
use crate::Result;
use crate::dict::{CharCategory, Dictionary, DictionaryEntry};
/// A single candidate node of the lattice, covering the byte range `start..end`
/// of the input text.
///
/// Besides nodes built from dictionary entries, the constructors in this file
/// also produce zero-width BOS/EOS sentinel nodes and unknown-word nodes.
#[derive(Debug, Clone)]
pub struct LatticeNode<'a> {
/// Slice of the input text covered by this node (`text[start..end]`);
/// the empty string for the BOS/EOS sentinels.
pub surface: &'a str,
/// Byte offset in the input text where this node begins.
pub start: usize,
/// Byte offset in the input text where this node ends (exclusive).
pub end: usize,
/// Identifier copied from the originating dictionary entry; `u32::MAX` for
/// the BOS/EOS sentinels and for the fallback unknown node.
pub word_id: u32,
/// Copied from the dictionary entry; `0` for BOS/EOS and the fallback unknown node.
/// NOTE(review): presumably the left-context id used for connection costs - confirm.
pub left_id: u16,
/// Copied from the dictionary entry; `0` for BOS/EOS and the fallback unknown node.
/// NOTE(review): presumably the right-context id used for connection costs - confirm.
pub right_id: u16,
/// Copied from the dictionary entry; `0` for BOS/EOS and the fallback unknown node.
/// NOTE(review): presumably a part-of-speech id - confirm.
pub pos_id: u16,
/// Cost copied from the dictionary entry; `0` for BOS/EOS and `10000` for the
/// fallback unknown node.
pub wcost: i16,
/// Feature string attached to this node (`"BOS/EOS"` for the sentinels).
pub feature: String,
/// `true` when the node was produced by the unknown-word path rather than a
/// dictionary lookup.
pub is_unknown: bool,
}
impl<'a> LatticeNode<'a> {
pub fn from_entry(
text: &'a str,
start: usize,
entry: &DictionaryEntry,
feature: String,
) -> Self {
Self {
surface: &text[start..start + entry.length],
start,
end: start + entry.length,
word_id: entry.word_id,
left_id: entry.left_id,
right_id: entry.right_id,
pos_id: entry.pos_id,
wcost: entry.wcost,
feature,
is_unknown: false,
}
}
#[inline]
pub fn from_entry_owned(text: &'a str, start: usize, entry: DictionaryEntry) -> Self {
let end = start + entry.length;
Self {
surface: &text[start..end],
start,
end,
word_id: entry.word_id,
left_id: entry.left_id,
right_id: entry.right_id,
pos_id: entry.pos_id,
wcost: entry.wcost,
feature: entry.feature, is_unknown: false,
}
}
pub fn bos() -> Self {
Self {
surface: "",
start: 0,
end: 0,
word_id: u32::MAX, left_id: 0,
right_id: 0,
pos_id: 0,
wcost: 0,
feature: "BOS/EOS".to_string(),
is_unknown: false,
}
}
pub fn eos(position: usize) -> Self {
Self {
surface: "",
start: position,
end: position,
word_id: u32::MAX, left_id: 0,
right_id: 0,
pos_id: 0,
wcost: 0,
feature: "BOS/EOS".to_string(),
is_unknown: false,
}
}
pub fn unknown(
text: &'a str,
start: usize,
length: usize,
entry: &DictionaryEntry,
feature: String,
) -> Self {
Self {
surface: &text[start..start + length],
start,
end: start + length,
word_id: entry.word_id,
left_id: entry.left_id,
right_id: entry.right_id,
pos_id: entry.pos_id,
wcost: entry.wcost,
feature,
is_unknown: true,
}
}
}
/// Word lattice over `text`: candidate nodes grouped into buckets according to
/// where they end.
#[derive(Debug)]
pub struct Lattice<'a> {
/// The input text that the nodes borrow their surfaces from.
pub text: &'a str,
/// Node buckets. As filled by `Lattice::build`: bucket `0` holds the BOS node,
/// a node ending at byte offset `end` is stored in bucket `end + 1`, and the
/// EOS node is appended to bucket `text.len() + 1`.
pub nodes_at: Vec<Vec<LatticeNode<'a>>>,
}
impl<'a> Lattice<'a> {
    /// Builds the lattice for `text` using `dict`.
    ///
    /// The result has `text.len() + 2` buckets: bucket `0` holds the BOS node,
    /// a node ending at byte offset `end` is stored in bucket `end + 1`, and
    /// the EOS node is appended to the last bucket (`text.len() + 1`).
    ///
    /// At every character position the dictionary is queried with the rest of
    /// the text. Positions without any dictionary hit receive unknown-word
    /// nodes instead. Entries whose span would run past the end of `text` or
    /// end inside a multi-byte character are skipped.
    ///
    /// # Errors
    ///
    /// The current implementation always returns `Ok`.
    pub fn build(text: &'a str, dict: &Dictionary) -> Result<Self> {
        let text_len = text.len();
        let mut nodes_at: Vec<Vec<LatticeNode<'a>>> = vec![Vec::new(); text_len + 2];
        nodes_at[0].push(LatticeNode::bos());

        for (pos, c) in text.char_indices() {
            let entries = dict.lookup(&text[pos..]);
            if entries.is_empty() {
                Self::add_unknown_nodes(text, pos, c, dict, &mut nodes_at);
                continue;
            }
            for entry in entries {
                // Validate before constructing: the constructor slices `text`
                // and would panic on a bad span.
                if !Self::is_valid_span(text, pos, entry.length) {
                    continue;
                }
                let node = LatticeNode::from_entry_owned(text, pos, entry);
                let bucket = node.end + 1;
                nodes_at[bucket].push(node);
            }
        }

        nodes_at[text_len + 1].push(LatticeNode::eos(text_len));
        Ok(Self { text, nodes_at })
    }

    /// Returns `true` when `start + length` does not overflow and lands on a
    /// character boundary of `text` (which implies it is within bounds).
    fn is_valid_span(text: &str, start: usize, length: usize) -> bool {
        matches!(start.checked_add(length), Some(end) if text.is_char_boundary(end))
    }

    /// Adds unknown-word nodes for the character `c` located at byte offset `pos`.
    ///
    /// One node is pushed per entry generated for the character's category. If
    /// no entry is generated, a single fallback node with cost `10000` is
    /// pushed instead. When the category is configured for grouping, multi-character
    /// runs starting at `pos` are added as well.
    fn add_unknown_nodes(
        text: &'a str,
        pos: usize,
        c: char,
        dict: &Dictionary,
        nodes_at: &mut [Vec<LatticeNode<'a>>],
    ) {
        let category = dict.char_category(c);
        let char_len = c.len_utf8();
        let entries = dict.unknown.generate_entries(category, char_len);

        if entries.is_empty() {
            let end_pos = pos + char_len;
            // `get` doubles as the bounds / char-boundary guard.
            if let Some(surface) = text.get(pos..end_pos) {
                nodes_at[end_pos + 1].push(LatticeNode {
                    surface,
                    start: pos,
                    end: end_pos,
                    word_id: u32::MAX,
                    left_id: 0,
                    right_id: 0,
                    pos_id: 0,
                    wcost: 10000,
                    feature: format!("未知語,{category:?}"),
                    is_unknown: true,
                });
            }
        } else {
            for entry in &entries {
                // Same reasoning as in `build`: check the span before slicing.
                if !Self::is_valid_span(text, pos, entry.length) {
                    continue;
                }
                let node =
                    LatticeNode::unknown(text, pos, entry.length, entry, entry.feature.clone());
                let bucket = node.end + 1;
                nodes_at[bucket].push(node);
            }
        }

        if dict.char_def.should_group(category) {
            Self::add_grouped_unknown(text, start_of(pos), category, dict, nodes_at);
        }

        /// Identity helper that only names the argument's role at the call site.
        #[inline]
        fn start_of(pos: usize) -> usize {
            pos
        }
    }

    /// Adds unknown-word nodes for runs of two or more consecutive characters
    /// of `category` beginning at byte offset `start`.
    ///
    /// Every prefix of the run that is at least two characters long yields at
    /// most one node; single-character spans are left to `add_unknown_nodes`.
    fn add_grouped_unknown(
        text: &'a str,
        start: usize,
        category: CharCategory,
        dict: &Dictionary,
        nodes_at: &mut [Vec<LatticeNode<'a>>],
    ) {
        let mut length = 0;
        for (index, c) in text[start..].chars().enumerate() {
            if dict.char_category(c) != category {
                break;
            }
            length += c.len_utf8();
            if index == 0 {
                continue;
            }

            // `length` is a sum of whole characters taken from `text[start..]`,
            // so `start..end_pos` is always a valid slice of `text`.
            let end_pos = start + length;
            let bucket = &mut nodes_at[end_pos + 1];
            if bucket.iter().any(|n| n.start == start && n.end == end_pos) {
                continue;
            }

            // NOTE(review): only the first generated entry is used. The
            // previous per-entry duplicate check had exactly this effect
            // (the first pushed node blocked all later ones), whereas the
            // single-character path pushes every entry. Confirm whether
            // grouped spans should carry all entries too.
            let entries = dict.unknown.generate_entries(category, length);
            if let Some(entry) = entries.first() {
                bucket.push(LatticeNode::unknown(
                    text,
                    start,
                    length,
                    entry,
                    entry.feature.clone(),
                ));
            }
        }
    }

    /// Number of buckets in the lattice (`text.len() + 2` for a lattice
    /// produced by `build`).
    pub fn len(&self) -> usize {
        self.nodes_at.len()
    }

    /// Returns `true` when the lattice has no buckets at all.
    pub fn is_empty(&self) -> bool {
        self.nodes_at.is_empty()
    }

    /// Returns the nodes stored in bucket `pos`, or an empty slice when `pos`
    /// is out of range.
    ///
    /// Note that `pos` is a bucket index: with the layout produced by `build`,
    /// nodes ending at byte offset `end` live in bucket `end + 1`.
    pub fn nodes_ending_at(&self, pos: usize) -> &[LatticeNode<'a>] {
        match self.nodes_at.get(pos) {
            Some(bucket) => bucket.as_slice(),
            None => &[],
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Both sentinels are zero-width nodes with an empty surface: BOS sits at
    /// offset 0, EOS at the offset it is given.
    #[test]
    fn test_bos_eos_nodes() {
        let bos = LatticeNode::bos();
        assert!(bos.surface.is_empty());
        assert_eq!((bos.start, bos.end), (0, 0));

        let eos = LatticeNode::eos(10);
        assert!(eos.surface.is_empty());
        assert_eq!((eos.start, eos.end), (10, 10));
    }
}