use std::{
cmp::Ordering,
collections::VecDeque,
error::Error,
fmt::Debug,
fs::{self, File},
io::{self, BufWriter, Read, Write},
iter,
num::NonZeroUsize,
path::{Path, PathBuf},
};
use der::{
DecodeValue, Document, Encode, EncodeValue, ErrorKind, FixedTag, Length, Reader, Sequence,
SliceReader, Tag, TagMode, TagNumber, Tagged, Writer,
asn1::{ContextSpecificRef, OctetStringRef, Utf8StringRef},
};
use log::{debug, error};
use super::{
BuildDictionaryError, Dictionary, DictionaryBuilder, DictionaryInfo, Entries, LookupStrategy,
Phrase,
};
use crate::{dictionary::DictionaryUsage, exn::ResultExt, zhuyin::Syllable};
const DICT_FORMAT_VERSION: u8 = 0;
struct TrieNodeView<'a>(&'a [u8]);
impl TrieNodeView<'_> {
const SIZE: usize = 8;
fn syllable(&self) -> u16 {
u16::from_be_bytes(self.0[6..8].try_into().unwrap())
}
fn child_begin(&self) -> usize {
u32::from_be_bytes(self.0[..4].try_into().unwrap()) as usize * Self::SIZE
}
fn child_end(&self) -> usize {
(u32::from_be_bytes(self.0[..4].try_into().unwrap()) as usize)
.saturating_add(u16::from_be_bytes(self.0[4..6].try_into().unwrap()) as usize)
* Self::SIZE
}
}
struct TrieLeafView<'a>(&'a [u8]);
impl TrieLeafView<'_> {
const SIZE: usize = 8;
fn reserved_zero(&self) -> u16 {
u16::from_be_bytes(self.0[6..8].try_into().unwrap())
}
fn data_begin(&self) -> usize {
u32::from_be_bytes(self.0[..4].try_into().unwrap()) as usize
}
fn data_end(&self) -> usize {
(u32::from_be_bytes(self.0[..4].try_into().unwrap()) as usize)
.saturating_add(u16::from_be_bytes(self.0[4..6].try_into().unwrap()) as usize)
}
}
#[derive(Debug, Clone)]
pub struct Trie {
info: DictionaryInfo,
path: Option<PathBuf>,
index: Box<[u8]>,
phrase_seq: Box<[u8]>,
fuzzy_search: bool,
}
fn io_error(e: impl Into<Box<dyn Error + Send + Sync>>) -> io::Error {
io::Error::other(e)
}
impl Trie {
pub fn open<P: AsRef<Path>>(path: P) -> io::Result<Trie> {
TrieOpenOptions::new().open(path)
}
pub fn new<T>(stream: T) -> io::Result<Trie>
where
T: Read,
{
TrieOpenOptions::new().read_from(stream)
}
pub fn enable_fuzzy_search(&mut self, fuzzy_search: bool) {
self.fuzzy_search = fuzzy_search;
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct TrieOpenOptions {
fuzzy_search: bool,
}
impl TrieOpenOptions {
pub fn new() -> TrieOpenOptions {
TrieOpenOptions::default()
}
pub fn fuzzy_search(&mut self, fuzzy_search: bool) -> &mut Self {
self.fuzzy_search = fuzzy_search;
self
}
pub fn open<P: AsRef<Path>>(&self, path: P) -> io::Result<Trie> {
let path = path.as_ref().to_path_buf();
let mut file = File::open(&path)?;
let mut trie = self.read_from(&mut file)?;
trie.path = Some(path);
Ok(trie)
}
pub fn read_from<T>(&self, mut stream: T) -> io::Result<Trie>
where
T: Read,
{
let mut buf = vec![];
stream.read_to_end(&mut buf)?;
let trie_dict_doc = Document::try_from(buf).map_err(io_error)?;
let trie_ref: TrieFileRef<'_> = trie_dict_doc.decode_msg().map_err(io_error)?;
let info = trie_ref.info.into();
let index = trie_ref.index.as_bytes().into();
let phrase_seq = trie_ref.phrase_seq.der_bytes.into();
Ok(Trie {
info,
path: None,
index,
phrase_seq,
fuzzy_search: self.fuzzy_search,
})
}
}
struct PhrasesIter<'a> {
reader: SliceReader<'a>,
}
impl PhrasesIter<'_> {
fn new(bytes: &[u8]) -> PhrasesIter<'_> {
PhrasesIter {
reader: SliceReader::new(bytes).unwrap(),
}
}
}
impl Iterator for PhrasesIter<'_> {
type Item = Phrase;
#[inline(always)]
fn next(&mut self) -> Option<Self::Item> {
if self.reader.is_finished() {
return None;
}
self.reader.decode().ok()
}
}
macro_rules! bail_if_oob {
($begin:expr, $end:expr, $len:expr) => {
if $begin >= $end || $end > $len {
error!("[!] file corruption detected: index out of bound.");
return vec![];
}
};
}
macro_rules! iter_bail_if_oob {
($begin:expr, $end:expr, $len:expr) => {
if $begin >= $end || $end > $len {
error!("[!] file corruption detected: index out of bound.");
return None;
}
};
}
impl Dictionary for Trie {
fn lookup(&self, syllables: &[Syllable], strategy: LookupStrategy) -> Vec<Phrase> {
let dict = self.index.as_ref();
let data = self.phrase_seq.as_ref();
bail_if_oob!(0, TrieNodeView::SIZE, dict.len());
let root = TrieNodeView(&dict[..TrieNodeView::SIZE]);
if root.child_begin() == root.child_end() {
return vec![];
}
let search_predicate = match strategy {
LookupStrategy::Standard => |n: u16, syl: &Syllable| n == syl.to_u16(),
LookupStrategy::FuzzyPartialPrefix => |n: u16, syl: &Syllable| {
if n == 0 {
return false;
}
if let Ok(syllable) = Syllable::try_from(n) {
syllable.starts_with(*syl)
} else {
false
}
},
};
let mut threads: VecDeque<TrieNodeView<'_>> = VecDeque::new();
threads.push_back(root);
for syl in syllables {
debug_assert!(syl.to_u16() != 0);
for _ in 0..threads.len() {
let node = threads.pop_front().unwrap();
bail_if_oob!(node.child_begin(), node.child_end(), dict.len());
let child_nodes = dict[node.child_begin()..node.child_end()]
.chunks_exact(TrieNodeView::SIZE)
.map(TrieNodeView);
for n in child_nodes {
if search_predicate(n.syllable(), syl) {
threads.push_back(n);
}
}
}
if threads.is_empty() {
return vec![];
}
}
let mut result = vec![];
for node in threads.into_iter() {
bail_if_oob!(node.child_begin(), node.child_end(), dict.len());
let leaf_data = &dict[node.child_begin()..];
bail_if_oob!(0, TrieLeafView::SIZE, leaf_data.len());
let leaf = TrieLeafView(&leaf_data[..TrieLeafView::SIZE]);
if leaf.reserved_zero() != 0 {
continue;
}
bail_if_oob!(leaf.data_begin(), leaf.data_end(), data.len());
result.extend(PhrasesIter::new(&data[leaf.data_begin()..leaf.data_end()]));
}
result
}
fn entries(&self) -> Entries<'_> {
let dict = self.index.as_ref();
let data = self.phrase_seq.as_ref();
let mut results = Vec::new();
let mut stack = Vec::new();
let mut syllables = Vec::new();
if dict.len() < TrieNodeView::SIZE {
error!("[!] file corruption detected: index out of bound.");
return Box::new(iter::empty());
}
let root = TrieNodeView(&dict[..TrieNodeView::SIZE]);
let mut node = root;
if node.child_begin() == node.child_end() {
return Box::new(iter::empty());
}
let make_dict_entry =
|syllables: &[u16], leaf: &TrieLeafView<'_>| -> (Vec<Syllable>, Vec<Phrase>) {
debug_assert_eq!(leaf.reserved_zero(), 0);
(
syllables
.iter()
.map(|&syl_u16| Syllable::try_from(syl_u16).unwrap())
.collect::<Vec<_>>(),
PhrasesIter::new(&data[leaf.data_begin()..leaf.data_end()]).collect::<Vec<_>>(),
)
};
let mut done = false;
let it = iter::from_fn(move || {
if !results.is_empty() {
return results.pop();
}
if done {
return None;
}
loop {
iter_bail_if_oob!(node.child_begin(), node.child_end(), dict.len());
let mut child_iter = dict[node.child_begin()..node.child_end()]
.chunks_exact(TrieNodeView::SIZE)
.map(TrieNodeView);
let mut next = child_iter
.next()
.expect("syllable node should have at least one child node");
if next.syllable() == 0 {
iter_bail_if_oob!(node.child_begin(), node.child_end(), dict.len());
let leaf_data = &dict[node.child_begin()..];
let leaf = TrieLeafView(&leaf_data[..TrieLeafView::SIZE]);
iter_bail_if_oob!(leaf.data_begin(), leaf.data_end(), data.len());
results.push(make_dict_entry(&syllables, &leaf));
if let Some(second) = child_iter.next() {
next = second;
} else {
break;
}
}
node = next;
syllables.push(node.syllable());
stack.push(child_iter);
}
loop {
if let Some(mut child_nodes) = stack.pop() {
syllables.pop();
if let Some(next) = child_nodes.next() {
debug_assert_ne!(next.syllable(), 0);
node = next;
stack.push(child_nodes);
syllables.push(node.syllable());
break;
}
} else {
done = true;
break;
}
}
results.pop()
});
let entries = it.flat_map(|(syllables, phrases)| {
phrases
.into_iter()
.map(move |phrase| (syllables.clone(), phrase))
});
Box::new(entries)
}
fn about(&self) -> DictionaryInfo {
self.info.clone()
}
fn path(&self) -> Option<&Path> {
self.path.as_ref().map(|p| p as &Path)
}
fn set_usage(&mut self, usage: DictionaryUsage) {
self.info.usage = usage;
}
}
fn context_specific<T: EncodeValue + Tagged>(
tag_number: u8,
tag_mode: TagMode,
value: &T,
) -> ContextSpecificRef<'_, T> {
ContextSpecificRef {
tag_number: TagNumber::new(tag_number),
tag_mode,
value,
}
}
fn context_specific_opt<T: EncodeValue + Tagged>(
tag_number: u8,
tag_mode: TagMode,
value: &Option<T>,
) -> Option<ContextSpecificRef<'_, T>> {
value
.as_ref()
.map(|value| context_specific(tag_number, tag_mode, value))
}
struct DictionaryInfoRef<'a> {
name: Utf8StringRef<'a>,
copyright: Utf8StringRef<'a>,
license: Utf8StringRef<'a>,
version: Utf8StringRef<'a>,
software: Utf8StringRef<'a>,
usage: DictionaryUsage,
}
impl From<DictionaryInfoRef<'_>> for DictionaryInfo {
fn from(value: DictionaryInfoRef<'_>) -> Self {
DictionaryInfo {
name: value.name.into(),
copyright: value.copyright.into(),
license: value.license.into(),
version: value.version.into(),
software: value.software.into(),
usage: value.usage.into(),
}
}
}
impl DictionaryInfoRef<'_> {
fn new(info: &DictionaryInfo) -> DictionaryInfoRef<'_> {
DictionaryInfoRef {
name: Utf8StringRef::new(&info.name).unwrap(),
copyright: Utf8StringRef::new(&info.copyright).unwrap(),
license: Utf8StringRef::new(&info.license).unwrap(),
version: Utf8StringRef::new(&info.version).unwrap(),
software: Utf8StringRef::new(&info.software).unwrap(),
usage: info.usage,
}
}
}
impl FixedTag for DictionaryInfoRef<'_> {
const TAG: Tag = Tag::Sequence;
}
impl<'a> DecodeValue<'a> for DictionaryInfoRef<'a> {
fn decode_value<R: Reader<'a>>(reader: &mut R, header: der::Header) -> der::Result<Self> {
reader.read_nested(header.length, |reader| {
let name = reader.decode()?;
let copyright = reader.decode()?;
let license = reader.decode()?;
let version = reader.decode()?;
let software = reader.decode()?;
let raw_usage = reader
.context_specific(TagNumber::N0, TagMode::Explicit)?
.unwrap_or(0);
let usage = DictionaryUsage::from(raw_usage);
let _ = reader.read_slice(reader.remaining_len());
Ok(DictionaryInfoRef {
name,
copyright,
license,
version,
software,
usage,
})
})
}
}
impl EncodeValue for DictionaryInfoRef<'_> {
fn value_len(&self) -> der::Result<Length> {
self.name.encoded_len()?
+ self.copyright.encoded_len()?
+ self.license.encoded_len()?
+ self.version.encoded_len()?
+ self.software.encoded_len()?
}
fn encode_value(&self, encoder: &mut impl Writer) -> der::Result<()> {
self.name.encode(encoder)?;
self.copyright.encode(encoder)?;
self.license.encode(encoder)?;
self.version.encode(encoder)?;
self.software.encode(encoder)?;
Ok(())
}
}
struct TrieFileRef<'a> {
info: DictionaryInfoRef<'a>,
index: OctetStringRef<'a>,
phrase_seq: PhraseSeqRef<'a>,
}
struct PhraseSeqRef<'a> {
der_bytes: &'a [u8],
}
impl<'a> Sequence<'a> for TrieFileRef<'a> {}
impl<'a> DecodeValue<'a> for TrieFileRef<'a> {
fn decode_value<R: Reader<'a>>(reader: &mut R, header: der::Header) -> der::Result<Self> {
reader.read_nested(header.length, |reader| {
let magic: Utf8StringRef<'_> = reader.decode()?;
let version: u8 = reader.decode()?;
if magic.as_str() != "CHEW" || version != DICT_FORMAT_VERSION {
return Err(ErrorKind::Value { tag: header.tag }.at(reader.position()));
}
let info = reader.decode()?;
let index = reader.decode()?;
let phrase_seq = reader.decode()?;
let _ = reader.read_slice(reader.remaining_len());
Ok(Self {
info,
index,
phrase_seq,
})
})
}
}
impl EncodeValue for TrieFileRef<'_> {
fn value_len(&self) -> der::Result<Length> {
Utf8StringRef::new("CHEW")?.encoded_len()?
+ DICT_FORMAT_VERSION.encoded_len()?
+ self.info.encoded_len()?
+ self.index.encoded_len()?
+ self.phrase_seq.encoded_len()?
}
fn encode_value(&self, encoder: &mut impl Writer) -> der::Result<()> {
Utf8StringRef::new("CHEW")?.encode(encoder)?;
DICT_FORMAT_VERSION.encode(encoder)?;
self.info.encode(encoder)?;
self.index.encode(encoder)?;
self.phrase_seq.encode(encoder)?;
Ok(())
}
}
impl FixedTag for Phrase {
const TAG: Tag = Tag::Sequence;
}
impl<'a> DecodeValue<'a> for Phrase {
fn decode_value<R: Reader<'a>>(reader: &mut R, header: der::Header) -> der::Result<Self> {
reader.read_nested(header.length, |reader| {
let phrase: Utf8StringRef<'_> = reader.decode()?;
let freq = reader.decode()?;
let last_used = reader.context_specific(TagNumber::N0, TagMode::Implicit)?;
let _ = reader.read_slice(reader.remaining_len());
Ok(Phrase {
text: String::from(phrase).into_boxed_str(),
freq,
last_used,
})
})
}
}
impl EncodeValue for Phrase {
fn value_len(&self) -> der::Result<Length> {
Utf8StringRef::new(self.as_str())?.encoded_len()?
+ self.freq.encoded_len()?
+ context_specific_opt(0, TagMode::Implicit, &self.last_used).encoded_len()?
}
fn encode_value(&self, encoder: &mut impl Writer) -> der::Result<()> {
Utf8StringRef::new(self.as_ref())?.encode(encoder)?;
self.freq.encode(encoder)?;
context_specific_opt(0, TagMode::Implicit, &self.last_used).encode(encoder)?;
Ok(())
}
}
impl FixedTag for PhraseSeqRef<'_> {
const TAG: Tag = Tag::Sequence;
}
impl EncodeValue for PhraseSeqRef<'_> {
fn value_len(&self) -> der::Result<Length> {
self.der_bytes.len().try_into()
}
fn encode_value(&self, encoder: &mut impl Writer) -> der::Result<()> {
encoder.write(self.der_bytes)
}
}
impl<'a> DecodeValue<'a> for PhraseSeqRef<'a> {
fn decode_value<R: Reader<'a>>(reader: &mut R, header: der::Header) -> der::Result<Self> {
reader.read_nested(header.length, |reader| {
let der_bytes = reader.read_slice(header.length)?;
Ok(Self { der_bytes })
})
}
}
#[derive(Default)]
struct VecWriter {
buf: Vec<u8>,
}
impl VecWriter {
fn new() -> VecWriter {
VecWriter::default()
}
fn len(&self) -> usize {
self.buf.len()
}
}
impl Writer for VecWriter {
fn write(&mut self, slice: &[u8]) -> der::Result<()> {
self.buf.write_all(slice)?;
Ok(())
}
}
#[doc = include_str!("trie.asn1")]
#[derive(Debug)]
pub struct TrieBuilder {
arena: Vec<TrieBuilderNode>,
info: DictionaryInfo,
}
#[derive(Debug, PartialEq, Default)]
struct TrieBuilderNode {
id: usize,
syllable: Option<Syllable>,
children: Vec<usize>,
leaf_id: Option<NonZeroUsize>,
phrases: Vec<Phrase>,
}
#[derive(Debug)]
pub struct TrieStatistics {
pub node_count: usize,
pub leaf_count: usize,
pub phrase_count: usize,
pub max_height: usize,
pub avg_height: usize,
pub root_branch_count: usize,
pub max_branch_count: usize,
pub avg_branch_count: usize,
}
impl TrieBuilder {
pub fn new() -> TrieBuilder {
let root = TrieBuilderNode::default();
TrieBuilder {
arena: vec![root],
info: Default::default(),
}
}
fn alloc_leaf(&mut self) -> usize {
let next_id = self.arena.len();
let leaf = TrieBuilderNode {
id: next_id,
..Default::default()
};
self.arena.push(leaf);
next_id
}
fn alloc_internal(&mut self, syl: Syllable) -> usize {
let next_id = self.arena.len();
let internal = TrieBuilderNode {
id: next_id,
syllable: Some(syl),
..Default::default()
};
self.arena.push(internal);
next_id
}
fn find_or_insert_internal(&mut self, syllables: &[Syllable]) -> usize {
let mut node_id = 0;
'next: for &syl in syllables {
for &child_node_id in &self.arena[node_id].children {
if self.arena[child_node_id].syllable == Some(syl) {
node_id = child_node_id;
continue 'next;
}
}
let next_id = self.alloc_internal(syl);
self.arena[node_id].children.push(next_id);
node_id = next_id;
}
if let Some(leaf_id) = self.arena[node_id].leaf_id {
node_id = leaf_id.get();
} else {
let leaf_id = self.alloc_leaf();
self.arena[node_id].leaf_id = NonZeroUsize::new(leaf_id);
node_id = leaf_id;
}
node_id
}
pub fn set_usage(&mut self, usage: DictionaryUsage) {
self.info.usage = usage;
}
pub fn write<T>(&self, mut writer: T) -> io::Result<usize>
where
T: Write,
{
const ROOT_ID: usize = 0;
let mut dict_buf = Vec::new();
let mut data_buf = VecWriter::new();
let mut queue = VecDeque::new();
let mut child_begin = 1;
queue.push_back(ROOT_ID);
while !queue.is_empty() {
let layer_nodes_count = queue.len();
for _ in 0..layer_nodes_count {
let id = queue.pop_front().unwrap();
let node = &self.arena[id];
if node.syllable.is_some() || id == ROOT_ID {
let syllable_u16 = node.syllable.map_or(0, |v| v.to_u16());
let child_len =
node.children.len() + if node.leaf_id.is_some() { 1 } else { 0 };
dict_buf.write_all(&(child_begin as u32).to_be_bytes())?;
dict_buf.write_all(&(child_len as u16).to_be_bytes())?;
dict_buf.write_all(&syllable_u16.to_be_bytes())?;
} else {
let mut phrases = node.phrases.clone();
phrases.sort_by(|a, b| {
match (a.as_str().chars().count(), b.as_str().chars().count()) {
(1, 1) => Ordering::Equal,
(1, _) | (_, 1) => a.as_str().len().cmp(&b.as_str().len()),
_ => {
if a.freq() == b.freq() {
b.as_str().cmp(a.as_str())
} else {
b.freq().cmp(&a.freq())
}
}
}
});
let data_begin = data_buf.len();
for phrase in phrases {
phrase.encode(&mut data_buf).map_err(io_error)?;
}
let data_len = data_buf.len() - data_begin;
dict_buf.write_all(&(data_begin as u32).to_be_bytes())?;
dict_buf.write_all(&(data_len as u16).to_be_bytes())?;
dict_buf.write_all(&0u16.to_be_bytes())?;
}
let mut children = node.children.clone();
children.sort_by(|&a, &b| self.arena[a].syllable.cmp(&self.arena[b].syllable));
if let Some(leaf_id) = node.leaf_id {
child_begin += 1;
queue.push_back(leaf_id.get());
}
for child_id in children {
child_begin += 1;
queue.push_back(child_id);
}
}
}
let info = DictionaryInfoRef::new(&self.info);
let trie_dict_ref = TrieFileRef {
info,
index: OctetStringRef::new(&dict_buf).map_err(io_error)?,
phrase_seq: PhraseSeqRef {
der_bytes: &data_buf.buf,
},
};
let document = Document::encode_msg(&trie_dict_ref).map_err(io_error)?;
writer.write_all(document.as_bytes())?;
Ok(document.as_bytes().len())
}
pub fn statistics(&self) -> TrieStatistics {
let mut node_count = 0;
let mut leaf_count = 0;
let mut phrase_count = 0;
let mut max_height = 0;
let mut branch_heights = vec![];
let mut root_branch_count = 0;
let mut max_branch_count = 0;
let mut branch_counts = vec![];
const ROOT_ID: usize = 0;
let mut queue = VecDeque::new();
queue.push_back(ROOT_ID);
while !queue.is_empty() {
let layer_nodes_count = queue.len();
max_height += 1;
for _ in 0..layer_nodes_count {
let id = queue.pop_front().unwrap();
let node = &self.arena[id];
node_count += 1;
if node.syllable.is_some() || id == ROOT_ID {
if node.leaf_id.is_some() {
leaf_count += 1;
branch_heights.push(max_height);
}
let branch_count = node
.children
.iter()
.filter(|&&child_id| self.arena[child_id].syllable.is_some())
.count();
branch_counts.push(branch_count);
if branch_count > max_branch_count && id != ROOT_ID {
max_branch_count = node.children.len();
}
if branch_count > root_branch_count {
root_branch_count = node.children.len();
}
} else {
phrase_count += node.phrases.len();
}
if let Some(leaf_id) = node.leaf_id {
queue.push_back(leaf_id.get());
}
for &child_id in &node.children {
queue.push_back(child_id);
}
}
}
TrieStatistics {
node_count,
leaf_count,
phrase_count,
max_height,
avg_height: branch_heights.iter().sum::<usize>() / branch_counts.len(),
root_branch_count,
max_branch_count,
avg_branch_count: branch_counts.iter().sum::<usize>() / branch_counts.len(),
}
}
}
impl DictionaryBuilder for TrieBuilder {
fn set_info(&mut self, info: DictionaryInfo) -> Result<(), BuildDictionaryError> {
self.info = info;
Ok(())
}
fn insert(
&mut self,
syllables: &[Syllable],
phrase: Phrase,
) -> Result<(), BuildDictionaryError> {
let leaf_id = self.find_or_insert_internal(syllables);
if let Some(it) = self.arena[leaf_id]
.phrases
.iter_mut()
.find(|it| it.as_str() == phrase.as_str())
{
*it = phrase;
} else {
self.arena[leaf_id].phrases.push(phrase);
}
Ok(())
}
fn build(&mut self, path: &Path) -> Result<(), BuildDictionaryError> {
let err = || BuildDictionaryError::new("failed to finalize dictionary");
let mut tmpname = path.to_path_buf();
let pseudo_random = rand();
tmpname.set_file_name(format!("chewing-{pseudo_random}.dat"));
let database = File::create(&tmpname).or_raise(err)?;
let mut writer = BufWriter::new(&database);
self.write(&mut writer).or_raise(err)?;
writer.flush().or_raise(err)?;
database.sync_data().or_raise(err)?;
debug!("rename from {} to {}", tmpname.display(), path.display());
fs::rename(&tmpname, path).or_raise(err)?;
Ok(())
}
}
fn rand() -> u64 {
use std::collections::hash_map::RandomState;
use std::hash::BuildHasher;
use std::hash::Hasher;
RandomState::new().build_hasher().finish()
}
impl Default for TrieBuilder {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use std::{
io::{Cursor, Seek},
num::NonZeroUsize,
};
use super::{Trie, TrieBuilder};
use crate::{
dictionary::{
Dictionary, DictionaryBuilder, DictionaryInfo, DictionaryUsage, LookupStrategy, Phrase,
TrieOpenOptions, trie::TrieBuilderNode,
},
syl,
zhuyin::Bopomofo,
};
#[test]
fn test_tree_construction() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = TrieBuilder::new();
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("測試", 100).into(),
)?;
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::S, Bopomofo::U, Bopomofo::O, Bopomofo::TONE3],
],
("廁所", 100).into(),
)?;
assert_eq!(
vec![
TrieBuilderNode {
id: 0,
syllable: None,
children: vec![1],
leaf_id: None,
phrases: vec![]
},
TrieBuilderNode {
id: 1,
syllable: Some(syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]),
children: vec![2, 4],
leaf_id: None,
phrases: vec![]
},
TrieBuilderNode {
id: 2,
syllable: Some(syl![Bopomofo::SH, Bopomofo::TONE4]),
children: vec![],
leaf_id: NonZeroUsize::new(3),
phrases: vec![]
},
TrieBuilderNode {
id: 3,
syllable: None,
children: vec![],
leaf_id: None,
phrases: vec![("測試", 100).into()]
},
TrieBuilderNode {
id: 4,
syllable: Some(syl![Bopomofo::S, Bopomofo::U, Bopomofo::O, Bopomofo::TONE3]),
children: vec![],
leaf_id: NonZeroUsize::new(5),
phrases: vec![]
},
TrieBuilderNode {
id: 5,
syllable: None,
children: vec![],
leaf_id: None,
phrases: vec![("廁所", 100).into()]
}
],
builder.arena
);
Ok(())
}
#[test]
fn tree_lookup_word() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = TrieBuilder::new();
builder.insert(
&[syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]],
("測", 1).into(),
)?;
builder.insert(
&[syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]],
("冊", 1).into(),
)?;
let mut cursor = Cursor::new(vec![]);
builder.write(&mut cursor)?;
cursor.rewind()?;
let dict = Trie::new(&mut cursor)?;
assert_eq!(
vec![Phrase::new("測", 1), Phrase::new("冊", 1)],
dict.lookup(
&[syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]],
LookupStrategy::Standard
)
);
Ok(())
}
#[test]
fn tree_lookup_word_fuzzy() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = TrieBuilder::new();
builder.insert(
&[syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]],
("測", 1).into(),
)?;
builder.insert(
&[syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]],
("冊", 1).into(),
)?;
let mut cursor = Cursor::new(vec![]);
builder.write(&mut cursor)?;
cursor.rewind()?;
let dict = TrieOpenOptions::new().read_from(&mut cursor)?;
assert_eq!(
vec![Phrase::new("測", 1), Phrase::new("冊", 1)],
dict.lookup(
&[syl![Bopomofo::C, Bopomofo::E]],
LookupStrategy::FuzzyPartialPrefix
)
);
assert_eq!(
vec![Phrase::new("測", 1), Phrase::new("冊", 1)],
dict.lookup(&[syl![Bopomofo::C]], LookupStrategy::FuzzyPartialPrefix)
);
Ok(())
}
#[test]
fn tree_lookup_phrase() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = TrieBuilder::new();
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("測試", 1).into(),
)?;
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("策試", 2).into(),
)?;
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
syl![Bopomofo::CH, Bopomofo::ENG, Bopomofo::TONE2],
syl![Bopomofo::G, Bopomofo::U, Bopomofo::ENG],
],
("測試成功", 3).into(),
)?;
let mut cursor = Cursor::new(vec![]);
builder.write(&mut cursor)?;
cursor.rewind()?;
let dict = Trie::new(&mut cursor)?;
assert_eq!(
vec![Phrase::new("策試", 2), Phrase::new("測試", 1)],
dict.lookup(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4]
],
LookupStrategy::Standard
)
);
assert_eq!(
vec![Phrase::new("測試成功", 3)],
dict.lookup(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
syl![Bopomofo::CH, Bopomofo::ENG, Bopomofo::TONE2],
syl![Bopomofo::G, Bopomofo::U, Bopomofo::ENG],
],
LookupStrategy::Standard
)
);
assert_eq!(
Vec::<Phrase>::new(),
dict.lookup(
&[
syl![Bopomofo::C, Bopomofo::U, Bopomofo::O, Bopomofo::TONE4],
syl![Bopomofo::U, Bopomofo::TONE4]
],
LookupStrategy::Standard
)
);
Ok(())
}
#[test]
fn tree_lookup_phrase_fuzzy() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = TrieBuilder::new();
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("測試", 1).into(),
)?;
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("策試", 2).into(),
)?;
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
syl![Bopomofo::CH, Bopomofo::ENG, Bopomofo::TONE2],
syl![Bopomofo::G, Bopomofo::U, Bopomofo::ENG],
],
("測試成功", 3).into(),
)?;
let mut cursor = Cursor::new(vec![]);
builder.write(&mut cursor)?;
cursor.rewind()?;
let dict = TrieOpenOptions::new()
.fuzzy_search(true)
.read_from(&mut cursor)?;
assert_eq!(
vec![Phrase::new("策試", 2), Phrase::new("測試", 1)],
dict.lookup(
&[syl![Bopomofo::C, Bopomofo::E], syl![Bopomofo::SH]],
LookupStrategy::FuzzyPartialPrefix
)
);
assert_eq!(
vec![Phrase::new("策試", 2), Phrase::new("測試", 1)],
dict.lookup(
&[syl![Bopomofo::C], syl![Bopomofo::SH]],
LookupStrategy::FuzzyPartialPrefix
)
);
assert_eq!(
vec![Phrase::new("測試成功", 3)],
dict.lookup(
&[
syl![Bopomofo::C, Bopomofo::E],
syl![Bopomofo::SH],
syl![Bopomofo::CH, Bopomofo::ENG],
syl![Bopomofo::G, Bopomofo::U, Bopomofo::ENG],
],
LookupStrategy::FuzzyPartialPrefix
)
);
assert_eq!(
vec![Phrase::new("測試成功", 3)],
dict.lookup(
&[
syl![Bopomofo::C],
syl![Bopomofo::SH],
syl![Bopomofo::CH],
syl![Bopomofo::G],
],
LookupStrategy::FuzzyPartialPrefix
)
);
assert_eq!(
Vec::<Phrase>::new(),
dict.lookup(
&[
syl![Bopomofo::C, Bopomofo::U, Bopomofo::O, Bopomofo::TONE4],
syl![Bopomofo::U, Bopomofo::TONE4]
],
LookupStrategy::FuzzyPartialPrefix
)
);
Ok(())
}
#[test]
fn tree_builder_duplicate_phrase() {
let mut builder = TrieBuilder::new();
builder
.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("測試", 1).into(),
)
.expect("no error");
builder
.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("測試", 2).into(),
)
.expect("no error");
}
#[test]
fn stable_word_sort_order() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = TrieBuilder::new();
for word in ["冊", "策", "測", "側"] {
builder.insert(
&[syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4]],
(word, 0).into(),
)?;
}
let mut cursor = Cursor::new(vec![]);
builder.write(&mut cursor)?;
cursor.rewind()?;
let dict = Trie::new(&mut cursor)?;
assert_eq!(
vec![
Phrase::new("冊", 0),
Phrase::new("策", 0),
Phrase::new("測", 0),
Phrase::new("側", 0),
],
dict.lookup(
&[syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],],
LookupStrategy::Standard
)
);
Ok(())
}
#[test]
fn stable_phrase_sort_order() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = TrieBuilder::new();
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("側室", 318).into(),
)?;
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("側視", 318).into(),
)?;
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("策士", 318).into(),
)?;
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("策試", 318).into(),
)?;
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("測試", 9318).into(),
)?;
let mut cursor = Cursor::new(vec![]);
builder.write(&mut cursor)?;
cursor.rewind()?;
let dict = Trie::new(&mut cursor)?;
assert_eq!(
vec![
Phrase::new("測試", 9318),
Phrase::new("策試", 318),
Phrase::new("策士", 318),
Phrase::new("側視", 318),
Phrase::new("側室", 318),
],
dict.lookup(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
LookupStrategy::Standard
)
);
Ok(())
}
#[test]
fn tree_builder_write_read_metadata() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = TrieBuilder::new();
let info = DictionaryInfo {
name: "name".into(),
copyright: "copyright".into(),
license: "license".into(),
version: "version".into(),
software: "software".into(),
usage: DictionaryUsage::BuiltIn,
};
builder.set_info(info)?;
let mut cursor = Cursor::new(vec![]);
builder.write(&mut cursor)?;
cursor.rewind()?;
let dict = Trie::new(&mut cursor)?;
let info = dict.about();
assert_eq!("name", info.name);
assert_eq!("copyright", info.copyright);
assert_eq!("license", info.license);
assert_eq!("version", info.version);
assert_eq!("software", info.software);
Ok(())
}
#[test]
fn tree_entries() -> Result<(), Box<dyn std::error::Error>> {
let mut builder = TrieBuilder::new();
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("測試", 1).into(),
)?;
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
],
("策試", 2).into(),
)?;
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
syl![Bopomofo::CH, Bopomofo::ENG, Bopomofo::TONE2],
syl![Bopomofo::G, Bopomofo::U, Bopomofo::ENG],
],
("測試成功", 3).into(),
)?;
builder.insert(
&[
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
syl![Bopomofo::SH],
syl![Bopomofo::B, Bopomofo::AI, Bopomofo::TONE4],
],
("測試失敗", 3).into(),
)?;
let mut cursor = Cursor::new(vec![]);
builder.write(&mut cursor)?;
cursor.rewind()?;
let dict = Trie::new(&mut cursor)?;
assert_eq!(
vec![
(
vec![
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
syl![Bopomofo::CH, Bopomofo::ENG, Bopomofo::TONE2],
syl![Bopomofo::G, Bopomofo::U, Bopomofo::ENG],
],
Phrase::new("測試成功", 3)
),
(
vec![
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4]
],
Phrase::new("策試", 2)
),
(
vec![
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4]
],
Phrase::new("測試", 1)
),
(
vec![
syl![Bopomofo::C, Bopomofo::E, Bopomofo::TONE4],
syl![Bopomofo::SH, Bopomofo::TONE4],
syl![Bopomofo::SH],
syl![Bopomofo::B, Bopomofo::AI, Bopomofo::TONE4],
],
Phrase::new("測試失敗", 3)
),
],
dict.entries().collect::<Vec<_>>()
);
Ok(())
}
}