crawdad_rkyv/lib.rs
1//! 🦞 Crawdad: ChaRActer-Wise Double-Array Dictionary
2//!
3//! Crawdad is a library of natural language dictionaries using character-wise double-array tries.
4//! The implementation is optimized for strings of multibyte-characters,
5//! and you can enjoy fast text processing on such strings such as Japanese or Chinese.
6//!
7//! # Data structures
8//!
9//! Crawdad contains the two trie implementations:
10//!
11//! - [`Trie`] is a standard trie form that often provides the fastest queries.
12//! - [`MpTrie`] is a minimal-prefix trie form that is memory-efficient for long strings.
13//!
14//! # Examples
15//!
16//! ## Looking up an input key
17//!
18//! To get a value associated with an input key, use [`Trie::exact_match()`].
19//!
20//! ```
21//! use crawdad_rkyv::Trie;
22//!
23//! let keys = vec!["世界", "世界中", "国民"];
24//! let trie = Trie::from_keys(&keys).unwrap();
25//!
26//! assert_eq!(trie.exact_match("世界中".chars()), Some(1));
27//! assert_eq!(trie.exact_match("日本中".chars()), None);
28//! ```
29//!
30//! ## Finding all occurrences of keys in an input text
31//!
32//! To search for all occurrences of registered keys in an input text,
33//! use [`Trie::common_prefix_search()`] for all starting positions in the text.
34//!
35//! ```
36//! use crawdad_rkyv::Trie;
37//!
38//! let keys = vec!["世界", "世界中", "国民"];
39//! let trie = Trie::from_keys(&keys).unwrap();
40//!
41//! let haystack: Vec<char> = "国民が世界中にて".chars().collect();
42//! let mut matches = vec![];
43//!
44//! for i in 0..haystack.len() {
45//! for (v, j) in trie.common_prefix_search(haystack[i..].iter().copied()) {
46//! matches.push((v, i..i + j));
47//! }
48//! }
49//!
50//! assert_eq!(
51//! matches,
52//! vec![(2, 0..2), (0, 3..5), (1, 3..6)]
53//! );
54//! ```
55//!
56//! ## Serializing and deserializing the data structure
57//!
58//! To serialize/deserialize the data structure into/from a byte sequence,
59//! use [`Trie::serialize_to_vec()`]/[`Trie::deserialize_from_slice()`].
60//!
61//! ```
62//! use crawdad_rkyv::Trie;
63//!
64//! let keys = vec!["世界", "世界中", "国民"];
65//! let trie = Trie::from_keys(&keys).unwrap();
66//!
67//! let bytes = trie.serialize_to_vec();
68//! let (other, _) = Trie::deserialize_from_slice(&bytes);
69//!
70//! assert_eq!(trie.io_bytes(), other.io_bytes());
71//! ```
72#![deny(missing_docs)]
73#![no_std]
74
75#[cfg(target_pointer_width = "16")]
76compile_error!("`target_pointer_width` must be larger than or equal to 32");
77
78#[cfg(not(feature = "alloc"))]
79compile_error!("`alloc` feature is currently required to build this crate");
80
81#[macro_use]
82extern crate alloc;
83
84mod builder;
85pub mod errors;
86mod mapper;
87pub mod mptrie;
88pub mod trie;
89mod utils;
90
91pub(crate) const OFFSET_MASK: u32 = 0x7fff_ffff;
92pub(crate) const INVALID_IDX: u32 = 0xffff_ffff;
93pub(crate) const MAX_VALUE: u32 = OFFSET_MASK;
94pub(crate) const END_CODE: u32 = 0;
95
96/// Special terminator, which must not be contained in keys.
97pub const END_MARKER: char = '\u{ffff}';
98
99pub use mptrie::MpTrie;
100use rkyv::{Archive, Deserialize, Serialize};
101pub use trie::Trie;
102
103#[derive(Default, Clone, Copy, Debug, PartialEq, Eq, Archive, Serialize, Deserialize)]
104struct Node {
105 base: u32,
106 check: u32,
107}
108
109impl Node {
110 #[inline(always)]
111 pub const fn get_base(&self) -> u32 {
112 self.base & OFFSET_MASK
113 }
114
115 #[inline(always)]
116 pub const fn get_check(&self) -> u32 {
117 self.check & OFFSET_MASK
118 }
119
120 #[inline(always)]
121 pub const fn is_leaf(&self) -> bool {
122 self.base & !OFFSET_MASK != 0
123 }
124
125 #[inline(always)]
126 pub const fn has_leaf(&self) -> bool {
127 self.check & !OFFSET_MASK != 0
128 }
129
130 #[inline(always)]
131 pub const fn is_vacant(&self) -> bool {
132 self.base == OFFSET_MASK && self.check == OFFSET_MASK
133 }
134
135 pub const fn io_bytes() -> usize {
136 8
137 }
138
139 #[inline(always)]
140 fn serialize(&self) -> [u8; 8] {
141 let mut bytes = [0; 8];
142 bytes[0..4].copy_from_slice(&self.base.to_le_bytes());
143 bytes[4..8].copy_from_slice(&self.check.to_le_bytes());
144 bytes
145 }
146
147 #[inline(always)]
148 fn deserialize(bytes: [u8; 8]) -> Self {
149 Self {
150 base: u32::from_le_bytes(bytes[0..4].try_into().unwrap()),
151 check: u32::from_le_bytes(bytes[4..8].try_into().unwrap()),
152 }
153 }
154}
155
156impl ArchivedNode {
157 #[inline(always)]
158 pub fn get_base(&self) -> u32 {
159 self.base & OFFSET_MASK
160 }
161
162 #[inline(always)]
163 pub fn get_check(&self) -> u32 {
164 self.check & OFFSET_MASK
165 }
166
167 #[inline(always)]
168 pub fn is_leaf(&self) -> bool {
169 self.base & !OFFSET_MASK != 0
170 }
171
172 #[inline(always)]
173 pub fn has_leaf(&self) -> bool {
174 self.check & !OFFSET_MASK != 0
175 }
176
177 #[inline(always)]
178 pub fn is_vacant(&self) -> bool {
179 self.base == OFFSET_MASK && self.check == OFFSET_MASK
180 }
181}