crawdad_rkyv/
lib.rs

1//! 🦞 Crawdad: ChaRActer-Wise Double-Array Dictionary
2//!
3//! Crawdad is a library of natural language dictionaries using character-wise double-array tries.
4//! The implementation is optimized for strings of multibyte-characters,
5//! and you can enjoy fast text processing on such strings such as Japanese or Chinese.
6//!
7//! # Data structures
8//!
9//! Crawdad contains the two trie implementations:
10//!
11//! - [`Trie`] is a standard trie form that often provides the fastest queries.
12//! - [`MpTrie`] is a minimal-prefix trie form that is memory-efficient for long strings.
13//!
14//! # Examples
15//!
16//! ## Looking up an input key
17//!
18//! To get a value associated with an input key, use [`Trie::exact_match()`].
19//!
20//! ```
21//! use crawdad_rkyv::Trie;
22//!
23//! let keys = vec!["世界", "世界中", "国民"];
24//! let trie = Trie::from_keys(&keys).unwrap();
25//!
26//! assert_eq!(trie.exact_match("世界中".chars()), Some(1));
27//! assert_eq!(trie.exact_match("日本中".chars()), None);
28//! ```
29//!
30//! ## Finding all occurrences of keys in an input text
31//!
32//! To search for all occurrences of registered keys in an input text,
33//! use [`Trie::common_prefix_search()`] for all starting positions in the text.
34//!
35//! ```
36//! use crawdad_rkyv::Trie;
37//!
38//! let keys = vec!["世界", "世界中", "国民"];
39//! let trie = Trie::from_keys(&keys).unwrap();
40//!
41//! let haystack: Vec<char> = "国民が世界中にて".chars().collect();
42//! let mut matches = vec![];
43//!
44//! for i in 0..haystack.len() {
45//!     for (v, j) in trie.common_prefix_search(haystack[i..].iter().copied()) {
46//!         matches.push((v, i..i + j));
47//!     }
48//! }
49//!
50//! assert_eq!(
51//!     matches,
52//!     vec![(2, 0..2), (0, 3..5), (1, 3..6)]
53//! );
54//! ```
55//!
56//! ## Serializing and deserializing the data structure
57//!
58//! To serialize/deserialize the data structure into/from a byte sequence,
59//! use [`Trie::serialize_to_vec()`]/[`Trie::deserialize_from_slice()`].
60//!
61//! ```
62//! use crawdad_rkyv::Trie;
63//!
64//! let keys = vec!["世界", "世界中", "国民"];
65//! let trie = Trie::from_keys(&keys).unwrap();
66//!
67//! let bytes = trie.serialize_to_vec();
68//! let (other, _) = Trie::deserialize_from_slice(&bytes);
69//!
70//! assert_eq!(trie.io_bytes(), other.io_bytes());
71//! ```
72#![deny(missing_docs)]
73#![no_std]
74
75#[cfg(target_pointer_width = "16")]
76compile_error!("`target_pointer_width` must be larger than or equal to 32");
77
78#[cfg(not(feature = "alloc"))]
79compile_error!("`alloc` feature is currently required to build this crate");
80
81#[macro_use]
82extern crate alloc;
83
84mod builder;
85pub mod errors;
86mod mapper;
87pub mod mptrie;
88pub mod trie;
89mod utils;
90
91pub(crate) const OFFSET_MASK: u32 = 0x7fff_ffff;
92pub(crate) const INVALID_IDX: u32 = 0xffff_ffff;
93pub(crate) const MAX_VALUE: u32 = OFFSET_MASK;
94pub(crate) const END_CODE: u32 = 0;
95
96/// Special terminator, which must not be contained in keys.
97pub const END_MARKER: char = '\u{ffff}';
98
99pub use mptrie::MpTrie;
100use rkyv::{Archive, Deserialize, Serialize};
101pub use trie::Trie;
102
103#[derive(Default, Clone, Copy, Debug, PartialEq, Eq, Archive, Serialize, Deserialize)]
104struct Node {
105    base: u32,
106    check: u32,
107}
108
109impl Node {
110    #[inline(always)]
111    pub const fn get_base(&self) -> u32 {
112        self.base & OFFSET_MASK
113    }
114
115    #[inline(always)]
116    pub const fn get_check(&self) -> u32 {
117        self.check & OFFSET_MASK
118    }
119
120    #[inline(always)]
121    pub const fn is_leaf(&self) -> bool {
122        self.base & !OFFSET_MASK != 0
123    }
124
125    #[inline(always)]
126    pub const fn has_leaf(&self) -> bool {
127        self.check & !OFFSET_MASK != 0
128    }
129
130    #[inline(always)]
131    pub const fn is_vacant(&self) -> bool {
132        self.base == OFFSET_MASK && self.check == OFFSET_MASK
133    }
134
135    pub const fn io_bytes() -> usize {
136        8
137    }
138
139    #[inline(always)]
140    fn serialize(&self) -> [u8; 8] {
141        let mut bytes = [0; 8];
142        bytes[0..4].copy_from_slice(&self.base.to_le_bytes());
143        bytes[4..8].copy_from_slice(&self.check.to_le_bytes());
144        bytes
145    }
146
147    #[inline(always)]
148    fn deserialize(bytes: [u8; 8]) -> Self {
149        Self {
150            base: u32::from_le_bytes(bytes[0..4].try_into().unwrap()),
151            check: u32::from_le_bytes(bytes[4..8].try_into().unwrap()),
152        }
153    }
154}
155
156impl ArchivedNode {
157    #[inline(always)]
158    pub fn get_base(&self) -> u32 {
159        self.base & OFFSET_MASK
160    }
161
162    #[inline(always)]
163    pub fn get_check(&self) -> u32 {
164        self.check & OFFSET_MASK
165    }
166
167    #[inline(always)]
168    pub fn is_leaf(&self) -> bool {
169        self.base & !OFFSET_MASK != 0
170    }
171
172    #[inline(always)]
173    pub fn has_leaf(&self) -> bool {
174        self.check & !OFFSET_MASK != 0
175    }
176
177    #[inline(always)]
178    pub fn is_vacant(&self) -> bool {
179        self.base == OFFSET_MASK && self.check == OFFSET_MASK
180    }
181}