Skip to main content

gukhanmun_cdb/
lib.rs

// Gukhanmun: CDB dictionary backend for Gukhanmun.
// Copyright (C) 2026  Hong Minhee
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17//! CDB dictionary backend for Gukhanmun.
18
19#![forbid(unsafe_code)]
20#![deny(missing_docs)]
21
22use std::collections::BTreeMap;
23use std::ops::Deref;
24use std::path::Path;
25use std::sync::Arc;
26
27use ciborium::de::from_reader;
28use gukhanmun_core::{DictionaryRecord, HanjaDictionary, Match, MatchMark};
29
/// Reserved CDB key under which the CBOR-encoded metadata map is stored.
/// Records with this key are skipped when enumerating dictionary entries.
const META_KEY: &[u8] = b"__gukhanmun_meta__";
/// Bit of a record's mark byte that maps to `MatchMark::require_hanja`.
const MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
/// Bit of a record's mark byte that maps to `MatchMark::require_hangul`.
const MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;

/// The CDB header is 2048 bytes: 256 slots × (u32le pos, u32le count).
const HEADER_SIZE: usize = 2048;
36
/// Byte storage behind a [`CdbDictionary`].
///
/// The bytes either live in a reference-counted allocation owned by the
/// dictionary (used by [`CdbDictionary::from_bytes`] and
/// [`CdbDictionary::open`]) or are borrowed from `'static` memory without
/// copying (used by [`CdbDictionary::from_static_bytes`]).
enum DataSource {
    /// Bytes held in a shared, heap-allocated slice.
    Owned(Arc<[u8]>),
    /// Bytes borrowed for the whole program lifetime.
    Static(&'static [u8]),
}

impl Deref for DataSource {
    type Target = [u8];

    /// Exposes the stored bytes as a plain slice, whichever variant holds
    /// them.
    fn deref(&self) -> &Self::Target {
        match self {
            Self::Owned(shared) => &**shared,
            Self::Static(bytes) => *bytes,
        }
    }
}
56
57/// Dictionary backed by a Gukhanmun CDB-trie file.
58///
59/// The CDB bytes are held in an internal backing store that is either an
60/// [`Arc<[u8]>`]-backed owned allocation or a zero-copy borrow of a
61/// `'static` slice, depending on how the dictionary was loaded.
62pub struct CdbDictionary {
63    metadata: BTreeMap<String, String>,
64    data: DataSource,
65    entry_count: u64,
66    max_word_chars: Option<usize>,
67}
68
69impl CdbDictionary {
70    /// Opens a dictionary file from disk.
71    pub fn open(path: impl AsRef<Path>) -> Result<Self, Error> {
72        let path = path.as_ref();
73        let bytes = std::fs::read(path).map_err(|source| Error::Open {
74            path: path.display().to_string(),
75            source,
76        })?;
77        Self::from_source(DataSource::Owned(Arc::from(bytes.as_slice())))
78    }
79
80    /// Decodes a dictionary from bytes in the Gukhanmun CDB-trie format.
81    pub fn from_bytes(bytes: &[u8]) -> Result<Self, Error> {
82        Self::from_source(DataSource::Owned(Arc::from(bytes)))
83    }
84
85    /// Decodes a dictionary from static bytes in the Gukhanmun CDB-trie
86    /// format.
87    ///
88    /// This is intended for embedded dictionaries built with
89    /// `include_bytes!`.  The CDB data is referenced directly without
90    /// copying.
91    pub fn from_static_bytes(bytes: &'static [u8]) -> Result<Self, Error> {
92        Self::from_source(DataSource::Static(bytes))
93    }
94
95    fn from_source(data: DataSource) -> Result<Self, Error> {
96        let metadata_bytes = cdb_get(&data, META_KEY)?.ok_or(Error::MissingRecord {
97            record: "dictionary metadata",
98        })?;
99        let metadata = from_reader::<BTreeMap<String, String>, _>(metadata_bytes.as_slice())
100            .map_err(|source| Error::MetadataDecode { source })?;
101        if let Some(version) = metadata.get("version")
102            && version != "1"
103        {
104            tracing::error!(version = %version, expected = "1", "unsupported CDB format version");
105            return Err(Error::UnsupportedVersion {
106                version: version.clone(),
107            });
108        }
109        let entry_count = parse_u64_metadata(&metadata, "entry_count").unwrap_or(0);
110        let max_word_chars = parse_usize_metadata(&metadata, "max_word_chars");
111
112        tracing::info!(
113            format_version = metadata.get("version").map(String::as_str).unwrap_or("1"),
114            entry_count,
115            "loaded CDB dictionary"
116        );
117        Ok(Self {
118            metadata,
119            data,
120            entry_count,
121            max_word_chars,
122        })
123    }
124
125    /// Returns build metadata embedded in the dictionary file.
126    pub fn metadata(&self) -> &BTreeMap<String, String> {
127        &self.metadata
128    }
129
130    /// Returns the number of complete dictionary entries recorded at build
131    /// time.
132    pub fn entry_count(&self) -> u64 {
133        self.entry_count
134    }
135
136    /// Returns the exact dictionary entry for `hanja`, if present.
137    pub fn lookup(&self, hanja: &str) -> Result<Option<LookupEntry>, Error> {
138        let Some(value) = cdb_get(&self.data, hanja.as_bytes())? else {
139            return Ok(None);
140        };
141        let Some(record) = decode_record(&value)? else {
142            return Ok(None);
143        };
144        Ok(Some(record))
145    }
146}
147
148impl HanjaDictionary for CdbDictionary {
149    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
150        let max_word_chars = self.max_word_chars.unwrap_or(usize::MAX);
151        let mut matches = Vec::new();
152        let mut prefix = String::new();
153
154        for (index, ch) in s.chars().enumerate() {
155            if index >= max_word_chars {
156                break;
157            }
158            prefix.push(ch);
159            let value = match cdb_get(&self.data, prefix.as_bytes()) {
160                Ok(Some(value)) => value,
161                Ok(None) => break,
162                Err(error) => {
163                    tracing::warn!(
164                        prefix_len = prefix.len(),
165                        error = ?error,
166                        "aborting CDB prefix traversal due to read error"
167                    );
168                    break;
169                }
170            };
171            match decode_record(&value) {
172                Ok(Some(entry)) => {
173                    matches.push(Match {
174                        byte_len: prefix.len(),
175                        reading: entry.reading,
176                        suffix_reading: None,
177                        mark: entry.mark,
178                    });
179                }
180                Ok(None) => {}
181                Err(error) => {
182                    tracing::warn!(
183                        prefix_len = prefix.len(),
184                        error = ?error,
185                        "aborting CDB prefix traversal due to decode error"
186                    );
187                    break;
188                }
189            }
190        }
191
192        Box::new(matches.into_iter())
193    }
194
195    fn max_word_chars(&self) -> Option<usize> {
196        self.max_word_chars
197    }
198
199    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
200        let mut records = Vec::new();
201        for result in cdb_iter(&self.data) {
202            let (key, value) = match result {
203                Ok(pair) => pair,
204                Err(error) => {
205                    tracing::warn!(error = ?error, "skipping CDB entry due to iterator error");
206                    continue;
207                }
208            };
209            if key == META_KEY {
210                continue;
211            }
212            let entry = match decode_record(&value) {
213                Ok(Some(entry)) => entry,
214                Ok(None) => continue,
215                Err(error) => {
216                    tracing::warn!(error = ?error, "skipping malformed CDB entry");
217                    continue;
218                }
219            };
220            let Ok(hanja) = String::from_utf8(key) else {
221                tracing::warn!("skipping CDB entry with non-UTF-8 key");
222                continue;
223            };
224            records.push(DictionaryRecord {
225                hanja,
226                reading: entry.reading,
227                mark: entry.mark,
228            });
229        }
230        Some(Box::new(records.into_iter()))
231    }
232
233    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
234        for result in cdb_iter(&self.data) {
235            let (key, value) = match result {
236                Ok(pair) => pair,
237                Err(_) => continue,
238            };
239            if key == META_KEY || key == hanja.as_bytes() {
240                continue;
241            }
242            if decode_record(&value).is_ok_and(|entry| entry.is_some_and(|e| e.reading == reading))
243            {
244                return true;
245            }
246        }
247        false
248    }
249}
250
251/// A decoded exact-match dictionary entry.
252#[derive(Clone, Debug, Eq, PartialEq)]
253pub struct LookupEntry {
254    reading: String,
255    mark: MatchMark,
256}
257
258impl LookupEntry {
259    /// Returns the hangul reading for the entry.
260    pub fn reading(&self) -> &str {
261        &self.reading
262    }
263
264    /// Returns dictionary-provided rendering constraints.
265    pub fn mark(&self) -> MatchMark {
266        self.mark
267    }
268}
269
/// Error returned while opening or decoding a CDB dictionary.
///
/// The enum is `#[non_exhaustive]`, so matches outside this crate need a
/// wildcard arm.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum Error {
    /// Opening a CDB dictionary file failed.
    #[error("failed to open {path}: {source}")]
    Open {
        /// Path that failed to open.
        path: String,
        /// Underlying I/O error.
        #[source]
        source: std::io::Error,
    },

    /// A required record is missing.
    #[error("missing {record}")]
    MissingRecord {
        /// Human-readable record name.
        record: &'static str,
    },

    /// CBOR metadata could not be decoded.
    #[error("failed to decode dictionary metadata: {source}")]
    MetadataDecode {
        /// Underlying CBOR decode error.
        #[source]
        source: ciborium::de::Error<std::io::Error>,
    },

    /// The metadata version is not supported.
    #[error("unsupported dictionary version {version}")]
    UnsupportedVersion {
        /// Version string read from metadata.
        version: String,
    },

    /// A CDB value did not match the expected record layout.
    #[error("malformed CDB record: {reason}")]
    MalformedRecord {
        /// Description of the malformed record condition.
        reason: &'static str,
    },

    /// A CDB value range overflowed while decoding.
    #[error("{field} overflow")]
    ValueOverflow {
        /// Field that overflowed.
        field: &'static str,
    },

    /// A CDB value range points outside the record.
    #[error("{field} is outside the CDB record")]
    ValueOutOfBounds {
        /// Field that was out of bounds.
        field: &'static str,
    },

    /// A UTF-8 string field was invalid.
    #[error("{field} contains invalid UTF-8: {source}")]
    InvalidUtf8 {
        /// Field that contained invalid UTF-8.
        field: &'static str,
        /// Underlying UTF-8 error.
        #[source]
        source: std::str::Utf8Error,
    },

    /// The CDB data is shorter than the required 2048-byte header.
    #[error("CDB data is too short: {len} bytes")]
    TooShort {
        /// Actual byte length of the data.
        len: usize,
    },

    /// A CDB header slot or record points outside the data buffer.
    #[error("CDB offset {offset} is out of bounds (data len {len})")]
    OutOfBounds {
        /// The out-of-bounds offset.
        offset: usize,
        /// The data length.
        len: usize,
    },
}
353
354// ── Pure CDB operations ────────────────────────────────────────────────────
355
/// Hash function of the CDB format (Daniel J. Bernstein's hash).
///
/// Starts from 5381 and, for every key byte, computes
/// `hash = (hash * 33) ^ byte` with wrapping 32-bit arithmetic.
fn cdb_hash(key: &[u8]) -> u32 {
    let mut hash: u32 = 5381;
    for &byte in key {
        // `(hash << 5) + hash` is `hash * 33`; overflow bits are discarded.
        hash = (hash << 5).wrapping_add(hash) ^ u32::from(byte);
    }
    hash
}
362
/// Read a little-endian `u32` at `offset` from `data`.
///
/// Returns `None` when the four bytes starting at `offset` are not fully
/// inside `data`, including when `offset + 4` would overflow `usize`, so
/// callers may pass offsets taken straight from untrusted file contents.
fn read_u32(data: &[u8], offset: usize) -> Option<u32> {
    // Checked so that an offset near `usize::MAX` is reported as out of
    // bounds instead of overflowing.
    let end = offset.checked_add(4)?;
    let bytes: [u8; 4] = data.get(offset..end)?.try_into().ok()?;
    Some(u32::from_le_bytes(bytes))
}
370
371/// Look up `key` in the CDB data buffer.  Returns `Ok(None)` when the key is
372/// absent, `Ok(Some(value_bytes))` on a match, and `Err` on format errors.
373fn cdb_get(data: &[u8], key: &[u8]) -> Result<Option<Vec<u8>>, Error> {
374    if data.len() < HEADER_SIZE {
375        return Err(Error::TooShort { len: data.len() });
376    }
377
378    let h = cdb_hash(key);
379    let header_slot = (h & 0xff) as usize;
380    let header_base = header_slot * 8;
381
382    let table_pos = read_u32(data, header_base).ok_or(Error::OutOfBounds {
383        offset: header_base,
384        len: data.len(),
385    })? as usize;
386    let table_count = read_u32(data, header_base + 4).ok_or(Error::OutOfBounds {
387        offset: header_base + 4,
388        len: data.len(),
389    })? as usize;
390
391    if table_count == 0 {
392        return Ok(None);
393    }
394
395    let start_slot = ((h >> 8) as usize) % table_count;
396
397    for i in 0..table_count {
398        let slot = (start_slot + i) % table_count;
399        let slot_offset = table_pos + slot * 8;
400
401        let slot_hash = match read_u32(data, slot_offset) {
402            Some(v) => v,
403            None => return Ok(None),
404        };
405        let data_pos = match read_u32(data, slot_offset + 4) {
406            Some(v) => v as usize,
407            None => return Ok(None),
408        };
409
410        if data_pos == 0 {
411            return Ok(None);
412        }
413
414        if slot_hash == h {
415            let key_len = match read_u32(data, data_pos) {
416                Some(v) => v as usize,
417                None => continue,
418            };
419            let val_len = match read_u32(data, data_pos + 4) {
420                Some(v) => v as usize,
421                None => continue,
422            };
423            let key_start = data_pos + 8;
424            let val_start = key_start.saturating_add(key_len);
425            let val_end = val_start.saturating_add(val_len);
426
427            if val_end > data.len() {
428                continue;
429            }
430            if data[key_start..key_start + key_len] == *key {
431                return Ok(Some(data[val_start..val_end].to_vec()));
432            }
433        }
434    }
435
436    Ok(None)
437}
438
439/// Iterate all records in the CDB data area (bytes 2048 up to the first
440/// hash table).  Yields `(key_bytes, value_bytes)` pairs.
441fn cdb_iter(data: &[u8]) -> impl Iterator<Item = Result<(Vec<u8>, Vec<u8>), Error>> + '_ {
442    // The data area ends where the first hash table begins.
443    let data_end = (0..256usize)
444        .filter_map(|i| {
445            let pos = read_u32(data, i * 8)? as usize;
446            if pos >= HEADER_SIZE { Some(pos) } else { None }
447        })
448        .min()
449        .unwrap_or(data.len());
450
451    CdbIter {
452        data,
453        pos: HEADER_SIZE,
454        data_end,
455    }
456}
457
458struct CdbIter<'a> {
459    data: &'a [u8],
460    pos: usize,
461    data_end: usize,
462}
463
464impl<'a> Iterator for CdbIter<'a> {
465    type Item = Result<(Vec<u8>, Vec<u8>), Error>;
466
467    fn next(&mut self) -> Option<Self::Item> {
468        if self.pos >= self.data_end {
469            return None;
470        }
471
472        let key_len = read_u32(self.data, self.pos)? as usize;
473        let val_len = read_u32(self.data, self.pos + 4)? as usize;
474        let key_start = self.pos + 8;
475        let val_start = key_start + key_len;
476        let next_pos = val_start + val_len;
477
478        if next_pos > self.data_end {
479            self.pos = self.data_end;
480            return Some(Err(Error::OutOfBounds {
481                offset: next_pos,
482                len: self.data_end,
483            }));
484        }
485
486        let key = self.data[key_start..key_start + key_len].to_vec();
487        let val = self.data[val_start..val_start + val_len].to_vec();
488        self.pos = next_pos;
489        Some(Ok((key, val)))
490    }
491}
492
493// ── Record helpers ─────────────────────────────────────────────────────────
494
495fn decode_record(value: &[u8]) -> Result<Option<LookupEntry>, Error> {
496    if value.len() < 4 {
497        return Err(Error::MalformedRecord {
498            reason: "record is shorter than the fixed prefix",
499        });
500    }
501    if value[0] == 0 {
502        return Ok(None);
503    }
504
505    let mark = decode_mark(value[1]);
506    let reading_len = u16::from_le_bytes([value[2], value[3]]) as usize;
507    let reading_end = 4usize
508        .checked_add(reading_len)
509        .ok_or(Error::ValueOverflow {
510            field: "reading range",
511        })?;
512    let reading_bytes = value
513        .get(4..reading_end)
514        .ok_or(Error::ValueOutOfBounds { field: "reading" })?;
515    let reading = std::str::from_utf8(reading_bytes)
516        .map_err(|source| Error::InvalidUtf8 {
517            field: "reading",
518            source,
519        })?
520        .to_owned();
521
522    Ok(Some(LookupEntry { reading, mark }))
523}
524
525fn decode_mark(encoded: u8) -> MatchMark {
526    MatchMark {
527        require_hanja: encoded & MARK_REQUIRE_HANJA != 0,
528        require_hangul: encoded & MARK_REQUIRE_HANGUL != 0,
529    }
530}
531
/// Reads `key` from `metadata` and parses it as a `u64`; `None` when the
/// key is missing or its value is not a valid number.
fn parse_u64_metadata(metadata: &BTreeMap<String, String>, key: &str) -> Option<u64> {
    let raw = metadata.get(key)?;
    raw.parse::<u64>().ok()
}
535
/// Reads `key` from `metadata` and parses it as a `usize`; `None` when the
/// key is missing or its value is not a valid number.
fn parse_usize_metadata(metadata: &BTreeMap<String, String>, key: &str) -> Option<usize> {
    let raw = metadata.get(key)?;
    raw.parse::<usize>().ok()
}
539
540// ── Tests ──────────────────────────────────────────────────────────────────
541
/// Test helper: serializes an optional `(reading, mark)` pair into the
/// record layout that `decode_record` reads.  `None` produces a four-byte
/// all-zero prefix-marker record.
#[cfg(test)]
fn encode_record(entry: Option<(&str, MatchMark)>) -> Vec<u8> {
    let Some((reading, mark)) = entry else {
        return vec![0, 0, 0, 0];
    };

    // The length field is 16 bits wide; the cast truncates longer readings.
    let reading_len = (reading.len() as u16).to_le_bytes();
    let mut output = Vec::with_capacity(4 + reading.len());
    output.push(1);
    output.push(encode_mark(mark));
    output.extend_from_slice(&reading_len);
    output.extend_from_slice(reading.as_bytes());
    output
}
560
/// Test helper: packs a [`MatchMark`] into the mark byte of a record.
#[cfg(test)]
fn encode_mark(mark: MatchMark) -> u8 {
    let hanja_bit = if mark.require_hanja {
        MARK_REQUIRE_HANJA
    } else {
        0
    };
    let hangul_bit = if mark.require_hangul {
        MARK_REQUIRE_HANGUL
    } else {
        0
    };
    hanja_bit | hangul_bit
}
572
573#[cfg(test)]
574mod tests {
575    use std::collections::BTreeMap;
576    use std::fs;
577    use std::path::Path;
578
579    use ciborium::ser::into_writer;
580    use gukhanmun_core::{HanjaDictionary, MapDictionary, MatchMark};
581    use proptest::prelude::*;
582    use tempfile::tempdir;
583    use tracing_test::traced_test;
584
585    use super::{CdbDictionary, META_KEY, encode_record};
586
    /// Opening a file whose metadata declares version "99" must fail with
    /// `UnsupportedVersion` and emit the matching log event.
    #[traced_test]
    #[test]
    fn unsupported_version_emits_error_event() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukcdb");
        // The file contains nothing but a metadata record with a bad version.
        let metadata = BTreeMap::from([("version".to_owned(), "99".to_owned())]);
        let mut metadata_bytes = Vec::new();
        into_writer(&metadata, &mut metadata_bytes).unwrap();
        let mut writer = cdb::CDBWriter::create(path.to_string_lossy().as_ref()).unwrap();
        writer.add(META_KEY, &metadata_bytes).unwrap();
        writer.finish().unwrap();

        let result = CdbDictionary::open(&path);

        assert!(matches!(
            result,
            Err(super::Error::UnsupportedVersion { .. })
        ));
        assert!(logs_contain("unsupported CDB format version"));
    }
607
    /// End-to-end check on a three-entry fixture: metadata accessors, exact
    /// lookup and prefix matching.
    #[test]
    fn loads_metadata_lookup_and_prefix_matches() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukcdb");
        write_fixture(
            &path,
            &[
                entry("行事", "행사", false, false),
                entry("行事場", "행사장", true, false),
                entry("場所", "장소", false, true),
            ],
        );

        let dictionary = CdbDictionary::open(&path).unwrap();

        assert_eq!(dictionary.metadata().get("source").unwrap(), "fixture");
        assert_eq!(dictionary.entry_count(), 3);
        assert_eq!(dictionary.max_word_chars(), Some(3));
        let exact = dictionary.lookup("行事場").unwrap().unwrap();
        assert_eq!(exact.reading(), "행사장");
        assert!(exact.mark().require_hanja);
        assert!(!exact.mark().require_hangul);
        // Both "行事" and "行事場" are prefixes of the input, shortest first.
        let matches = dictionary.matches_at("行事場入口").collect::<Vec<_>>();
        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0].reading, "행사");
        assert_eq!(matches[1].reading, "행사장");
    }
635
    /// A dictionary built with `from_bytes` must behave like one opened
    /// from the same file with `open`.
    #[test]
    fn from_bytes_matches_open() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukcdb");
        write_fixture(
            &path,
            &[
                entry("行事", "행사", false, false),
                entry("場所", "장소", false, false),
            ],
        );

        let bytes = fs::read(&path).unwrap();
        let from_bytes = CdbDictionary::from_bytes(&bytes).unwrap();
        let from_open = CdbDictionary::open(&path).unwrap();

        assert_eq!(from_bytes.metadata(), from_open.metadata());
        assert_eq!(from_bytes.entry_count(), from_open.entry_count());
        assert_eq!(from_bytes.max_word_chars(), from_open.max_word_chars());

        let bytes_matches = from_bytes.matches_at("行事入口").collect::<Vec<_>>();
        let open_matches = from_open.matches_at("行事入口").collect::<Vec<_>>();
        assert_eq!(bytes_matches, open_matches);
    }
660
    /// `has_homophone` is true only when a different key shares the reading.
    #[test]
    fn has_homophone_detects_other_forms_with_same_reading() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukcdb");
        write_fixture(
            &path,
            &[
                entry("漢字", "한자", false, false),
                entry("翰字", "한자", false, false),
                entry("天地", "천지", false, false),
            ],
        );
        let dictionary = CdbDictionary::open(&path).unwrap();

        // "翰字" shares the reading of "漢字"; nothing shares that of "天地".
        assert!(dictionary.has_homophone("漢字", "한자"));
        assert!(!dictionary.has_homophone("天地", "천지"));
    }
678
    /// Metadata that is not valid CBOR must surface as `MetadataDecode`
    /// with the underlying decode error attached as its source.
    #[test]
    fn open_errors_preserve_structured_variants_and_sources() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukcdb");
        let mut writer = cdb::CDBWriter::create(path.to_string_lossy().as_ref()).unwrap();
        // A lone 0xff byte stands in for corrupt metadata.
        writer.add(META_KEY, &[0xff]).unwrap();
        writer.finish().unwrap();

        let error = match CdbDictionary::open(&path) {
            Ok(_) => panic!("corrupt metadata should fail to open"),
            Err(error) => error,
        };

        assert!(matches!(error, super::Error::MetadataDecode { .. }));
        assert!(std::error::Error::source(&error).is_some());
    }
695
    /// A record whose reading is not valid UTF-8 must make `lookup` fail
    /// with `InvalidUtf8` for the "reading" field.
    #[test]
    fn lookup_errors_distinguish_malformed_records() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukcdb");
        let metadata = BTreeMap::from([
            ("entry_count".to_owned(), "1".to_owned()),
            ("version".to_owned(), "1".to_owned()),
            ("max_word_chars".to_owned(), "2".to_owned()),
        ]);
        let mut metadata_bytes = Vec::new();
        into_writer(&metadata, &mut metadata_bytes).unwrap();
        let mut writer = cdb::CDBWriter::create(path.to_string_lossy().as_ref()).unwrap();
        writer.add(META_KEY, &metadata_bytes).unwrap();
        // Record bytes: present = 1, mark = 0, reading length = 1 (u16le),
        // then the single invalid UTF-8 byte 0xff.
        writer.add("天地".as_bytes(), &[1, 0, 1, 0, 0xff]).unwrap();
        writer.finish().unwrap();
        let dictionary = CdbDictionary::open(&path).unwrap();

        let error = dictionary.lookup("天地").unwrap_err();

        assert!(matches!(
            error,
            super::Error::InvalidUtf8 {
                field: "reading",
                ..
            }
        ));
        assert!(std::error::Error::source(&error).is_some());
    }
724
    proptest! {
        // For generated entry sets, the CDB-backed dictionary must return
        // the same prefix matches as `MapDictionary`, and exact lookup must
        // return each entry's reading.
        #[test]
        fn generated_cdb_matches_map_dictionary(entries in unique_entries()) {
            let temp = tempdir().unwrap();
            let path = temp.path().join("dict.gukcdb");
            let fixture_entries = entries
                .iter()
                .map(|(hanja, reading, require_hanja, require_hangul)| {
                    TestEntry {
                        hanja,
                        reading,
                        mark: MatchMark {
                            require_hanja: *require_hanja,
                            require_hangul: *require_hangul,
                        },
                    }
                })
                .collect::<Vec<_>>();
            write_fixture(&path, &fixture_entries);
            let cdb = CdbDictionary::open(&path).unwrap();
            let mut map = MapDictionary::new();

            // The map is filled one entry at a time while the CDB already
            // holds every entry; each entry is compared right after it is
            // inserted into the map.
            for (hanja, reading, require_hanja, require_hangul) in entries {
                map.insert_marked(
                    &hanja,
                    &reading,
                    MatchMark {
                        require_hanja,
                        require_hangul,
                    },
                );
                let cdb_matches = cdb.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
                let map_matches = map.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
                prop_assert_eq!(cdb_matches, map_matches);
                let lookup = cdb.lookup(&hanja).unwrap().unwrap();
                prop_assert_eq!(lookup.reading(), reading.as_str());
            }
        }

        // For generated entry sets, `open` and `from_bytes` must yield the
        // same prefix matches.
        #[test]
        fn from_bytes_matches_open_proptest(entries in unique_entries()) {
            let temp = tempdir().unwrap();
            let path = temp.path().join("dict.gukcdb");
            let fixture_entries = entries
                .iter()
                .map(|(hanja, reading, require_hanja, require_hangul)| {
                    TestEntry {
                        hanja,
                        reading,
                        mark: MatchMark {
                            require_hanja: *require_hanja,
                            require_hangul: *require_hangul,
                        },
                    }
                })
                .collect::<Vec<_>>();
            write_fixture(&path, &fixture_entries);
            let bytes = fs::read(&path).unwrap();

            let from_open = CdbDictionary::open(&path).unwrap();
            let from_bytes = CdbDictionary::from_bytes(&bytes).unwrap();

            for (hanja, ..) in &entries {
                let open_matches = from_open.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
                let bytes_matches = from_bytes.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
                prop_assert_eq!(open_matches, bytes_matches);
            }
        }
    }
794
    /// Borrowed description of one dictionary entry used to build fixtures.
    #[derive(Clone, Debug)]
    struct TestEntry<'a> {
        // Dictionary key (the hanja word).
        hanja: &'a str,
        // Hangul reading stored for the key.
        reading: &'a str,
        // Rendering constraints stored alongside the reading.
        mark: MatchMark,
    }
801
802    fn entry<'a>(
803        hanja: &'a str,
804        reading: &'a str,
805        require_hanja: bool,
806        require_hangul: bool,
807    ) -> TestEntry<'a> {
808        TestEntry {
809            hanja,
810            reading,
811            mark: MatchMark {
812                require_hanja,
813                require_hangul,
814            },
815        }
816    }
817
818    fn write_fixture(path: &Path, entries: &[TestEntry<'_>]) {
819        let mut metadata = BTreeMap::new();
820        metadata.insert("source".to_owned(), "fixture".to_owned());
821        metadata.insert("license".to_owned(), "CC0-1.0".to_owned());
822        metadata.insert("build_date".to_owned(), "1970-01-01T00:00:00Z".to_owned());
823        metadata.insert("entry_count".to_owned(), entries.len().to_string());
824        metadata.insert("version".to_owned(), "1".to_owned());
825        metadata.insert(
826            "max_word_chars".to_owned(),
827            entries
828                .iter()
829                .map(|entry| entry.hanja.chars().count())
830                .max()
831                .unwrap_or(0)
832                .to_string(),
833        );
834        metadata.insert(
835            "max_key_bytes".to_owned(),
836            entries
837                .iter()
838                .map(|entry| entry.hanja.len())
839                .max()
840                .unwrap_or(0)
841                .to_string(),
842        );
843
844        let mut records = BTreeMap::<String, Option<(&str, MatchMark)>>::new();
845        for entry in entries {
846            let mut prefix = String::new();
847            for ch in entry.hanja.chars() {
848                prefix.push(ch);
849                records.entry(prefix.clone()).or_insert(None);
850            }
851            records.insert(entry.hanja.to_owned(), Some((entry.reading, entry.mark)));
852        }
853        metadata.insert("prefix_count".to_owned(), records.len().to_string());
854
855        let mut metadata_bytes = Vec::new();
856        into_writer(&metadata, &mut metadata_bytes).unwrap();
857        let mut writer = cdb::CDBWriter::create(path.to_string_lossy().as_ref()).unwrap();
858        writer.add(META_KEY, &metadata_bytes).unwrap();
859        for (key, value) in records {
860            writer.add(key.as_bytes(), &encode_record(value)).unwrap();
861        }
862        writer.finish().unwrap();
863        assert!(fs::metadata(path).unwrap().len() > 0);
864    }
865
866    fn unique_entries() -> impl Strategy<Value = Vec<(String, String, bool, bool)>> {
867        proptest::collection::btree_map(
868            "[一-龥]{1,3}",
869            ("[가-힣]{1,4}", any::<bool>(), any::<bool>()),
870            1..16,
871        )
872        .prop_map(|entries| {
873            entries
874                .into_iter()
875                .map(|(hanja, (reading, require_hanja, require_hangul))| {
876                    (hanja, reading, require_hanja, require_hangul)
877                })
878                .collect()
879        })
880    }
881}