1#![forbid(unsafe_code)]
20#![deny(missing_docs)]
21
22use std::collections::BTreeMap;
23use std::ops::Deref;
24use std::path::Path;
25use std::sync::Arc;
26
27use ciborium::de::from_reader;
28use gukhanmun_core::{DictionaryRecord, HanjaDictionary, Match, MatchMark};
29
/// Reserved CDB key whose value holds the CBOR-encoded dictionary metadata map.
const META_KEY: &[u8] = b"__gukhanmun_meta__";
/// Bit of a record's mark byte that maps to `MatchMark::require_hanja`.
const MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
/// Bit of a record's mark byte that maps to `MatchMark::require_hangul`.
const MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;

/// Size in bytes of the fixed CDB header: 256 slots of two little-endian `u32`s each.
const HEADER_SIZE: usize = 2048;
36
/// Backing storage for the raw bytes of a CDB image.
enum DataSource {
    /// Reference-counted heap copy of the bytes.
    Owned(Arc<[u8]>),
    /// Bytes borrowed for the `'static` lifetime, so no copy is made.
    Static(&'static [u8]),
}
45
46impl Deref for DataSource {
47 type Target = [u8];
48
49 fn deref(&self) -> &[u8] {
50 match self {
51 DataSource::Owned(arc) => arc,
52 DataSource::Static(s) => s,
53 }
54 }
55}
56
/// Read-only hanja dictionary served from a CDB image held in memory.
pub struct CdbDictionary {
    /// Key/value metadata decoded from the `META_KEY` record when the image is loaded.
    metadata: BTreeMap<String, String>,
    /// Raw bytes of the CDB image that every lookup reads from.
    data: DataSource,
    /// Value of the `entry_count` metadata field, or 0 when it is absent or not a number.
    entry_count: u64,
    /// Value of the `max_word_chars` metadata field, when present and parsable.
    max_word_chars: Option<usize>,
}
68
69impl CdbDictionary {
70 pub fn open(path: impl AsRef<Path>) -> Result<Self, Error> {
72 let path = path.as_ref();
73 let bytes = std::fs::read(path).map_err(|source| Error::Open {
74 path: path.display().to_string(),
75 source,
76 })?;
77 Self::from_source(DataSource::Owned(Arc::from(bytes.as_slice())))
78 }
79
80 pub fn from_bytes(bytes: &[u8]) -> Result<Self, Error> {
82 Self::from_source(DataSource::Owned(Arc::from(bytes)))
83 }
84
85 pub fn from_static_bytes(bytes: &'static [u8]) -> Result<Self, Error> {
92 Self::from_source(DataSource::Static(bytes))
93 }
94
95 fn from_source(data: DataSource) -> Result<Self, Error> {
96 let metadata_bytes = cdb_get(&data, META_KEY)?.ok_or(Error::MissingRecord {
97 record: "dictionary metadata",
98 })?;
99 let metadata = from_reader::<BTreeMap<String, String>, _>(metadata_bytes.as_slice())
100 .map_err(|source| Error::MetadataDecode { source })?;
101 if let Some(version) = metadata.get("version")
102 && version != "1"
103 {
104 tracing::error!(version = %version, expected = "1", "unsupported CDB format version");
105 return Err(Error::UnsupportedVersion {
106 version: version.clone(),
107 });
108 }
109 let entry_count = parse_u64_metadata(&metadata, "entry_count").unwrap_or(0);
110 let max_word_chars = parse_usize_metadata(&metadata, "max_word_chars");
111
112 tracing::info!(
113 format_version = metadata.get("version").map(String::as_str).unwrap_or("1"),
114 entry_count,
115 "loaded CDB dictionary"
116 );
117 Ok(Self {
118 metadata,
119 data,
120 entry_count,
121 max_word_chars,
122 })
123 }
124
125 pub fn metadata(&self) -> &BTreeMap<String, String> {
127 &self.metadata
128 }
129
130 pub fn entry_count(&self) -> u64 {
133 self.entry_count
134 }
135
136 pub fn lookup(&self, hanja: &str) -> Result<Option<LookupEntry>, Error> {
138 let Some(value) = cdb_get(&self.data, hanja.as_bytes())? else {
139 return Ok(None);
140 };
141 let Some(record) = decode_record(&value)? else {
142 return Ok(None);
143 };
144 Ok(Some(record))
145 }
146}
147
148impl HanjaDictionary for CdbDictionary {
149 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
150 let max_word_chars = self.max_word_chars.unwrap_or(usize::MAX);
151 let mut matches = Vec::new();
152 let mut prefix = String::new();
153
154 for (index, ch) in s.chars().enumerate() {
155 if index >= max_word_chars {
156 break;
157 }
158 prefix.push(ch);
159 let value = match cdb_get(&self.data, prefix.as_bytes()) {
160 Ok(Some(value)) => value,
161 Ok(None) => break,
162 Err(error) => {
163 tracing::warn!(
164 prefix_len = prefix.len(),
165 error = ?error,
166 "aborting CDB prefix traversal due to read error"
167 );
168 break;
169 }
170 };
171 match decode_record(&value) {
172 Ok(Some(entry)) => {
173 matches.push(Match {
174 byte_len: prefix.len(),
175 reading: entry.reading,
176 suffix_reading: None,
177 mark: entry.mark,
178 });
179 }
180 Ok(None) => {}
181 Err(error) => {
182 tracing::warn!(
183 prefix_len = prefix.len(),
184 error = ?error,
185 "aborting CDB prefix traversal due to decode error"
186 );
187 break;
188 }
189 }
190 }
191
192 Box::new(matches.into_iter())
193 }
194
195 fn max_word_chars(&self) -> Option<usize> {
196 self.max_word_chars
197 }
198
199 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
200 let mut records = Vec::new();
201 for result in cdb_iter(&self.data) {
202 let (key, value) = match result {
203 Ok(pair) => pair,
204 Err(error) => {
205 tracing::warn!(error = ?error, "skipping CDB entry due to iterator error");
206 continue;
207 }
208 };
209 if key == META_KEY {
210 continue;
211 }
212 let entry = match decode_record(&value) {
213 Ok(Some(entry)) => entry,
214 Ok(None) => continue,
215 Err(error) => {
216 tracing::warn!(error = ?error, "skipping malformed CDB entry");
217 continue;
218 }
219 };
220 let Ok(hanja) = String::from_utf8(key) else {
221 tracing::warn!("skipping CDB entry with non-UTF-8 key");
222 continue;
223 };
224 records.push(DictionaryRecord {
225 hanja,
226 reading: entry.reading,
227 mark: entry.mark,
228 });
229 }
230 Some(Box::new(records.into_iter()))
231 }
232
233 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
234 for result in cdb_iter(&self.data) {
235 let (key, value) = match result {
236 Ok(pair) => pair,
237 Err(_) => continue,
238 };
239 if key == META_KEY || key == hanja.as_bytes() {
240 continue;
241 }
242 if decode_record(&value).is_ok_and(|entry| entry.is_some_and(|e| e.reading == reading))
243 {
244 return true;
245 }
246 }
247 false
248 }
249}
250
/// Decoded dictionary record returned by an exact-key lookup.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LookupEntry {
    /// Reading stored in the record, decoded as UTF-8.
    reading: String,
    /// Flags decoded from the record's mark byte.
    mark: MatchMark,
}
257
258impl LookupEntry {
259 pub fn reading(&self) -> &str {
261 &self.reading
262 }
263
264 pub fn mark(&self) -> MatchMark {
266 self.mark
267 }
268}
269
/// Errors produced while opening a CDB dictionary or reading records from it.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum Error {
    /// The dictionary file could not be read.
    #[error("failed to open {path}: {source}")]
    Open {
        /// Display form of the path that was being opened.
        path: String,
        /// Underlying I/O error.
        #[source]
        source: std::io::Error,
    },

    /// A record the dictionary requires is not present in the image.
    #[error("missing {record}")]
    MissingRecord {
        /// Human-readable name of the missing record.
        record: &'static str,
    },

    /// The metadata record is present but could not be decoded.
    #[error("failed to decode dictionary metadata: {source}")]
    MetadataDecode {
        /// Decoder error describing why the metadata was rejected.
        #[source]
        source: ciborium::de::Error<std::io::Error>,
    },

    /// The metadata declares a format version this reader does not support.
    #[error("unsupported dictionary version {version}")]
    UnsupportedVersion {
        /// Version string found in the metadata.
        version: String,
    },

    /// A record's value does not have the expected structure.
    #[error("malformed CDB record: {reason}")]
    MalformedRecord {
        /// Short description of what is wrong with the record.
        reason: &'static str,
    },

    /// Computing a byte range inside a record overflowed.
    #[error("{field} overflow")]
    ValueOverflow {
        /// Name of the range whose computation overflowed.
        field: &'static str,
    },

    /// A field's declared extent reaches past the end of its record.
    #[error("{field} is outside the CDB record")]
    ValueOutOfBounds {
        /// Name of the field that does not fit in the record.
        field: &'static str,
    },

    /// A text field of a record is not valid UTF-8.
    #[error("{field} contains invalid UTF-8: {source}")]
    InvalidUtf8 {
        /// Name of the field that failed to decode.
        field: &'static str,
        /// UTF-8 validation error for the field's bytes.
        #[source]
        source: std::str::Utf8Error,
    },

    /// The image is smaller than the fixed CDB header.
    #[error("CDB data is too short: {len} bytes")]
    TooShort {
        /// Actual length of the image in bytes.
        len: usize,
    },

    /// An offset taken from the image points outside the readable data.
    #[error("CDB offset {offset} is out of bounds (data len {len})")]
    OutOfBounds {
        /// Offending byte offset.
        offset: usize,
        /// Length of the region the offset was checked against.
        len: usize,
    },
}
353
/// Computes the 32-bit hash that places `key` in the CDB hash tables.
///
/// The state starts at 5381; for every key byte it is multiplied by 33
/// (wrapping at 32 bits) and then XORed with the byte.
fn cdb_hash(key: &[u8]) -> u32 {
    let mut state: u32 = 5381;
    for &byte in key {
        state = state.wrapping_mul(33) ^ u32::from(byte);
    }
    state
}
362
/// Reads a little-endian `u32` from `data` starting at byte `offset`.
///
/// Returns `None` when the four bytes do not all lie inside `data`. This
/// includes offsets so large that `offset + 4` would overflow `usize`, which
/// untrusted file offsets can reach on 32-bit targets.
fn read_u32(data: &[u8], offset: usize) -> Option<u32> {
    let end = offset.checked_add(4)?;
    match *data.get(offset..end)? {
        [b0, b1, b2, b3] => Some(u32::from_le_bytes([b0, b1, b2, b3])),
        // Unreachable in practice: a successful `get` of a 4-wide range has length 4.
        _ => None,
    }
}
370
371fn cdb_get(data: &[u8], key: &[u8]) -> Result<Option<Vec<u8>>, Error> {
374 if data.len() < HEADER_SIZE {
375 return Err(Error::TooShort { len: data.len() });
376 }
377
378 let h = cdb_hash(key);
379 let header_slot = (h & 0xff) as usize;
380 let header_base = header_slot * 8;
381
382 let table_pos = read_u32(data, header_base).ok_or(Error::OutOfBounds {
383 offset: header_base,
384 len: data.len(),
385 })? as usize;
386 let table_count = read_u32(data, header_base + 4).ok_or(Error::OutOfBounds {
387 offset: header_base + 4,
388 len: data.len(),
389 })? as usize;
390
391 if table_count == 0 {
392 return Ok(None);
393 }
394
395 let start_slot = ((h >> 8) as usize) % table_count;
396
397 for i in 0..table_count {
398 let slot = (start_slot + i) % table_count;
399 let slot_offset = table_pos + slot * 8;
400
401 let slot_hash = match read_u32(data, slot_offset) {
402 Some(v) => v,
403 None => return Ok(None),
404 };
405 let data_pos = match read_u32(data, slot_offset + 4) {
406 Some(v) => v as usize,
407 None => return Ok(None),
408 };
409
410 if data_pos == 0 {
411 return Ok(None);
412 }
413
414 if slot_hash == h {
415 let key_len = match read_u32(data, data_pos) {
416 Some(v) => v as usize,
417 None => continue,
418 };
419 let val_len = match read_u32(data, data_pos + 4) {
420 Some(v) => v as usize,
421 None => continue,
422 };
423 let key_start = data_pos + 8;
424 let val_start = key_start.saturating_add(key_len);
425 let val_end = val_start.saturating_add(val_len);
426
427 if val_end > data.len() {
428 continue;
429 }
430 if data[key_start..key_start + key_len] == *key {
431 return Ok(Some(data[val_start..val_end].to_vec()));
432 }
433 }
434 }
435
436 Ok(None)
437}
438
439fn cdb_iter(data: &[u8]) -> impl Iterator<Item = Result<(Vec<u8>, Vec<u8>), Error>> + '_ {
442 let data_end = (0..256usize)
444 .filter_map(|i| {
445 let pos = read_u32(data, i * 8)? as usize;
446 if pos >= HEADER_SIZE { Some(pos) } else { None }
447 })
448 .min()
449 .unwrap_or(data.len());
450
451 CdbIter {
452 data,
453 pos: HEADER_SIZE,
454 data_end,
455 }
456}
457
/// Sequential cursor over the records of a CDB image.
struct CdbIter<'a> {
    /// Complete CDB image being scanned.
    data: &'a [u8],
    /// Byte offset of the next record to decode.
    pos: usize,
    /// Byte offset at which the record area ends; iteration stops here.
    data_end: usize,
}
463
464impl<'a> Iterator for CdbIter<'a> {
465 type Item = Result<(Vec<u8>, Vec<u8>), Error>;
466
467 fn next(&mut self) -> Option<Self::Item> {
468 if self.pos >= self.data_end {
469 return None;
470 }
471
472 let key_len = read_u32(self.data, self.pos)? as usize;
473 let val_len = read_u32(self.data, self.pos + 4)? as usize;
474 let key_start = self.pos + 8;
475 let val_start = key_start + key_len;
476 let next_pos = val_start + val_len;
477
478 if next_pos > self.data_end {
479 self.pos = self.data_end;
480 return Some(Err(Error::OutOfBounds {
481 offset: next_pos,
482 len: self.data_end,
483 }));
484 }
485
486 let key = self.data[key_start..key_start + key_len].to_vec();
487 let val = self.data[val_start..val_start + val_len].to_vec();
488 self.pos = next_pos;
489 Some(Ok((key, val)))
490 }
491}
492
493fn decode_record(value: &[u8]) -> Result<Option<LookupEntry>, Error> {
496 if value.len() < 4 {
497 return Err(Error::MalformedRecord {
498 reason: "record is shorter than the fixed prefix",
499 });
500 }
501 if value[0] == 0 {
502 return Ok(None);
503 }
504
505 let mark = decode_mark(value[1]);
506 let reading_len = u16::from_le_bytes([value[2], value[3]]) as usize;
507 let reading_end = 4usize
508 .checked_add(reading_len)
509 .ok_or(Error::ValueOverflow {
510 field: "reading range",
511 })?;
512 let reading_bytes = value
513 .get(4..reading_end)
514 .ok_or(Error::ValueOutOfBounds { field: "reading" })?;
515 let reading = std::str::from_utf8(reading_bytes)
516 .map_err(|source| Error::InvalidUtf8 {
517 field: "reading",
518 source,
519 })?
520 .to_owned();
521
522 Ok(Some(LookupEntry { reading, mark }))
523}
524
525fn decode_mark(encoded: u8) -> MatchMark {
526 MatchMark {
527 require_hanja: encoded & MARK_REQUIRE_HANJA != 0,
528 require_hangul: encoded & MARK_REQUIRE_HANGUL != 0,
529 }
530}
531
/// Returns the metadata value stored under `key` parsed as a `u64`, or `None`
/// when the key is absent or its value is not a valid `u64`.
fn parse_u64_metadata(metadata: &BTreeMap<String, String>, key: &str) -> Option<u64> {
    let raw = metadata.get(key)?;
    raw.parse::<u64>().ok()
}
535
/// Returns the metadata value stored under `key` parsed as a `usize`, or
/// `None` when the key is absent or its value is not a valid `usize`.
fn parse_usize_metadata(metadata: &BTreeMap<String, String>, key: &str) -> Option<usize> {
    let raw = metadata.get(key)?;
    raw.parse::<usize>().ok()
}
539
/// Test helper: serialises a record value in the layout `decode_record` reads.
///
/// `Some((reading, mark))` produces a real entry (presence flag `1`, mark
/// byte, little-endian `u16` reading length, reading bytes); `None` produces
/// a 4-byte placeholder with presence flag `0`.
///
/// # Panics
///
/// Panics when `reading` is longer than `u16::MAX` bytes, because the length
/// field could not represent it and the fixture would be silently corrupt.
#[cfg(test)]
fn encode_record(entry: Option<(&str, MatchMark)>) -> Vec<u8> {
    let Some((reading, mark)) = entry else {
        // Flag 0, mark 0, length 0.
        return vec![0, 0, 0, 0];
    };

    let reading_len = u16::try_from(reading.len())
        .expect("reading must fit in the record's 16-bit length field");

    let mut output = Vec::with_capacity(4 + reading.len());
    output.push(1);
    output.push(encode_mark(mark));
    output.extend_from_slice(&reading_len.to_le_bytes());
    output.extend_from_slice(reading.as_bytes());
    output
}
560
/// Test helper: packs a `MatchMark` into the mark byte that `decode_mark` reads.
#[cfg(test)]
fn encode_mark(mark: MatchMark) -> u8 {
    let hanja_bit = if mark.require_hanja {
        MARK_REQUIRE_HANJA
    } else {
        0
    };
    let hangul_bit = if mark.require_hangul {
        MARK_REQUIRE_HANGUL
    } else {
        0
    };
    hanja_bit | hangul_bit
}
572
// Unit and property tests. Fixtures are written with the `cdb` crate's writer
// and read back through `CdbDictionary`.
#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;
    use std::fs;
    use std::path::Path;

    use ciborium::ser::into_writer;
    use gukhanmun_core::{HanjaDictionary, MapDictionary, MatchMark};
    use proptest::prelude::*;
    use tempfile::tempdir;
    use tracing_test::traced_test;

    use super::{CdbDictionary, META_KEY, encode_record};

    // A metadata record declaring version "99" must be rejected and logged.
    #[traced_test]
    #[test]
    fn unsupported_version_emits_error_event() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukcdb");
        let metadata = BTreeMap::from([("version".to_owned(), "99".to_owned())]);
        let mut metadata_bytes = Vec::new();
        into_writer(&metadata, &mut metadata_bytes).unwrap();
        let mut writer = cdb::CDBWriter::create(path.to_string_lossy().as_ref()).unwrap();
        writer.add(META_KEY, &metadata_bytes).unwrap();
        writer.finish().unwrap();

        let result = CdbDictionary::open(&path);

        assert!(matches!(
            result,
            Err(super::Error::UnsupportedVersion { .. })
        ));
        assert!(logs_contain("unsupported CDB format version"));
    }

    // End-to-end check of metadata access, exact lookup and prefix matching.
    #[test]
    fn loads_metadata_lookup_and_prefix_matches() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukcdb");
        write_fixture(
            &path,
            &[
                entry("行事", "행사", false, false),
                entry("行事場", "행사장", true, false),
                entry("場所", "장소", false, true),
            ],
        );

        let dictionary = CdbDictionary::open(&path).unwrap();

        assert_eq!(dictionary.metadata().get("source").unwrap(), "fixture");
        assert_eq!(dictionary.entry_count(), 3);
        assert_eq!(dictionary.max_word_chars(), Some(3));
        let exact = dictionary.lookup("行事場").unwrap().unwrap();
        assert_eq!(exact.reading(), "행사장");
        assert!(exact.mark().require_hanja);
        assert!(!exact.mark().require_hangul);
        // Both "行事" and "行事場" are prefixes of the input, so two matches are expected.
        let matches = dictionary.matches_at("行事場入口").collect::<Vec<_>>();
        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0].reading, "행사");
        assert_eq!(matches[1].reading, "행사장");
    }

    // Loading from a byte slice must behave the same as loading from the file.
    #[test]
    fn from_bytes_matches_open() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukcdb");
        write_fixture(
            &path,
            &[
                entry("行事", "행사", false, false),
                entry("場所", "장소", false, false),
            ],
        );

        let bytes = fs::read(&path).unwrap();
        let from_bytes = CdbDictionary::from_bytes(&bytes).unwrap();
        let from_open = CdbDictionary::open(&path).unwrap();

        assert_eq!(from_bytes.metadata(), from_open.metadata());
        assert_eq!(from_bytes.entry_count(), from_open.entry_count());
        assert_eq!(from_bytes.max_word_chars(), from_open.max_word_chars());

        let bytes_matches = from_bytes.matches_at("行事入口").collect::<Vec<_>>();
        let open_matches = from_open.matches_at("行事入口").collect::<Vec<_>>();
        assert_eq!(bytes_matches, open_matches);
    }

    // A homophone is a *different* key with the same reading; an entry is not
    // its own homophone.
    #[test]
    fn has_homophone_detects_other_forms_with_same_reading() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukcdb");
        write_fixture(
            &path,
            &[
                entry("漢字", "한자", false, false),
                entry("翰字", "한자", false, false),
                entry("天地", "천지", false, false),
            ],
        );
        let dictionary = CdbDictionary::open(&path).unwrap();

        assert!(dictionary.has_homophone("漢字", "한자"));
        assert!(!dictionary.has_homophone("天地", "천지"));
    }

    // A metadata record that is not decodable must surface as `MetadataDecode`
    // and keep the decoder error reachable through `source()`.
    #[test]
    fn open_errors_preserve_structured_variants_and_sources() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukcdb");
        let mut writer = cdb::CDBWriter::create(path.to_string_lossy().as_ref()).unwrap();
        writer.add(META_KEY, &[0xff]).unwrap();
        writer.finish().unwrap();

        let error = match CdbDictionary::open(&path) {
            Ok(_) => panic!("corrupt metadata should fail to open"),
            Err(error) => error,
        };

        assert!(matches!(error, super::Error::MetadataDecode { .. }));
        assert!(std::error::Error::source(&error).is_some());
    }

    // The hand-written record declares a 1-byte reading whose only byte is 0xff,
    // which is not valid UTF-8.
    #[test]
    fn lookup_errors_distinguish_malformed_records() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukcdb");
        let metadata = BTreeMap::from([
            ("entry_count".to_owned(), "1".to_owned()),
            ("version".to_owned(), "1".to_owned()),
            ("max_word_chars".to_owned(), "2".to_owned()),
        ]);
        let mut metadata_bytes = Vec::new();
        into_writer(&metadata, &mut metadata_bytes).unwrap();
        let mut writer = cdb::CDBWriter::create(path.to_string_lossy().as_ref()).unwrap();
        writer.add(META_KEY, &metadata_bytes).unwrap();
        writer.add("天地".as_bytes(), &[1, 0, 1, 0, 0xff]).unwrap();
        writer.finish().unwrap();
        let dictionary = CdbDictionary::open(&path).unwrap();

        let error = dictionary.lookup("天地").unwrap_err();

        assert!(matches!(
            error,
            super::Error::InvalidUtf8 {
                field: "reading",
                ..
            }
        ));
        assert!(std::error::Error::source(&error).is_some());
    }

    proptest! {
        // The CDB-backed dictionary must agree with the in-memory `MapDictionary`.
        #[test]
        fn generated_cdb_matches_map_dictionary(entries in unique_entries()) {
            let temp = tempdir().unwrap();
            let path = temp.path().join("dict.gukcdb");
            let fixture_entries = entries
                .iter()
                .map(|(hanja, reading, require_hanja, require_hangul)| {
                    TestEntry {
                        hanja,
                        reading,
                        mark: MatchMark {
                            require_hanja: *require_hanja,
                            require_hangul: *require_hangul,
                        },
                    }
                })
                .collect::<Vec<_>>();
            write_fixture(&path, &fixture_entries);
            let cdb = CdbDictionary::open(&path).unwrap();
            let mut map = MapDictionary::new();

            // The map is filled incrementally while the CDB already holds every
            // entry. `unique_entries` builds its list from a `BTreeMap`, so keys
            // arrive in sorted order and any entry that is a prefix of `hanja`
            // has been inserted before `hanja` itself is compared.
            for (hanja, reading, require_hanja, require_hangul) in entries {
                map.insert_marked(
                    &hanja,
                    &reading,
                    MatchMark {
                        require_hanja,
                        require_hangul,
                    },
                );
                let cdb_matches = cdb.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
                let map_matches = map.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
                prop_assert_eq!(cdb_matches, map_matches);
                let lookup = cdb.lookup(&hanja).unwrap().unwrap();
                prop_assert_eq!(lookup.reading(), reading.as_str());
            }
        }

        // `open` and `from_bytes` must yield identical matches for generated data.
        #[test]
        fn from_bytes_matches_open_proptest(entries in unique_entries()) {
            let temp = tempdir().unwrap();
            let path = temp.path().join("dict.gukcdb");
            let fixture_entries = entries
                .iter()
                .map(|(hanja, reading, require_hanja, require_hangul)| {
                    TestEntry {
                        hanja,
                        reading,
                        mark: MatchMark {
                            require_hanja: *require_hanja,
                            require_hangul: *require_hangul,
                        },
                    }
                })
                .collect::<Vec<_>>();
            write_fixture(&path, &fixture_entries);
            let bytes = fs::read(&path).unwrap();

            let from_open = CdbDictionary::open(&path).unwrap();
            let from_bytes = CdbDictionary::from_bytes(&bytes).unwrap();

            for (hanja, ..) in &entries {
                let open_matches = from_open.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
                let bytes_matches = from_bytes.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
                prop_assert_eq!(open_matches, bytes_matches);
            }
        }
    }

    // One dictionary entry to be written into a fixture file.
    #[derive(Clone, Debug)]
    struct TestEntry<'a> {
        hanja: &'a str,
        reading: &'a str,
        mark: MatchMark,
    }

    // Shorthand constructor for `TestEntry`.
    fn entry<'a>(
        hanja: &'a str,
        reading: &'a str,
        require_hanja: bool,
        require_hangul: bool,
    ) -> TestEntry<'a> {
        TestEntry {
            hanja,
            reading,
            mark: MatchMark {
                require_hanja,
                require_hangul,
            },
        }
    }

    // Writes a complete dictionary file at `path`: a metadata record plus one
    // record per entry and per proper prefix of an entry's key.
    fn write_fixture(path: &Path, entries: &[TestEntry<'_>]) {
        let mut metadata = BTreeMap::new();
        metadata.insert("source".to_owned(), "fixture".to_owned());
        metadata.insert("license".to_owned(), "CC0-1.0".to_owned());
        metadata.insert("build_date".to_owned(), "1970-01-01T00:00:00Z".to_owned());
        metadata.insert("entry_count".to_owned(), entries.len().to_string());
        metadata.insert("version".to_owned(), "1".to_owned());
        metadata.insert(
            "max_word_chars".to_owned(),
            entries
                .iter()
                .map(|entry| entry.hanja.chars().count())
                .max()
                .unwrap_or(0)
                .to_string(),
        );
        metadata.insert(
            "max_key_bytes".to_owned(),
            entries
                .iter()
                .map(|entry| entry.hanja.len())
                .max()
                .unwrap_or(0)
                .to_string(),
        );

        // Every prefix of a key gets a placeholder (`None`) record so that the
        // prefix walk in `matches_at` can continue through it; a real entry
        // then overwrites the placeholder for its own key.
        let mut records = BTreeMap::<String, Option<(&str, MatchMark)>>::new();
        for entry in entries {
            let mut prefix = String::new();
            for ch in entry.hanja.chars() {
                prefix.push(ch);
                records.entry(prefix.clone()).or_insert(None);
            }
            records.insert(entry.hanja.to_owned(), Some((entry.reading, entry.mark)));
        }
        metadata.insert("prefix_count".to_owned(), records.len().to_string());

        let mut metadata_bytes = Vec::new();
        into_writer(&metadata, &mut metadata_bytes).unwrap();
        let mut writer = cdb::CDBWriter::create(path.to_string_lossy().as_ref()).unwrap();
        writer.add(META_KEY, &metadata_bytes).unwrap();
        for (key, value) in records {
            writer.add(key.as_bytes(), &encode_record(value)).unwrap();
        }
        writer.finish().unwrap();
        assert!(fs::metadata(path).unwrap().len() > 0);
    }

    // Strategy producing 1 to 15 entries with distinct keys of 1-3 hanja and
    // readings of 1-4 hangul syllables, in sorted key order.
    fn unique_entries() -> impl Strategy<Value = Vec<(String, String, bool, bool)>> {
        proptest::collection::btree_map(
            "[一-龥]{1,3}",
            ("[가-힣]{1,4}", any::<bool>(), any::<bool>()),
            1..16,
        )
        .prop_map(|entries| {
            entries
                .into_iter()
                .map(|(hanja, (reading, require_hanja, require_hangul))| {
                    (hanja, reading, require_hanja, require_hangul)
                })
                .collect()
        })
    }
}