//! Read-only Hanja dictionary backed by an FST map.
//!
//! A dictionary image begins with a fixed 64-byte header: the magic bytes
//! `GUKHMFST`, a format version, the header length, and an offset/length pair
//! for each of three sections (metadata, FST map, reading table).
//! [`FstDictionary`] decodes such an image and implements the
//! `HanjaDictionary` trait from `gukhanmun_core` on top of it.

#![deny(missing_docs)]
#![deny(unsafe_code)]
#![deny(unsafe_op_in_unsafe_fn)]
22
23use std::collections::BTreeMap;
24use std::fs;
25use std::io::{Cursor, Read};
26use std::ops::Range;
27use std::path::Path;
28use std::sync::Arc;
29
30use ciborium::de::from_reader;
31use fst::automaton::Automaton;
32use fst::{IntoStreamer, Map, Streamer};
33use gukhanmun_core::{DictionaryRecord, HanjaDictionary, Match, MatchMark};
34
/// Magic bytes every dictionary image must start with.
const MAGIC: &[u8; 8] = b"GUKHMFST";
/// The only container format version this reader accepts.
const FORMAT_VERSION: u32 = 1;
/// Size in bytes of the fixed header; the header's own length field must equal it.
const FIXED_HEADER_LEN: usize = 64;

/// Mark bit that maps to `MatchMark::require_hanja`.
const MARK_REQUIRE_HANJA: u8 = 0x01;
/// Mark bit that maps to `MatchMark::require_hangul`.
const MARK_REQUIRE_HANGUL: u8 = 0x02;

// Layout of a packed FST value, from least to most significant bits:
// reading length, one mark byte, reading offset.

/// Bit position of the mark byte inside a packed FST value.
const VALUE_MARK_SHIFT: u64 = 16;
/// Bit position of the reading offset, directly above the one-byte mark.
const VALUE_OFFSET_SHIFT: u64 = VALUE_MARK_SHIFT + 8;
/// Mask selecting the reading length, i.e. every bit below the mark byte.
const VALUE_READING_LEN_MASK: u64 = (1 << VALUE_MARK_SHIFT) - 1;
43
44#[derive(Clone, Debug)]
51pub struct FstDictionary {
52 metadata: BTreeMap<String, String>,
53 map: Map<ByteSection>,
54 readings: ByteSection,
55 entry_count: u64,
56 max_word_chars: Option<usize>,
57}
58
59impl FstDictionary {
60 pub fn open(path: impl AsRef<Path>) -> Result<Self, Error> {
62 let path = path.as_ref();
63 tracing::info!(path = %path.display(), "opening FST dictionary");
64 let bytes = fs::read(path).map_err(|source| Error::Io {
65 path: path.display().to_string(),
66 source,
67 })?;
68 Self::from_bytes(&bytes)
69 }
70
71 pub fn from_bytes(bytes: &[u8]) -> Result<Self, Error> {
73 Self::from_source(ByteSource::Owned(Arc::<[u8]>::from(bytes)))
74 }
75
76 pub fn from_static_bytes(bytes: &'static [u8]) -> Result<Self, Error> {
82 Self::from_source(ByteSource::Static(bytes))
83 }
84
85 fn from_source(source: ByteSource) -> Result<Self, Error> {
86 let bytes = source.as_ref();
87 let header = FixedHeader::parse(bytes)?;
88 let metadata_bytes = checked_slice(bytes, header.metadata_offset, header.metadata_len)
89 .ok_or(Error::SectionOutOfBounds {
90 section: "metadata",
91 })?;
92 let metadata = from_reader::<BTreeMap<String, String>, _>(metadata_bytes)
93 .map_err(|source| Error::MetadataDecode { source })?;
94 let fst_bytes = source
95 .section(header.fst_offset, header.fst_len)
96 .ok_or(Error::SectionOutOfBounds { section: "FST" })?;
97 let readings = source
98 .section(header.readings_offset, header.readings_len)
99 .ok_or(Error::SectionOutOfBounds {
100 section: "readings",
101 })?;
102 let map = Map::new(fst_bytes).map_err(|source| Error::FstDecode { source })?;
103 let entry_count = parse_u64_metadata(&metadata, "entry_count")
104 .unwrap_or_else(|| u64::try_from(map.len()).unwrap_or(u64::MAX));
105 let max_word_chars = parse_usize_metadata(&metadata, "max_word_chars")
106 .or_else(|| max_key_chars_from_map(&map));
107
108 tracing::debug!(
109 byte_length = bytes.len(),
110 format_version = FORMAT_VERSION,
111 entry_count,
112 ?max_word_chars,
113 "decoded FST dictionary"
114 );
115 Ok(Self {
116 metadata,
117 map,
118 readings,
119 entry_count,
120 max_word_chars,
121 })
122 }
123
124 pub fn metadata(&self) -> &BTreeMap<String, String> {
126 &self.metadata
127 }
128
129 pub fn entry_count(&self) -> u64 {
131 self.entry_count
132 }
133
134 pub fn lookup(&self, hanja: &str) -> Result<Option<LookupEntry>, Error> {
136 let Some(encoded) = self.map.get(hanja.as_bytes()) else {
137 return Ok(None);
138 };
139 self.decode_entry(encoded).map(Some)
140 }
141
142 fn decode_entry(&self, encoded: u64) -> Result<LookupEntry, Error> {
143 let (reading_len, mark, reading_offset) = decode_value(encoded);
144 let reading_start =
145 usize::try_from(reading_offset).map_err(|_| Error::ValueOutOfRange {
146 field: "reading offset",
147 })?;
148 let reading_end =
149 reading_start
150 .checked_add(usize::from(reading_len))
151 .ok_or(Error::ValueOverflow {
152 field: "reading range",
153 })?;
154 let reading_bytes = self
155 .readings
156 .as_ref()
157 .get(reading_start..reading_end)
158 .ok_or(Error::SectionOutOfBounds {
159 section: "reading table entry",
160 })?;
161 let reading = std::str::from_utf8(reading_bytes)
162 .map_err(|source| Error::InvalidUtf8 {
163 field: "reading",
164 source,
165 })?
166 .to_owned();
167
168 Ok(LookupEntry { reading, mark })
169 }
170}
171
/// Backing storage for a decoded dictionary image.
#[derive(Clone, Debug)]
enum ByteSource {
    /// Reference-counted buffer owned by the dictionary.
    Owned(Arc<[u8]>),
    /// Bytes borrowed for the `'static` lifetime.
    Static(&'static [u8]),
}

impl ByteSource {
    /// Returns a view of `len` bytes starting at `offset`, or `None` when the
    /// range does not fit in `usize` or extends past the end of the buffer.
    fn section(&self, offset: u64, len: u64) -> Option<ByteSection> {
        let start = usize::try_from(offset).ok()?;
        let span = usize::try_from(len).ok()?;
        let end = start.checked_add(span)?;
        if end > self.as_ref().len() {
            return None;
        }
        Some(ByteSection {
            source: self.clone(),
            range: start..end,
        })
    }
}

impl AsRef<[u8]> for ByteSource {
    fn as_ref(&self) -> &[u8] {
        match self {
            Self::Owned(shared) => &shared[..],
            Self::Static(fixed) => *fixed,
        }
    }
}

/// A sub-range of a [`ByteSource`], holding its own handle to the storage.
#[derive(Clone, Debug)]
struct ByteSection {
    /// Storage the section points into.
    source: ByteSource,
    /// Byte range of the section; validated against `source` by `ByteSource::section`.
    range: Range<usize>,
}

impl AsRef<[u8]> for ByteSection {
    fn as_ref(&self) -> &[u8] {
        let whole: &[u8] = self.source.as_ref();
        &whole[self.range.start..self.range.end]
    }
}
210
211impl HanjaDictionary for FstDictionary {
212 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
213 let mut stream = self
214 .map
215 .search(KeyIsPrefixOf::new(s.as_bytes()))
216 .into_stream();
217 let mut matches = Vec::new();
218 while let Some((key, encoded)) = stream.next() {
219 match self.decode_entry(encoded) {
220 Ok(entry) => {
221 matches.push(Match {
222 byte_len: key.len(),
223 reading: entry.reading,
224 mark: entry.mark,
225 });
226 }
227 Err(error) => {
228 if let Ok(key_str) = std::str::from_utf8(key) {
229 tracing::warn!(key = key_str, error = ?error, "skipping FST entry with undecodable value");
230 } else {
231 tracing::warn!(key_len = key.len(), error = ?error, "skipping FST entry with non-UTF-8 key and undecodable value");
232 }
233 }
234 }
235 }
236 matches.sort_by_key(|matched| matched.byte_len);
237 Box::new(matches.into_iter())
238 }
239
240 fn max_word_chars(&self) -> Option<usize> {
241 self.max_word_chars
242 }
243
244 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
245 let mut stream = self.map.stream();
246 let mut records = Vec::new();
247 while let Some((key, encoded)) = stream.next() {
248 let Ok(hanja) = std::str::from_utf8(key) else {
249 continue;
250 };
251 if let Ok(entry) = self.decode_entry(encoded) {
252 records.push(DictionaryRecord {
253 hanja: hanja.to_owned(),
254 reading: entry.reading,
255 mark: entry.mark,
256 });
257 }
258 }
259 Some(Box::new(records.into_iter()))
260 }
261
262 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
263 let mut stream = self.map.stream();
264 while let Some((key, encoded)) = stream.next() {
265 if key == hanja.as_bytes() {
266 continue;
267 }
268 if self
269 .decode_entry(encoded)
270 .is_ok_and(|entry| entry.reading == reading)
271 {
272 return true;
273 }
274 }
275 false
276 }
277}
278
279#[derive(Clone, Debug, Eq, PartialEq)]
281pub struct LookupEntry {
282 reading: String,
283 mark: MatchMark,
284}
285
286impl LookupEntry {
287 pub fn reading(&self) -> &str {
289 &self.reading
290 }
291
292 pub fn mark(&self) -> MatchMark {
294 self.mark
295 }
296}
297
298#[derive(Debug, thiserror::Error)]
300#[non_exhaustive]
301pub enum Error {
302 #[error("failed to read {path}: {source}")]
304 Io {
305 path: String,
307 #[source]
309 source: std::io::Error,
310 },
311
312 #[error("dictionary file is shorter than the fixed header: {actual} bytes")]
314 ShortHeader {
315 actual: usize,
317 },
318
319 #[error("invalid dictionary magic")]
321 InvalidMagic,
322
323 #[error("unsupported dictionary version {version}")]
325 UnsupportedVersion {
326 version: u32,
328 },
329
330 #[error("unsupported dictionary header length {header_len}")]
332 UnsupportedHeaderLength {
333 header_len: u32,
335 },
336
337 #[error("failed to read dictionary header: {source}")]
339 HeaderRead {
340 #[source]
342 source: std::io::Error,
343 },
344
345 #[error("{section} range is outside the file")]
347 SectionOutOfBounds {
348 section: &'static str,
350 },
351
352 #[error("failed to decode dictionary metadata: {source}")]
354 MetadataDecode {
355 #[source]
357 source: ciborium::de::Error<std::io::Error>,
358 },
359
360 #[error("failed to decode FST map: {source}")]
362 FstDecode {
363 #[source]
365 source: fst::Error,
366 },
367
368 #[error("{field} is too large")]
370 ValueOutOfRange {
371 field: &'static str,
373 },
374
375 #[error("{field} overflow")]
377 ValueOverflow {
378 field: &'static str,
380 },
381
382 #[error("{field} contains invalid UTF-8: {source}")]
384 InvalidUtf8 {
385 field: &'static str,
387 #[source]
389 source: std::str::Utf8Error,
390 },
391}
392
393#[derive(Clone, Copy, Debug, Eq, PartialEq)]
394struct FixedHeader {
395 metadata_offset: u64,
396 metadata_len: u64,
397 fst_offset: u64,
398 fst_len: u64,
399 readings_offset: u64,
400 readings_len: u64,
401}
402
403impl FixedHeader {
404 fn parse(bytes: &[u8]) -> Result<Self, Error> {
405 if bytes.len() < FIXED_HEADER_LEN {
406 return Err(Error::ShortHeader {
407 actual: bytes.len(),
408 });
409 }
410 if &bytes[..8] != MAGIC {
411 return Err(Error::InvalidMagic);
412 }
413 let version = read_u32(&bytes[8..12]);
414 if version != FORMAT_VERSION {
415 tracing::error!(
416 version,
417 expected = FORMAT_VERSION,
418 "unsupported FST format version"
419 );
420 return Err(Error::UnsupportedVersion { version });
421 }
422 let header_len = read_u32(&bytes[12..16]);
423 if header_len != FIXED_HEADER_LEN as u32 {
424 return Err(Error::UnsupportedHeaderLength { header_len });
425 }
426 let mut cursor = Cursor::new(&bytes[16..FIXED_HEADER_LEN]);
427 Ok(Self {
428 metadata_offset: read_next_u64(&mut cursor)?,
429 metadata_len: read_next_u64(&mut cursor)?,
430 fst_offset: read_next_u64(&mut cursor)?,
431 fst_len: read_next_u64(&mut cursor)?,
432 readings_offset: read_next_u64(&mut cursor)?,
433 readings_len: read_next_u64(&mut cursor)?,
434 })
435 }
436}
437
438#[derive(Clone, Copy, Debug)]
439struct KeyIsPrefixOf<'a> {
440 bytes: &'a [u8],
441}
442
443impl<'a> KeyIsPrefixOf<'a> {
444 fn new(bytes: &'a [u8]) -> Self {
445 Self { bytes }
446 }
447}
448
449impl Automaton for KeyIsPrefixOf<'_> {
450 type State = Option<usize>;
451
452 fn start(&self) -> Self::State {
453 Some(0)
454 }
455
456 fn is_match(&self, state: &Self::State) -> bool {
457 state.is_some()
458 }
459
460 fn can_match(&self, state: &Self::State) -> bool {
461 state.is_some()
462 }
463
464 fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
465 let position = (*state)?;
466 if self.bytes.get(position).copied() == Some(byte) {
467 Some(position + 1)
468 } else {
469 None
470 }
471 }
472}
473
474fn decode_value(value: u64) -> (u16, MatchMark, u64) {
475 let reading_len = (value & VALUE_READING_LEN_MASK) as u16;
476 let mark = decode_mark(((value >> VALUE_MARK_SHIFT) & 0xff) as u8);
477 let reading_offset = value >> VALUE_OFFSET_SHIFT;
478 (reading_len, mark, reading_offset)
479}
480
481fn decode_mark(encoded: u8) -> MatchMark {
482 MatchMark {
483 require_hanja: encoded & MARK_REQUIRE_HANJA != 0,
484 require_hangul: encoded & MARK_REQUIRE_HANGUL != 0,
485 }
486}
487
/// Returns the metadata value stored under `key` parsed as a `u64`, or `None`
/// when the key is absent or its value does not parse.
fn parse_u64_metadata(metadata: &BTreeMap<String, String>, key: &str) -> Option<u64> {
    let raw = metadata.get(key)?;
    raw.parse::<u64>().ok()
}
491
/// Returns the metadata value stored under `key` parsed as a `usize`, or
/// `None` when the key is absent or its value does not parse.
fn parse_usize_metadata(metadata: &BTreeMap<String, String>, key: &str) -> Option<usize> {
    let raw = metadata.get(key)?;
    raw.parse::<usize>().ok()
}
495
496fn max_key_chars_from_map<D>(map: &Map<D>) -> Option<usize>
497where
498 D: AsRef<[u8]>,
499{
500 let mut stream = map.keys();
501 let mut max = None;
502 while let Some(key) = stream.next() {
503 let Ok(key) = std::str::from_utf8(key) else {
504 continue;
505 };
506 let chars = key.chars().count();
507 max = Some(max.map_or(chars, |current: usize| current.max(chars)));
508 }
509 max
510}
511
/// Decodes a little-endian `u32` from a four-byte slice.
///
/// # Panics
///
/// Panics when `bytes` is not exactly four bytes long.
fn read_u32(bytes: &[u8]) -> u32 {
    let raw = <[u8; 4]>::try_from(bytes).expect("slice has exactly four bytes");
    u32::from_le_bytes(raw)
}
515
516fn read_next_u64(cursor: &mut Cursor<&[u8]>) -> Result<u64, Error> {
517 let mut bytes = [0; 8];
518 cursor
519 .read_exact(&mut bytes)
520 .map_err(|source| Error::HeaderRead { source })?;
521 Ok(u64::from_le_bytes(bytes))
522}
523
/// Returns `bytes[offset..offset + len]`, or `None` when the range does not
/// fit in `usize` or extends past the end of `bytes`.
fn checked_slice(bytes: &[u8], offset: u64, len: u64) -> Option<&[u8]> {
    let start = usize::try_from(offset).ok()?;
    let span = usize::try_from(len).ok()?;
    let end = start.checked_add(span)?;
    bytes.get(start..end)
}
529
530#[cfg(test)]
531mod tests {
532 use std::collections::BTreeMap;
533 use std::fs;
534
535 use ciborium::ser::into_writer;
536 use fst::MapBuilder;
537 use gukhanmun_core::{MapDictionary, RenderMode, convert_plain_text};
538 use proptest::prelude::*;
539 use tempfile::tempdir;
540 use tracing_test::traced_test;
541
542 use super::{FstDictionary, HanjaDictionary, MatchMark};
543
544 const MAGIC: &[u8; 8] = b"GUKHMFST";
545 const FORMAT_VERSION: u32 = 1;
546 const FIXED_HEADER_LEN: usize = 64;
547 const MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
548 const MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;
549 const VALUE_MARK_SHIFT: u64 = 16;
550 const VALUE_OFFSET_SHIFT: u64 = 24;
551
552 #[test]
553 fn loads_valid_bytes_metadata_and_lookup() {
554 let bytes = fixture_bytes(&[
555 entry("天地", "천지", false, false),
556 entry("漢字", "한자", true, false),
557 entry("色깔論", "색깔론", false, true),
558 ]);
559
560 let dictionary = FstDictionary::from_bytes(&bytes).unwrap();
561
562 assert_eq!(dictionary.entry_count(), 3);
563 assert_eq!(dictionary.metadata().get("source").unwrap(), "fixture");
564 assert_eq!(dictionary.max_word_chars(), Some(3));
565 let hanja = dictionary.lookup("漢字").unwrap().unwrap();
566 assert_eq!(hanja.reading(), "한자");
567 assert!(hanja.mark().require_hanja);
568 assert!(!hanja.mark().require_hangul);
569 let mixed = dictionary.lookup("色깔論").unwrap().unwrap();
570 assert_eq!(mixed.reading(), "색깔론");
571 assert!(!mixed.mark().require_hanja);
572 assert!(mixed.mark().require_hangul);
573 }
574
575 #[test]
576 fn open_reads_a_dictionary_file() {
577 let temp = tempdir().unwrap();
578 let path = temp.path().join("dict.gukfst");
579 fs::write(&path, fixture_bytes(&[entry("天地", "천지", false, false)])).unwrap();
580
581 let dictionary = FstDictionary::open(&path).unwrap();
582
583 assert_eq!(
584 dictionary.lookup("天地").unwrap().unwrap().reading(),
585 "천지"
586 );
587 }
588
589 #[test]
590 fn from_static_bytes_matches_owned_loading() {
591 let bytes = fixture_bytes(&[
592 entry("天地", "천지", false, false),
593 entry("漢字", "한자", true, false),
594 entry("色깔論", "색깔론", false, true),
595 ]);
596 let static_bytes = Box::leak(bytes.clone().into_boxed_slice());
597 let owned = FstDictionary::from_bytes(&bytes).unwrap();
598 let static_dict = FstDictionary::from_static_bytes(static_bytes).unwrap();
599
600 assert_equivalent_dictionaries(&owned, &static_dict);
601 }
602
603 #[test]
604 fn dictionary_is_send_sync() {
605 fn assert_send_sync<T: Send + Sync>() {}
606
607 assert_send_sync::<FstDictionary>();
608 }
609
610 #[traced_test]
611 #[test]
612 fn unsupported_version_emits_error_event() {
613 let valid = fixture_bytes(&[entry("天地", "천지", false, false)]);
614 let mut bad_version = valid.clone();
615 bad_version[8..12].copy_from_slice(&999u32.to_le_bytes());
616
617 let result = FstDictionary::from_bytes(&bad_version);
618
619 assert!(matches!(
620 result.unwrap_err(),
621 super::Error::UnsupportedVersion { version: 999 }
622 ));
623 assert!(logs_contain("unsupported FST format version"));
624 }
625
626 #[test]
627 fn rejects_malformed_headers() {
628 let valid = fixture_bytes(&[entry("天地", "천지", false, false)]);
629 let mut bad_magic = valid.clone();
630 bad_magic[0] = b'X';
631 assert!(matches!(
632 FstDictionary::from_bytes(&bad_magic).unwrap_err(),
633 super::Error::InvalidMagic
634 ));
635
636 let mut bad_version = valid.clone();
637 bad_version[8..12].copy_from_slice(&999u32.to_le_bytes());
638 assert!(matches!(
639 FstDictionary::from_bytes(&bad_version).unwrap_err(),
640 super::Error::UnsupportedVersion { version: 999 }
641 ));
642
643 let truncated = &valid[..valid.len() - 1];
644 assert!(matches!(
645 FstDictionary::from_bytes(truncated).unwrap_err(),
646 super::Error::SectionOutOfBounds {
647 section: "readings"
648 }
649 ));
650 }
651
652 #[test]
653 fn decode_errors_preserve_structured_variants_and_sources() {
654 let mut invalid_metadata = fixture_bytes(&[entry("天地", "천지", false, false)]);
655 let metadata_offset = FIXED_HEADER_LEN;
656 invalid_metadata[metadata_offset] = 0xff;
657 let metadata_error = FstDictionary::from_bytes(&invalid_metadata).unwrap_err();
658 assert!(matches!(
659 metadata_error,
660 super::Error::MetadataDecode { .. }
661 ));
662 assert!(std::error::Error::source(&metadata_error).is_some());
663
664 let mut invalid_reading = fixture_bytes(&[entry("天地", "천지", false, false)]);
665 *invalid_reading.last_mut().unwrap() = 0xff;
666 let dictionary = FstDictionary::from_bytes(&invalid_reading).unwrap();
667 let utf8_error = dictionary.lookup("天地").unwrap_err();
668 assert!(matches!(
669 utf8_error,
670 super::Error::InvalidUtf8 {
671 field: "reading",
672 ..
673 }
674 ));
675 assert!(std::error::Error::source(&utf8_error).is_some());
676 }
677
678 #[test]
679 fn matches_at_returns_every_prefix_match() {
680 let dictionary = FstDictionary::from_bytes(&fixture_bytes(&[
681 entry("行事", "행사", false, false),
682 entry("行事場", "행사장", false, false),
683 entry("場所", "장소", false, false),
684 ]))
685 .unwrap();
686
687 let matches = dictionary.matches_at("行事場入口").collect::<Vec<_>>();
688
689 assert_eq!(matches.len(), 2);
690 assert_eq!(matches[0].byte_len, "行事".len());
691 assert_eq!(matches[0].reading, "행사");
692 assert_eq!(matches[1].byte_len, "行事場".len());
693 assert_eq!(matches[1].reading, "행사장");
694 }
695
696 #[test]
697 fn has_homophone_detects_other_forms_with_same_reading() {
698 let dictionary = FstDictionary::from_bytes(&fixture_bytes(&[
699 entry("漢字", "한자", false, false),
700 entry("翰字", "한자", false, false),
701 entry("天地", "천지", false, false),
702 ]))
703 .unwrap();
704
705 assert!(dictionary.has_homophone("漢字", "한자"));
706 assert!(!dictionary.has_homophone("天地", "천지"));
707 }
708
709 #[test]
710 fn lattice_regressions_pass_with_fst_backend() {
711 let dictionary = FstDictionary::from_bytes(&fixture_bytes(&[
712 entry("行事", "행사", false, false),
713 entry("行事場", "행사장", false, false),
714 entry("場所", "장소", false, false),
715 entry("入口", "입구", false, false),
716 entry("汽車길", "기찻길", false, false),
717 ]))
718 .unwrap();
719
720 assert_eq!(
721 convert_plain_text("行事場入口", &dictionary, RenderMode::HangulHanjaParens),
722 "행사장(行事場)입구(入口)"
723 );
724 assert_eq!(
725 convert_plain_text("行事場所", &dictionary, RenderMode::HangulHanjaParens),
726 "행사(行事)장소(場所)"
727 );
728 assert_eq!(
729 convert_plain_text("汽車길", &dictionary, RenderMode::HangulHanjaParens),
730 "기찻길(汽車길)"
731 );
732 }
733
734 proptest! {
735 #[test]
736 fn generated_fst_matches_map_dictionary(entries in unique_entries()) {
737 let bytes = fixture_bytes(
738 &entries
739 .iter()
740 .map(|(hanja, reading, require_hanja, require_hangul)| {
741 entry(hanja, reading, *require_hanja, *require_hangul)
742 })
743 .collect::<Vec<_>>()
744 );
745 let fst = FstDictionary::from_bytes(&bytes).unwrap();
746 let mut map = MapDictionary::new();
747 for (hanja, reading, require_hanja, require_hangul) in entries {
748 map.insert_marked(
749 &hanja,
750 &reading,
751 MatchMark {
752 require_hanja,
753 require_hangul,
754 },
755 );
756 let fst_matches = fst.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
757 let map_matches = map.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
758 prop_assert_eq!(fst_matches, map_matches);
759 let lookup = fst.lookup(&hanja).unwrap().unwrap();
760 prop_assert_eq!(lookup.reading(), reading.as_str());
761 }
762 }
763 }
764
765 #[derive(Clone, Copy, Debug)]
766 struct TestEntry<'a> {
767 hanja: &'a str,
768 reading: &'a str,
769 mark: MatchMark,
770 }
771
772 fn entry<'a>(
773 hanja: &'a str,
774 reading: &'a str,
775 require_hanja: bool,
776 require_hangul: bool,
777 ) -> TestEntry<'a> {
778 TestEntry {
779 hanja,
780 reading,
781 mark: MatchMark {
782 require_hanja,
783 require_hangul,
784 },
785 }
786 }
787
788 fn fixture_bytes(entries: &[TestEntry<'_>]) -> Vec<u8> {
789 let mut metadata = BTreeMap::new();
790 metadata.insert("source".to_owned(), "fixture".to_owned());
791 metadata.insert("license".to_owned(), "CC0-1.0".to_owned());
792 metadata.insert("build_date".to_owned(), "1970-01-01T00:00:00Z".to_owned());
793 metadata.insert("entry_count".to_owned(), entries.len().to_string());
794 metadata.insert("version".to_owned(), FORMAT_VERSION.to_string());
795 metadata.insert(
796 "max_word_chars".to_owned(),
797 entries
798 .iter()
799 .map(|entry| entry.hanja.chars().count())
800 .max()
801 .unwrap_or(0)
802 .to_string(),
803 );
804 metadata.insert(
805 "max_key_bytes".to_owned(),
806 entries
807 .iter()
808 .map(|entry| entry.hanja.len())
809 .max()
810 .unwrap_or(0)
811 .to_string(),
812 );
813 let mut metadata_bytes = Vec::new();
814 into_writer(&metadata, &mut metadata_bytes).unwrap();
815
816 let mut readings = Vec::new();
817 let mut builder = MapBuilder::memory();
818 let mut sorted = entries.to_vec();
819 sorted.sort_by(|left, right| left.hanja.cmp(right.hanja));
820 for entry in sorted {
821 let reading_offset = readings.len() as u64;
822 let value = (entry.reading.len() as u64)
823 | (u64::from(encode_mark(entry.mark)) << VALUE_MARK_SHIFT)
824 | (reading_offset << VALUE_OFFSET_SHIFT);
825 builder.insert(entry.hanja.as_bytes(), value).unwrap();
826 readings.extend_from_slice(entry.reading.as_bytes());
827 }
828 let fst_bytes = builder.into_inner().unwrap();
829
830 let metadata_offset = FIXED_HEADER_LEN as u64;
831 let fst_offset = metadata_offset + metadata_bytes.len() as u64;
832 let readings_offset = fst_offset + fst_bytes.len() as u64;
833 let mut output = Vec::new();
834 output.extend_from_slice(MAGIC);
835 output.extend_from_slice(&FORMAT_VERSION.to_le_bytes());
836 output.extend_from_slice(&(FIXED_HEADER_LEN as u32).to_le_bytes());
837 output.extend_from_slice(&metadata_offset.to_le_bytes());
838 output.extend_from_slice(&(metadata_bytes.len() as u64).to_le_bytes());
839 output.extend_from_slice(&fst_offset.to_le_bytes());
840 output.extend_from_slice(&(fst_bytes.len() as u64).to_le_bytes());
841 output.extend_from_slice(&readings_offset.to_le_bytes());
842 output.extend_from_slice(&(readings.len() as u64).to_le_bytes());
843 output.extend(metadata_bytes);
844 output.extend(fst_bytes);
845 output.extend(readings);
846 output
847 }
848
849 fn encode_mark(mark: MatchMark) -> u8 {
850 let mut encoded = 0;
851 if mark.require_hanja {
852 encoded |= MARK_REQUIRE_HANJA;
853 }
854 if mark.require_hangul {
855 encoded |= MARK_REQUIRE_HANGUL;
856 }
857 encoded
858 }
859
860 fn assert_equivalent_dictionaries(left: &FstDictionary, right: &FstDictionary) {
861 assert_eq!(left.metadata(), right.metadata());
862 assert_eq!(left.entry_count(), right.entry_count());
863 assert_eq!(left.max_word_chars(), right.max_word_chars());
864 for key in ["天地", "漢字", "色깔論"] {
865 assert_eq!(left.lookup(key).unwrap(), right.lookup(key).unwrap());
866 }
867 assert_eq!(
868 left.matches_at("色깔論이다").collect::<Vec<_>>(),
869 right.matches_at("色깔論이다").collect::<Vec<_>>()
870 );
871 assert_eq!(
872 left.entries().unwrap().collect::<Vec<_>>(),
873 right.entries().unwrap().collect::<Vec<_>>()
874 );
875 assert_eq!(
876 left.has_homophone("漢字", "한자"),
877 right.has_homophone("漢字", "한자")
878 );
879 }
880
881 fn unique_entries() -> impl Strategy<Value = Vec<(String, String, bool, bool)>> {
882 proptest::collection::btree_map(
883 "[一-龥]{1,3}",
884 ("[가-힣]{1,4}", any::<bool>(), any::<bool>()),
885 1..16,
886 )
887 .prop_map(|entries| {
888 entries
889 .into_iter()
890 .map(|(hanja, (reading, require_hanja, require_hangul))| {
891 (hanja, reading, require_hanja, require_hangul)
892 })
893 .collect()
894 })
895 }
896}