//! Reader for Hanja dictionaries stored as a single byte buffer.
//!
//! The buffer starts with a 64-byte fixed header: the magic `GUKHMFST`, a
//! little-endian `u32` format version, a little-endian `u32` header length,
//! and six little-endian `u64` values giving the offset and length of three
//! sections:
//!
//! * metadata, decoded with `ciborium` into a string-to-string map,
//! * an `fst` map keyed by the dictionary words, and
//! * a table of reading bytes that the packed FST values point into.
#![deny(missing_docs)]
#![deny(unsafe_code)]
#![deny(unsafe_op_in_unsafe_fn)]
22
23use std::collections::BTreeMap;
24use std::fs;
25use std::io::{Cursor, Read};
26use std::ops::Range;
27use std::path::Path;
28use std::sync::Arc;
29
30use ciborium::de::from_reader;
31use fst::automaton::Automaton;
32use fst::{IntoStreamer, Map, Streamer};
33use gukhanmun_core::{DictionaryRecord, HanjaDictionary, Match, MatchMark};
34
/// Magic bytes expected in the first eight bytes of a dictionary buffer.
const MAGIC: &[u8; 8] = b"GUKHMFST";
/// The only format version this reader accepts (header bytes 8..12).
const FORMAT_VERSION: u32 = 1;
/// Size of the fixed header in bytes; also the only accepted header-length
/// value (header bytes 12..16).
const FIXED_HEADER_LEN: usize = 64;
/// Bit of the encoded mark byte that sets `MatchMark::require_hanja`.
const MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
/// Bit of the encoded mark byte that sets `MatchMark::require_hangul`.
const MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;
/// Mask selecting the low 16 bits of an FST value: the reading length in bytes.
const VALUE_READING_LEN_MASK: u64 = 0xffff;
/// Right shift that moves the mark byte of an FST value into the low eight bits.
const VALUE_MARK_SHIFT: u64 = 16;
/// Right shift that leaves only the reading-table offset of an FST value.
const VALUE_OFFSET_SHIFT: u64 = 24;
43
44#[derive(Clone, Debug)]
51pub struct FstDictionary {
52 metadata: BTreeMap<String, String>,
53 map: Map<ByteSection>,
54 readings: ByteSection,
55 entry_count: u64,
56 max_word_chars: Option<usize>,
57}
58
59impl FstDictionary {
60 pub fn open(path: impl AsRef<Path>) -> Result<Self, Error> {
62 let path = path.as_ref();
63 tracing::info!(path = %path.display(), "opening FST dictionary");
64 let bytes = fs::read(path).map_err(|source| Error::Io {
65 path: path.display().to_string(),
66 source,
67 })?;
68 Self::from_bytes(&bytes)
69 }
70
71 pub fn from_bytes(bytes: &[u8]) -> Result<Self, Error> {
73 Self::from_source(ByteSource::Owned(Arc::<[u8]>::from(bytes)))
74 }
75
76 pub fn from_static_bytes(bytes: &'static [u8]) -> Result<Self, Error> {
82 Self::from_source(ByteSource::Static(bytes))
83 }
84
85 fn from_source(source: ByteSource) -> Result<Self, Error> {
86 let bytes = source.as_ref();
87 let header = FixedHeader::parse(bytes)?;
88 let metadata_bytes = checked_slice(bytes, header.metadata_offset, header.metadata_len)
89 .ok_or(Error::SectionOutOfBounds {
90 section: "metadata",
91 })?;
92 let metadata = from_reader::<BTreeMap<String, String>, _>(metadata_bytes)
93 .map_err(|source| Error::MetadataDecode { source })?;
94 let fst_bytes = source
95 .section(header.fst_offset, header.fst_len)
96 .ok_or(Error::SectionOutOfBounds { section: "FST" })?;
97 let readings = source
98 .section(header.readings_offset, header.readings_len)
99 .ok_or(Error::SectionOutOfBounds {
100 section: "readings",
101 })?;
102 let map = Map::new(fst_bytes).map_err(|source| Error::FstDecode { source })?;
103 let entry_count = parse_u64_metadata(&metadata, "entry_count")
104 .unwrap_or_else(|| u64::try_from(map.len()).unwrap_or(u64::MAX));
105 let max_word_chars = parse_usize_metadata(&metadata, "max_word_chars")
106 .or_else(|| max_key_chars_from_map(&map));
107
108 tracing::debug!(
109 byte_length = bytes.len(),
110 format_version = FORMAT_VERSION,
111 entry_count,
112 ?max_word_chars,
113 "decoded FST dictionary"
114 );
115 Ok(Self {
116 metadata,
117 map,
118 readings,
119 entry_count,
120 max_word_chars,
121 })
122 }
123
124 pub fn metadata(&self) -> &BTreeMap<String, String> {
126 &self.metadata
127 }
128
129 pub fn entry_count(&self) -> u64 {
131 self.entry_count
132 }
133
134 pub fn lookup(&self, hanja: &str) -> Result<Option<LookupEntry>, Error> {
136 let Some(encoded) = self.map.get(hanja.as_bytes()) else {
137 return Ok(None);
138 };
139 self.decode_entry(encoded).map(Some)
140 }
141
142 fn decode_entry(&self, encoded: u64) -> Result<LookupEntry, Error> {
143 let (reading_len, mark, reading_offset) = decode_value(encoded);
144 let reading_start =
145 usize::try_from(reading_offset).map_err(|_| Error::ValueOutOfRange {
146 field: "reading offset",
147 })?;
148 let reading_end =
149 reading_start
150 .checked_add(usize::from(reading_len))
151 .ok_or(Error::ValueOverflow {
152 field: "reading range",
153 })?;
154 let reading_bytes = self
155 .readings
156 .as_ref()
157 .get(reading_start..reading_end)
158 .ok_or(Error::SectionOutOfBounds {
159 section: "reading table entry",
160 })?;
161 let reading = std::str::from_utf8(reading_bytes)
162 .map_err(|source| Error::InvalidUtf8 {
163 field: "reading",
164 source,
165 })?
166 .to_owned();
167
168 Ok(LookupEntry { reading, mark })
169 }
170}
171
/// Backing storage for a dictionary: a shared heap buffer or a `'static` slice.
#[derive(Clone, Debug)]
enum ByteSource {
    /// Reference-counted buffer; clones share the same allocation.
    Owned(Arc<[u8]>),
    /// Data borrowed for the `'static` lifetime.
    Static(&'static [u8]),
}

impl ByteSource {
    /// Returns the window `offset..offset + len` over this source.
    ///
    /// Yields `None` when either value does not fit in `usize`, when their
    /// sum overflows, or when the window reaches past the end of the data.
    fn section(&self, offset: u64, len: u64) -> Option<ByteSection> {
        let start = usize::try_from(offset).ok()?;
        let end = start.checked_add(usize::try_from(len).ok()?)?;
        if end > self.as_ref().len() {
            return None;
        }
        Some(ByteSection {
            source: self.clone(),
            range: start..end,
        })
    }
}

impl AsRef<[u8]> for ByteSource {
    fn as_ref(&self) -> &[u8] {
        match self {
            Self::Owned(shared) => &shared[..],
            Self::Static(fixed) => *fixed,
        }
    }
}

/// A byte range inside a [`ByteSource`], stored together with a clone of that
/// source so the range can be resolved on demand.
#[derive(Clone, Debug)]
struct ByteSection {
    /// The storage the range refers to.
    source: ByteSource,
    /// Bounds of the section within `source`; checked by `ByteSource::section`.
    range: Range<usize>,
}

impl AsRef<[u8]> for ByteSection {
    fn as_ref(&self) -> &[u8] {
        let whole = self.source.as_ref();
        &whole[self.range.start..self.range.end]
    }
}
210
211impl HanjaDictionary for FstDictionary {
212 fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
213 let mut stream = self
214 .map
215 .search(KeyIsPrefixOf::new(s.as_bytes()))
216 .into_stream();
217 let mut matches = Vec::new();
218 while let Some((key, encoded)) = stream.next() {
219 match self.decode_entry(encoded) {
220 Ok(entry) => {
221 matches.push(Match {
222 byte_len: key.len(),
223 reading: entry.reading,
224 suffix_reading: None,
225 mark: entry.mark,
226 });
227 }
228 Err(error) => {
229 if let Ok(key_str) = std::str::from_utf8(key) {
230 tracing::warn!(key = key_str, error = ?error, "skipping FST entry with undecodable value");
231 } else {
232 tracing::warn!(key_len = key.len(), error = ?error, "skipping FST entry with non-UTF-8 key and undecodable value");
233 }
234 }
235 }
236 }
237 matches.sort_by_key(|matched| matched.byte_len);
238 Box::new(matches.into_iter())
239 }
240
241 fn max_word_chars(&self) -> Option<usize> {
242 self.max_word_chars
243 }
244
245 fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
246 let mut stream = self.map.stream();
247 let mut records = Vec::new();
248 while let Some((key, encoded)) = stream.next() {
249 let Ok(hanja) = std::str::from_utf8(key) else {
250 continue;
251 };
252 if let Ok(entry) = self.decode_entry(encoded) {
253 records.push(DictionaryRecord {
254 hanja: hanja.to_owned(),
255 reading: entry.reading,
256 mark: entry.mark,
257 });
258 }
259 }
260 Some(Box::new(records.into_iter()))
261 }
262
263 fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
264 let mut stream = self.map.stream();
265 while let Some((key, encoded)) = stream.next() {
266 if key == hanja.as_bytes() {
267 continue;
268 }
269 if self
270 .decode_entry(encoded)
271 .is_ok_and(|entry| entry.reading == reading)
272 {
273 return true;
274 }
275 }
276 false
277 }
278}
279
280#[derive(Clone, Debug, Eq, PartialEq)]
282pub struct LookupEntry {
283 reading: String,
284 mark: MatchMark,
285}
286
287impl LookupEntry {
288 pub fn reading(&self) -> &str {
290 &self.reading
291 }
292
293 pub fn mark(&self) -> MatchMark {
295 self.mark
296 }
297}
298
/// Errors produced while opening, decoding or querying an FST dictionary.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum Error {
    /// The dictionary file could not be read from disk.
    #[error("failed to read {path}: {source}")]
    Io {
        /// Display form of the path that was being read.
        path: String,
        /// The underlying I/O failure.
        #[source]
        source: std::io::Error,
    },

    /// The input has fewer bytes than the fixed header requires.
    #[error("dictionary file is shorter than the fixed header: {actual} bytes")]
    ShortHeader {
        /// Number of bytes that were actually available.
        actual: usize,
    },

    /// The first eight bytes are not the expected magic value.
    #[error("invalid dictionary magic")]
    InvalidMagic,

    /// The header names a format version this reader does not accept.
    #[error("unsupported dictionary version {version}")]
    UnsupportedVersion {
        /// The version number found in the header.
        version: u32,
    },

    /// The header length field differs from the fixed header size.
    #[error("unsupported dictionary header length {header_len}")]
    UnsupportedHeaderLength {
        /// The header length found in the header.
        header_len: u32,
    },

    /// Reading one of the section offset or length fields from the header failed.
    #[error("failed to read dictionary header: {source}")]
    HeaderRead {
        /// The underlying read failure.
        #[source]
        source: std::io::Error,
    },

    /// A section, or a reading referenced by an entry, lies outside its
    /// containing byte range.
    #[error("{section} range is outside the file")]
    SectionOutOfBounds {
        /// Name of the range that was out of bounds.
        section: &'static str,
    },

    /// The metadata section could not be decoded into a string map.
    #[error("failed to decode dictionary metadata: {source}")]
    MetadataDecode {
        /// The decoder's error.
        #[source]
        source: ciborium::de::Error<std::io::Error>,
    },

    /// The FST section is not a valid FST map.
    #[error("failed to decode FST map: {source}")]
    FstDecode {
        /// The error reported by the `fst` crate.
        #[source]
        source: fst::Error,
    },

    /// A value stored in the dictionary does not fit in `usize` on this platform.
    #[error("{field} is too large")]
    ValueOutOfRange {
        /// Name of the value that was too large.
        field: &'static str,
    },

    /// Arithmetic on values stored in the dictionary overflowed.
    #[error("{field} overflow")]
    ValueOverflow {
        /// Name of the computation that overflowed.
        field: &'static str,
    },

    /// Bytes that should be text are not valid UTF-8.
    #[error("{field} contains invalid UTF-8: {source}")]
    InvalidUtf8 {
        /// Name of the value that failed to decode.
        field: &'static str,
        /// The UTF-8 validation error.
        #[source]
        source: std::str::Utf8Error,
    },
}
393
394#[derive(Clone, Copy, Debug, Eq, PartialEq)]
395struct FixedHeader {
396 metadata_offset: u64,
397 metadata_len: u64,
398 fst_offset: u64,
399 fst_len: u64,
400 readings_offset: u64,
401 readings_len: u64,
402}
403
404impl FixedHeader {
405 fn parse(bytes: &[u8]) -> Result<Self, Error> {
406 if bytes.len() < FIXED_HEADER_LEN {
407 return Err(Error::ShortHeader {
408 actual: bytes.len(),
409 });
410 }
411 if &bytes[..8] != MAGIC {
412 return Err(Error::InvalidMagic);
413 }
414 let version = read_u32(&bytes[8..12]);
415 if version != FORMAT_VERSION {
416 tracing::error!(
417 version,
418 expected = FORMAT_VERSION,
419 "unsupported FST format version"
420 );
421 return Err(Error::UnsupportedVersion { version });
422 }
423 let header_len = read_u32(&bytes[12..16]);
424 if header_len != FIXED_HEADER_LEN as u32 {
425 return Err(Error::UnsupportedHeaderLength { header_len });
426 }
427 let mut cursor = Cursor::new(&bytes[16..FIXED_HEADER_LEN]);
428 Ok(Self {
429 metadata_offset: read_next_u64(&mut cursor)?,
430 metadata_len: read_next_u64(&mut cursor)?,
431 fst_offset: read_next_u64(&mut cursor)?,
432 fst_len: read_next_u64(&mut cursor)?,
433 readings_offset: read_next_u64(&mut cursor)?,
434 readings_len: read_next_u64(&mut cursor)?,
435 })
436 }
437}
438
439#[derive(Clone, Copy, Debug)]
440struct KeyIsPrefixOf<'a> {
441 bytes: &'a [u8],
442}
443
444impl<'a> KeyIsPrefixOf<'a> {
445 fn new(bytes: &'a [u8]) -> Self {
446 Self { bytes }
447 }
448}
449
450impl Automaton for KeyIsPrefixOf<'_> {
451 type State = Option<usize>;
452
453 fn start(&self) -> Self::State {
454 Some(0)
455 }
456
457 fn is_match(&self, state: &Self::State) -> bool {
458 state.is_some()
459 }
460
461 fn can_match(&self, state: &Self::State) -> bool {
462 state.is_some()
463 }
464
465 fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
466 let position = (*state)?;
467 if self.bytes.get(position).copied() == Some(byte) {
468 Some(position + 1)
469 } else {
470 None
471 }
472 }
473}
474
475fn decode_value(value: u64) -> (u16, MatchMark, u64) {
476 let reading_len = (value & VALUE_READING_LEN_MASK) as u16;
477 let mark = decode_mark(((value >> VALUE_MARK_SHIFT) & 0xff) as u8);
478 let reading_offset = value >> VALUE_OFFSET_SHIFT;
479 (reading_len, mark, reading_offset)
480}
481
482fn decode_mark(encoded: u8) -> MatchMark {
483 MatchMark {
484 require_hanja: encoded & MARK_REQUIRE_HANJA != 0,
485 require_hangul: encoded & MARK_REQUIRE_HANGUL != 0,
486 }
487}
488
/// Returns the metadata value stored under `key` parsed as a `u64`, or `None`
/// when the key is absent or the value does not parse.
fn parse_u64_metadata(metadata: &BTreeMap<String, String>, key: &str) -> Option<u64> {
    let raw = metadata.get(key)?;
    raw.parse::<u64>().ok()
}
492
/// Returns the metadata value stored under `key` parsed as a `usize`, or
/// `None` when the key is absent or the value does not parse.
fn parse_usize_metadata(metadata: &BTreeMap<String, String>, key: &str) -> Option<usize> {
    match metadata.get(key) {
        Some(raw) => raw.parse::<usize>().ok(),
        None => None,
    }
}
496
497fn max_key_chars_from_map<D>(map: &Map<D>) -> Option<usize>
498where
499 D: AsRef<[u8]>,
500{
501 let mut stream = map.keys();
502 let mut max = None;
503 while let Some(key) = stream.next() {
504 let Ok(key) = std::str::from_utf8(key) else {
505 continue;
506 };
507 let chars = key.chars().count();
508 max = Some(max.map_or(chars, |current: usize| current.max(chars)));
509 }
510 max
511}
512
/// Decodes a little-endian `u32` from a slice of exactly four bytes.
///
/// # Panics
///
/// Panics when `bytes` is not exactly four bytes long.
fn read_u32(bytes: &[u8]) -> u32 {
    let raw = <[u8; 4]>::try_from(bytes).expect("slice has exactly four bytes");
    u32::from_le_bytes(raw)
}
516
517fn read_next_u64(cursor: &mut Cursor<&[u8]>) -> Result<u64, Error> {
518 let mut bytes = [0; 8];
519 cursor
520 .read_exact(&mut bytes)
521 .map_err(|source| Error::HeaderRead { source })?;
522 Ok(u64::from_le_bytes(bytes))
523}
524
/// Returns `bytes[offset..offset + len]`, or `None` when either value does not
/// fit in `usize`, their sum overflows, or the range is out of bounds.
fn checked_slice(bytes: &[u8], offset: u64, len: u64) -> Option<&[u8]> {
    match (usize::try_from(offset), usize::try_from(len)) {
        (Ok(start), Ok(span)) => start
            .checked_add(span)
            .and_then(|end| bytes.get(start..end)),
        _ => None,
    }
}
530
#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;
    use std::fs;

    use ciborium::ser::into_writer;
    use fst::MapBuilder;
    use gukhanmun_core::{MapDictionary, RenderMode, convert_plain_text};
    use proptest::prelude::*;
    use tempfile::tempdir;
    use tracing_test::traced_test;

    use super::{FstDictionary, HanjaDictionary, MatchMark};

    // Local copies of the on-disk format constants used by the fixture writer
    // and by the tests that corrupt specific header fields.
    const MAGIC: &[u8; 8] = b"GUKHMFST";
    const FORMAT_VERSION: u32 = 1;
    const FIXED_HEADER_LEN: usize = 64;
    const MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
    const MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;
    const VALUE_MARK_SHIFT: u64 = 16;
    const VALUE_OFFSET_SHIFT: u64 = 24;

    /// A well-formed buffer exposes its metadata and decodes readings and marks.
    #[test]
    fn loads_valid_bytes_metadata_and_lookup() {
        let bytes = fixture_bytes(&[
            entry("天地", "천지", false, false),
            entry("漢字", "한자", true, false),
            entry("色깔論", "색깔론", false, true),
        ]);

        let dictionary = FstDictionary::from_bytes(&bytes).unwrap();

        assert_eq!(dictionary.entry_count(), 3);
        assert_eq!(dictionary.metadata().get("source").unwrap(), "fixture");
        assert_eq!(dictionary.max_word_chars(), Some(3));
        let hanja = dictionary.lookup("漢字").unwrap().unwrap();
        assert_eq!(hanja.reading(), "한자");
        assert!(hanja.mark().require_hanja);
        assert!(!hanja.mark().require_hangul);
        let mixed = dictionary.lookup("色깔論").unwrap().unwrap();
        assert_eq!(mixed.reading(), "색깔론");
        assert!(!mixed.mark().require_hanja);
        assert!(mixed.mark().require_hangul);
    }

    /// `open` reads a dictionary written to a temporary file.
    #[test]
    fn open_reads_a_dictionary_file() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukfst");
        fs::write(&path, fixture_bytes(&[entry("天地", "천지", false, false)])).unwrap();

        let dictionary = FstDictionary::open(&path).unwrap();

        assert_eq!(
            dictionary.lookup("天地").unwrap().unwrap().reading(),
            "천지"
        );
    }

    /// Loading from a `'static` slice behaves like loading from copied bytes.
    #[test]
    fn from_static_bytes_matches_owned_loading() {
        let bytes = fixture_bytes(&[
            entry("天地", "천지", false, false),
            entry("漢字", "한자", true, false),
            entry("色깔論", "색깔론", false, true),
        ]);
        // Leaking is the simplest way to obtain a `'static` slice in a test.
        let static_bytes = Box::leak(bytes.clone().into_boxed_slice());
        let owned = FstDictionary::from_bytes(&bytes).unwrap();
        let static_dict = FstDictionary::from_static_bytes(static_bytes).unwrap();

        assert_equivalent_dictionaries(&owned, &static_dict);
    }

    /// Compile-time check that the dictionary can be shared across threads.
    #[test]
    fn dictionary_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}

        assert_send_sync::<FstDictionary>();
    }

    /// An unsupported version is both returned as an error and logged.
    #[traced_test]
    #[test]
    fn unsupported_version_emits_error_event() {
        let mut bad_version = fixture_bytes(&[entry("天地", "천지", false, false)]);
        bad_version[8..12].copy_from_slice(&999u32.to_le_bytes());

        let result = FstDictionary::from_bytes(&bad_version);

        assert!(matches!(
            result.unwrap_err(),
            super::Error::UnsupportedVersion { version: 999 }
        ));
        assert!(logs_contain("unsupported FST format version"));
    }

    /// Each header validation step rejects its own kind of corruption.
    #[test]
    fn rejects_malformed_headers() {
        let valid = fixture_bytes(&[entry("天地", "천지", false, false)]);

        // One byte short of a complete fixed header.
        let short = &valid[..FIXED_HEADER_LEN - 1];
        assert!(matches!(
            FstDictionary::from_bytes(short).unwrap_err(),
            super::Error::ShortHeader { actual } if actual == FIXED_HEADER_LEN - 1
        ));

        let mut bad_magic = valid.clone();
        bad_magic[0] = b'X';
        assert!(matches!(
            FstDictionary::from_bytes(&bad_magic).unwrap_err(),
            super::Error::InvalidMagic
        ));

        let mut bad_version = valid.clone();
        bad_version[8..12].copy_from_slice(&999u32.to_le_bytes());
        assert!(matches!(
            FstDictionary::from_bytes(&bad_version).unwrap_err(),
            super::Error::UnsupportedVersion { version: 999 }
        ));

        let mut bad_header_len = valid.clone();
        bad_header_len[12..16].copy_from_slice(&128u32.to_le_bytes());
        assert!(matches!(
            FstDictionary::from_bytes(&bad_header_len).unwrap_err(),
            super::Error::UnsupportedHeaderLength { header_len: 128 }
        ));

        // The readings section is written last, so dropping the final byte
        // pushes only that section out of bounds.
        let truncated = &valid[..valid.len() - 1];
        assert!(matches!(
            FstDictionary::from_bytes(truncated).unwrap_err(),
            super::Error::SectionOutOfBounds {
                section: "readings"
            }
        ));
    }

    /// Decode failures keep their variant and expose the underlying cause.
    #[test]
    fn decode_errors_preserve_structured_variants_and_sources() {
        // Corrupt the first byte of the metadata section, which the fixture
        // writes directly after the fixed header.
        let mut invalid_metadata = fixture_bytes(&[entry("天地", "천지", false, false)]);
        let metadata_offset = FIXED_HEADER_LEN;
        invalid_metadata[metadata_offset] = 0xff;
        let metadata_error = FstDictionary::from_bytes(&invalid_metadata).unwrap_err();
        assert!(matches!(
            metadata_error,
            super::Error::MetadataDecode { .. }
        ));
        assert!(std::error::Error::source(&metadata_error).is_some());

        // Corrupt the last byte of the only reading; loading still succeeds
        // and the failure surfaces on lookup.
        let mut invalid_reading = fixture_bytes(&[entry("天地", "천지", false, false)]);
        *invalid_reading.last_mut().unwrap() = 0xff;
        let dictionary = FstDictionary::from_bytes(&invalid_reading).unwrap();
        let utf8_error = dictionary.lookup("天地").unwrap_err();
        assert!(matches!(
            utf8_error,
            super::Error::InvalidUtf8 {
                field: "reading",
                ..
            }
        ));
        assert!(std::error::Error::source(&utf8_error).is_some());
    }

    /// Every key that is a prefix of the input is returned, shortest first.
    #[test]
    fn matches_at_returns_every_prefix_match() {
        let dictionary = FstDictionary::from_bytes(&fixture_bytes(&[
            entry("行事", "행사", false, false),
            entry("行事場", "행사장", false, false),
            entry("場所", "장소", false, false),
        ]))
        .unwrap();

        let matches = dictionary.matches_at("行事場入口").collect::<Vec<_>>();

        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0].byte_len, "行事".len());
        assert_eq!(matches[0].reading, "행사");
        assert_eq!(matches[1].byte_len, "行事場".len());
        assert_eq!(matches[1].reading, "행사장");
    }

    /// A homophone is a different word with the same reading.
    #[test]
    fn has_homophone_detects_other_forms_with_same_reading() {
        let dictionary = FstDictionary::from_bytes(&fixture_bytes(&[
            entry("漢字", "한자", false, false),
            entry("翰字", "한자", false, false),
            entry("天地", "천지", false, false),
        ]))
        .unwrap();

        assert!(dictionary.has_homophone("漢字", "한자"));
        assert!(!dictionary.has_homophone("天地", "천지"));
    }

    /// Conversion results with this backend for three fixed inputs.
    #[test]
    fn lattice_regressions_pass_with_fst_backend() {
        let dictionary = FstDictionary::from_bytes(&fixture_bytes(&[
            entry("行事", "행사", false, false),
            entry("行事場", "행사장", false, false),
            entry("場所", "장소", false, false),
            entry("入口", "입구", false, false),
            entry("汽車길", "기찻길", false, false),
        ]))
        .unwrap();

        assert_eq!(
            convert_plain_text("行事場入口", &dictionary, RenderMode::HangulHanjaParens),
            "행사장(行事場)입구(入口)"
        );
        assert_eq!(
            convert_plain_text("行事場所", &dictionary, RenderMode::HangulHanjaParens),
            "행사(行事)장소(場所)"
        );
        assert_eq!(
            convert_plain_text("汽車길", &dictionary, RenderMode::HangulHanjaParens),
            "기찻길(汽車길)"
        );
    }

    proptest! {
        // The FST backend must agree with `MapDictionary` on generated data.
        //
        // The map is filled one entry at a time while the FST already holds
        // all of them. The comparison is still exact: `unique_entries` yields
        // entries in key order, so every key that is a prefix of `hanja` has
        // been inserted by the time `hanja` is probed.
        #[test]
        fn generated_fst_matches_map_dictionary(entries in unique_entries()) {
            let bytes = fixture_bytes(
                &entries
                    .iter()
                    .map(|(hanja, reading, require_hanja, require_hangul)| {
                        entry(hanja, reading, *require_hanja, *require_hangul)
                    })
                    .collect::<Vec<_>>()
            );
            let fst = FstDictionary::from_bytes(&bytes).unwrap();
            let mut map = MapDictionary::new();
            for (hanja, reading, require_hanja, require_hangul) in entries {
                map.insert_marked(
                    &hanja,
                    &reading,
                    MatchMark {
                        require_hanja,
                        require_hangul,
                    },
                );
                let probe = format!("{hanja}뒤");
                let fst_matches = fst.matches_at(&probe).collect::<Vec<_>>();
                let map_matches = map.matches_at(&probe).collect::<Vec<_>>();
                prop_assert_eq!(fst_matches, map_matches);
                let lookup = fst.lookup(&hanja).unwrap().unwrap();
                prop_assert_eq!(lookup.reading(), reading.as_str());
            }
        }
    }

    /// One dictionary entry to be written into a fixture.
    #[derive(Clone, Copy, Debug)]
    struct TestEntry<'a> {
        hanja: &'a str,
        reading: &'a str,
        mark: MatchMark,
    }

    /// Shorthand constructor for a [`TestEntry`].
    fn entry<'a>(
        hanja: &'a str,
        reading: &'a str,
        require_hanja: bool,
        require_hangul: bool,
    ) -> TestEntry<'a> {
        TestEntry {
            hanja,
            reading,
            mark: MatchMark {
                require_hanja,
                require_hangul,
            },
        }
    }

    /// Serializes `entries` into a complete dictionary buffer laid out as
    /// header, metadata, FST, readings.
    fn fixture_bytes(entries: &[TestEntry<'_>]) -> Vec<u8> {
        let mut metadata = BTreeMap::new();
        metadata.insert("source".to_owned(), "fixture".to_owned());
        metadata.insert("license".to_owned(), "CC0-1.0".to_owned());
        metadata.insert("build_date".to_owned(), "1970-01-01T00:00:00Z".to_owned());
        metadata.insert("entry_count".to_owned(), entries.len().to_string());
        metadata.insert("version".to_owned(), FORMAT_VERSION.to_string());
        metadata.insert(
            "max_word_chars".to_owned(),
            entries
                .iter()
                .map(|entry| entry.hanja.chars().count())
                .max()
                .unwrap_or(0)
                .to_string(),
        );
        metadata.insert(
            "max_key_bytes".to_owned(),
            entries
                .iter()
                .map(|entry| entry.hanja.len())
                .max()
                .unwrap_or(0)
                .to_string(),
        );
        let mut metadata_bytes = Vec::new();
        into_writer(&metadata, &mut metadata_bytes).unwrap();

        let mut readings = Vec::new();
        let mut builder = MapBuilder::memory();
        // Keys are inserted into the builder in sorted order.
        let mut sorted = entries.to_vec();
        sorted.sort_by(|left, right| left.hanja.cmp(right.hanja));
        for entry in sorted {
            // Pack length, mark and offset the way the reader unpacks them.
            let reading_offset = readings.len() as u64;
            let value = (entry.reading.len() as u64)
                | (u64::from(encode_mark(entry.mark)) << VALUE_MARK_SHIFT)
                | (reading_offset << VALUE_OFFSET_SHIFT);
            builder.insert(entry.hanja.as_bytes(), value).unwrap();
            readings.extend_from_slice(entry.reading.as_bytes());
        }
        let fst_bytes = builder.into_inner().unwrap();

        let metadata_offset = FIXED_HEADER_LEN as u64;
        let fst_offset = metadata_offset + metadata_bytes.len() as u64;
        let readings_offset = fst_offset + fst_bytes.len() as u64;
        let mut output = Vec::new();
        output.extend_from_slice(MAGIC);
        output.extend_from_slice(&FORMAT_VERSION.to_le_bytes());
        output.extend_from_slice(&(FIXED_HEADER_LEN as u32).to_le_bytes());
        output.extend_from_slice(&metadata_offset.to_le_bytes());
        output.extend_from_slice(&(metadata_bytes.len() as u64).to_le_bytes());
        output.extend_from_slice(&fst_offset.to_le_bytes());
        output.extend_from_slice(&(fst_bytes.len() as u64).to_le_bytes());
        output.extend_from_slice(&readings_offset.to_le_bytes());
        output.extend_from_slice(&(readings.len() as u64).to_le_bytes());
        output.extend(metadata_bytes);
        output.extend(fst_bytes);
        output.extend(readings);
        output
    }

    /// Packs a `MatchMark` into the mark byte stored in FST values.
    fn encode_mark(mark: MatchMark) -> u8 {
        let mut encoded = 0;
        if mark.require_hanja {
            encoded |= MARK_REQUIRE_HANJA;
        }
        if mark.require_hangul {
            encoded |= MARK_REQUIRE_HANGUL;
        }
        encoded
    }

    /// Asserts that two dictionaries answer every query used here identically.
    fn assert_equivalent_dictionaries(left: &FstDictionary, right: &FstDictionary) {
        assert_eq!(left.metadata(), right.metadata());
        assert_eq!(left.entry_count(), right.entry_count());
        assert_eq!(left.max_word_chars(), right.max_word_chars());
        for key in ["天地", "漢字", "色깔論"] {
            assert_eq!(left.lookup(key).unwrap(), right.lookup(key).unwrap());
        }
        assert_eq!(
            left.matches_at("色깔論이다").collect::<Vec<_>>(),
            right.matches_at("色깔論이다").collect::<Vec<_>>()
        );
        assert_eq!(
            left.entries().unwrap().collect::<Vec<_>>(),
            right.entries().unwrap().collect::<Vec<_>>()
        );
        assert_eq!(
            left.has_homophone("漢字", "한자"),
            right.has_homophone("漢字", "한자")
        );
    }

    /// Generates 1 to 15 entries with distinct Hanja keys, in key order.
    fn unique_entries() -> impl Strategy<Value = Vec<(String, String, bool, bool)>> {
        proptest::collection::btree_map(
            "[一-龥]{1,3}",
            ("[가-힣]{1,4}", any::<bool>(), any::<bool>()),
            1..16,
        )
        .prop_map(|entries| {
            entries
                .into_iter()
                .map(|(hanja, (reading, require_hanja, require_hangul))| {
                    (hanja, reading, require_hanja, require_hangul)
                })
                .collect()
        })
    }
}