Skip to main content

gukhanmun_fst/
lib.rs

1// Gukhanmun: FST dictionary backend for Gukhanmun.
2// Copyright (C) 2026  Hong Minhee
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17//! FST dictionary backend for Gukhanmun.
18
19#![deny(missing_docs)]
20#![deny(unsafe_code)]
21#![deny(unsafe_op_in_unsafe_fn)]
22
23use std::collections::BTreeMap;
24use std::fs;
25use std::io::{Cursor, Read};
26use std::ops::Range;
27use std::path::Path;
28use std::sync::Arc;
29
30use ciborium::de::from_reader;
31use fst::automaton::Automaton;
32use fst::{IntoStreamer, Map, Streamer};
33use gukhanmun_core::{DictionaryRecord, HanjaDictionary, Match, MatchMark};
34
/// Magic bytes that must open the fixed header of a Gukhanmun FST file.
const MAGIC: &[u8; 8] = b"GUKHMFST";
/// The only file format version accepted by the header parser.
const FORMAT_VERSION: u32 = 1;
/// Size in bytes of the fixed header, and the value its header-length field must hold.
const FIXED_HEADER_LEN: usize = 64;
/// Bit of the packed mark byte that sets `MatchMark::require_hanja`.
const MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
/// Bit of the packed mark byte that sets `MatchMark::require_hangul`.
const MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;
/// Mask selecting the low 16 bits of a packed FST value: the reading length in bytes.
const VALUE_READING_LEN_MASK: u64 = 0xffff;
/// Right shift that moves the mark byte of a packed FST value into the low 8 bits.
const VALUE_MARK_SHIFT: u64 = 16;
/// Right shift that leaves only the reading-table byte offset of a packed FST value.
const VALUE_OFFSET_SHIFT: u64 = 24;
43
/// Dictionary backed by a Gukhanmun FST file.
///
/// The fixed header and CBOR metadata are decoded eagerly.  The FST map bytes
/// and reading table share one backing byte source: owned heap bytes for
/// [`FstDictionary::open`] and [`FstDictionary::from_bytes`], or static bytes
/// for [`FstDictionary::from_static_bytes`].
#[derive(Clone, Debug)]
pub struct FstDictionary {
    // String key/value pairs decoded from the CBOR metadata section.
    metadata: BTreeMap<String, String>,
    // FST map from key bytes to packed `u64` values; each value carries a
    // reading length, a mark byte, and an offset into `readings`.
    map: Map<ByteSection>,
    // Reading table section that the packed values index into by byte range.
    readings: ByteSection,
    // The `entry_count` metadata value when it parses as `u64`; otherwise the
    // number of keys in `map`.
    entry_count: u64,
    // The `max_word_chars` metadata value when it parses as `usize`;
    // otherwise the longest UTF-8 key in `map`, measured in characters.
    max_word_chars: Option<usize>,
}
58
59impl FstDictionary {
60    /// Opens a dictionary file from disk.
61    pub fn open(path: impl AsRef<Path>) -> Result<Self, Error> {
62        let path = path.as_ref();
63        tracing::info!(path = %path.display(), "opening FST dictionary");
64        let bytes = fs::read(path).map_err(|source| Error::Io {
65            path: path.display().to_string(),
66            source,
67        })?;
68        Self::from_bytes(&bytes)
69    }
70
71    /// Decodes a dictionary from bytes in the Gukhanmun FST file format.
72    pub fn from_bytes(bytes: &[u8]) -> Result<Self, Error> {
73        Self::from_source(ByteSource::Owned(Arc::<[u8]>::from(bytes)))
74    }
75
76    /// Decodes a dictionary from static bytes in the Gukhanmun FST file format.
77    ///
78    /// This is intended for embedded dictionaries built with `include_bytes!`.
79    /// The FST map and reading table borrow from the static byte slice without
80    /// copying either section.
81    pub fn from_static_bytes(bytes: &'static [u8]) -> Result<Self, Error> {
82        Self::from_source(ByteSource::Static(bytes))
83    }
84
85    fn from_source(source: ByteSource) -> Result<Self, Error> {
86        let bytes = source.as_ref();
87        let header = FixedHeader::parse(bytes)?;
88        let metadata_bytes = checked_slice(bytes, header.metadata_offset, header.metadata_len)
89            .ok_or(Error::SectionOutOfBounds {
90                section: "metadata",
91            })?;
92        let metadata = from_reader::<BTreeMap<String, String>, _>(metadata_bytes)
93            .map_err(|source| Error::MetadataDecode { source })?;
94        let fst_bytes = source
95            .section(header.fst_offset, header.fst_len)
96            .ok_or(Error::SectionOutOfBounds { section: "FST" })?;
97        let readings = source
98            .section(header.readings_offset, header.readings_len)
99            .ok_or(Error::SectionOutOfBounds {
100                section: "readings",
101            })?;
102        let map = Map::new(fst_bytes).map_err(|source| Error::FstDecode { source })?;
103        let entry_count = parse_u64_metadata(&metadata, "entry_count")
104            .unwrap_or_else(|| u64::try_from(map.len()).unwrap_or(u64::MAX));
105        let max_word_chars = parse_usize_metadata(&metadata, "max_word_chars")
106            .or_else(|| max_key_chars_from_map(&map));
107
108        tracing::debug!(
109            byte_length = bytes.len(),
110            format_version = FORMAT_VERSION,
111            entry_count,
112            ?max_word_chars,
113            "decoded FST dictionary"
114        );
115        Ok(Self {
116            metadata,
117            map,
118            readings,
119            entry_count,
120            max_word_chars,
121        })
122    }
123
124    /// Returns build metadata embedded in the dictionary file.
125    pub fn metadata(&self) -> &BTreeMap<String, String> {
126        &self.metadata
127    }
128
129    /// Returns the number of entries recorded at build time.
130    pub fn entry_count(&self) -> u64 {
131        self.entry_count
132    }
133
134    /// Returns the exact dictionary entry for `hanja`, if present.
135    pub fn lookup(&self, hanja: &str) -> Result<Option<LookupEntry>, Error> {
136        let Some(encoded) = self.map.get(hanja.as_bytes()) else {
137            return Ok(None);
138        };
139        self.decode_entry(encoded).map(Some)
140    }
141
142    fn decode_entry(&self, encoded: u64) -> Result<LookupEntry, Error> {
143        let (reading_len, mark, reading_offset) = decode_value(encoded);
144        let reading_start =
145            usize::try_from(reading_offset).map_err(|_| Error::ValueOutOfRange {
146                field: "reading offset",
147            })?;
148        let reading_end =
149            reading_start
150                .checked_add(usize::from(reading_len))
151                .ok_or(Error::ValueOverflow {
152                    field: "reading range",
153                })?;
154        let reading_bytes = self
155            .readings
156            .as_ref()
157            .get(reading_start..reading_end)
158            .ok_or(Error::SectionOutOfBounds {
159                section: "reading table entry",
160            })?;
161        let reading = std::str::from_utf8(reading_bytes)
162            .map_err(|source| Error::InvalidUtf8 {
163                field: "reading",
164                source,
165            })?
166            .to_owned();
167
168        Ok(LookupEntry { reading, mark })
169    }
170}
171
// Backing storage for a loaded dictionary file.
#[derive(Clone, Debug)]
enum ByteSource {
    // Reference-counted heap bytes; used by `FstDictionary::open` and
    // `FstDictionary::from_bytes`.
    Owned(Arc<[u8]>),
    // Bytes borrowed for the `'static` lifetime; used by
    // `FstDictionary::from_static_bytes`.
    Static(&'static [u8]),
}
177
178impl ByteSource {
179    fn section(&self, offset: u64, len: u64) -> Option<ByteSection> {
180        let offset = usize::try_from(offset).ok()?;
181        let len = usize::try_from(len).ok()?;
182        let end = offset.checked_add(len)?;
183        (end <= self.as_ref().len()).then(|| ByteSection {
184            source: self.clone(),
185            range: offset..end,
186        })
187    }
188}
189
190impl AsRef<[u8]> for ByteSource {
191    fn as_ref(&self) -> &[u8] {
192        match self {
193            Self::Owned(bytes) => bytes,
194            Self::Static(bytes) => bytes,
195        }
196    }
197}
198
// A byte range inside a `ByteSource`, handed to `fst::Map` and used as the
// reading table.
#[derive(Clone, Debug)]
struct ByteSection {
    // Storage the range refers to.
    source: ByteSource,
    // Byte range within `source`; `ByteSource::section` only builds ranges
    // that end inside the source.
    range: Range<usize>,
}
204
205impl AsRef<[u8]> for ByteSection {
206    fn as_ref(&self) -> &[u8] {
207        &self.source.as_ref()[self.range.clone()]
208    }
209}
210
211impl HanjaDictionary for FstDictionary {
212    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
213        let mut stream = self
214            .map
215            .search(KeyIsPrefixOf::new(s.as_bytes()))
216            .into_stream();
217        let mut matches = Vec::new();
218        while let Some((key, encoded)) = stream.next() {
219            match self.decode_entry(encoded) {
220                Ok(entry) => {
221                    matches.push(Match {
222                        byte_len: key.len(),
223                        reading: entry.reading,
224                        mark: entry.mark,
225                    });
226                }
227                Err(error) => {
228                    if let Ok(key_str) = std::str::from_utf8(key) {
229                        tracing::warn!(key = key_str, error = ?error, "skipping FST entry with undecodable value");
230                    } else {
231                        tracing::warn!(key_len = key.len(), error = ?error, "skipping FST entry with non-UTF-8 key and undecodable value");
232                    }
233                }
234            }
235        }
236        matches.sort_by_key(|matched| matched.byte_len);
237        Box::new(matches.into_iter())
238    }
239
240    fn max_word_chars(&self) -> Option<usize> {
241        self.max_word_chars
242    }
243
244    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
245        let mut stream = self.map.stream();
246        let mut records = Vec::new();
247        while let Some((key, encoded)) = stream.next() {
248            let Ok(hanja) = std::str::from_utf8(key) else {
249                continue;
250            };
251            if let Ok(entry) = self.decode_entry(encoded) {
252                records.push(DictionaryRecord {
253                    hanja: hanja.to_owned(),
254                    reading: entry.reading,
255                    mark: entry.mark,
256                });
257            }
258        }
259        Some(Box::new(records.into_iter()))
260    }
261
262    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
263        let mut stream = self.map.stream();
264        while let Some((key, encoded)) = stream.next() {
265            if key == hanja.as_bytes() {
266                continue;
267            }
268            if self
269                .decode_entry(encoded)
270                .is_ok_and(|entry| entry.reading == reading)
271            {
272                return true;
273            }
274        }
275        false
276    }
277}
278
/// A decoded exact-match dictionary entry.
///
/// Values are produced by [`FstDictionary::lookup`]; the fields are private
/// and read through the accessor methods.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct LookupEntry {
    // Reading copied out of the reading table after UTF-8 validation.
    reading: String,
    // Flags decoded from the mark byte of the packed FST value.
    mark: MatchMark,
}
285
286impl LookupEntry {
287    /// Returns the hangul reading for the entry.
288    pub fn reading(&self) -> &str {
289        &self.reading
290    }
291
292    /// Returns dictionary-provided rendering constraints.
293    pub fn mark(&self) -> MatchMark {
294        self.mark
295    }
296}
297
/// Error returned while opening or decoding an FST dictionary.
///
/// Variants fall into three groups: file access ([`Error::Io`]), structural
/// problems found while loading (header, section ranges, metadata, FST map),
/// and problems found later while resolving a single packed entry value.
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum Error {
    /// Reading a dictionary file from disk failed.
    #[error("failed to read {path}: {source}")]
    Io {
        /// Path that failed to open or read.
        path: String,
        /// Underlying I/O error.
        #[source]
        source: std::io::Error,
    },

    /// The input is shorter than the fixed FST header.
    #[error("dictionary file is shorter than the fixed header: {actual} bytes")]
    ShortHeader {
        /// Number of bytes supplied by the caller.
        actual: usize,
    },

    /// The fixed header magic bytes do not identify a Gukhanmun FST file.
    #[error("invalid dictionary magic")]
    InvalidMagic,

    /// The file format version is not supported by this crate.
    #[error("unsupported dictionary version {version}")]
    UnsupportedVersion {
        /// Version read from the fixed header.
        version: u32,
    },

    /// The fixed header length field is not supported.
    ///
    /// The field must equal the size of the fixed header itself.
    #[error("unsupported dictionary header length {header_len}")]
    UnsupportedHeaderLength {
        /// Header length read from the fixed header.
        header_len: u32,
    },

    /// Reading the fixed header failed.
    ///
    /// Raised when one of the 64-bit offset or length fields cannot be read
    /// in full from the header bytes.
    #[error("failed to read dictionary header: {source}")]
    HeaderRead {
        /// Underlying read error.
        #[source]
        source: std::io::Error,
    },

    /// A section range from the header points outside the file.
    ///
    /// Also reported when a packed entry value names a byte range that lies
    /// outside the reading table.
    #[error("{section} range is outside the file")]
    SectionOutOfBounds {
        /// Name of the section that was out of bounds.
        section: &'static str,
    },

    /// CBOR metadata could not be decoded.
    ///
    /// The metadata section is decoded as a map from strings to strings.
    #[error("failed to decode dictionary metadata: {source}")]
    MetadataDecode {
        /// Underlying CBOR decode error.
        #[source]
        source: ciborium::de::Error<std::io::Error>,
    },

    /// The embedded FST map could not be decoded.
    #[error("failed to decode FST map: {source}")]
    FstDecode {
        /// Underlying FST decode error.
        #[source]
        source: fst::Error,
    },

    /// A packed FST value did not fit the host representation.
    ///
    /// Raised when a reading offset cannot be represented as `usize`.
    #[error("{field} is too large")]
    ValueOutOfRange {
        /// Field that exceeded its valid range.
        field: &'static str,
    },

    /// A packed FST value overflowed while computing a range.
    ///
    /// Raised when adding the reading length to the reading offset overflows
    /// `usize`.
    #[error("{field} overflow")]
    ValueOverflow {
        /// Field that overflowed.
        field: &'static str,
    },

    /// A UTF-8 string field was invalid.
    #[error("{field} contains invalid UTF-8: {source}")]
    InvalidUtf8 {
        /// Field that contained invalid UTF-8.
        field: &'static str,
        /// Underlying UTF-8 error.
        #[source]
        source: std::str::Utf8Error,
    },
}
392
// Section table read from the fixed header: six little-endian `u64` values
// stored in this order, starting at byte 16 of the file.  Offsets and lengths
// are in bytes; each range is bounds-checked against the whole input when the
// dictionary is loaded.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct FixedHeader {
    // Start of the CBOR metadata section.
    metadata_offset: u64,
    // Length of the CBOR metadata section.
    metadata_len: u64,
    // Start of the FST map section.
    fst_offset: u64,
    // Length of the FST map section.
    fst_len: u64,
    // Start of the reading table section.
    readings_offset: u64,
    // Length of the reading table section.
    readings_len: u64,
}
402
403impl FixedHeader {
404    fn parse(bytes: &[u8]) -> Result<Self, Error> {
405        if bytes.len() < FIXED_HEADER_LEN {
406            return Err(Error::ShortHeader {
407                actual: bytes.len(),
408            });
409        }
410        if &bytes[..8] != MAGIC {
411            return Err(Error::InvalidMagic);
412        }
413        let version = read_u32(&bytes[8..12]);
414        if version != FORMAT_VERSION {
415            tracing::error!(
416                version,
417                expected = FORMAT_VERSION,
418                "unsupported FST format version"
419            );
420            return Err(Error::UnsupportedVersion { version });
421        }
422        let header_len = read_u32(&bytes[12..16]);
423        if header_len != FIXED_HEADER_LEN as u32 {
424            return Err(Error::UnsupportedHeaderLength { header_len });
425        }
426        let mut cursor = Cursor::new(&bytes[16..FIXED_HEADER_LEN]);
427        Ok(Self {
428            metadata_offset: read_next_u64(&mut cursor)?,
429            metadata_len: read_next_u64(&mut cursor)?,
430            fst_offset: read_next_u64(&mut cursor)?,
431            fst_len: read_next_u64(&mut cursor)?,
432            readings_offset: read_next_u64(&mut cursor)?,
433            readings_len: read_next_u64(&mut cursor)?,
434        })
435    }
436}
437
// FST automaton that accepts exactly the keys that are byte prefixes of
// `bytes`; `matches_at` uses it to find every entry starting at a position.
#[derive(Clone, Copy, Debug)]
struct KeyIsPrefixOf<'a> {
    // The text whose prefixes are being searched for, as raw bytes.
    bytes: &'a [u8],
}
442
443impl<'a> KeyIsPrefixOf<'a> {
444    fn new(bytes: &'a [u8]) -> Self {
445        Self { bytes }
446    }
447}
448
449impl Automaton for KeyIsPrefixOf<'_> {
450    type State = Option<usize>;
451
452    fn start(&self) -> Self::State {
453        Some(0)
454    }
455
456    fn is_match(&self, state: &Self::State) -> bool {
457        state.is_some()
458    }
459
460    fn can_match(&self, state: &Self::State) -> bool {
461        state.is_some()
462    }
463
464    fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
465        let position = (*state)?;
466        if self.bytes.get(position).copied() == Some(byte) {
467            Some(position + 1)
468        } else {
469            None
470        }
471    }
472}
473
474fn decode_value(value: u64) -> (u16, MatchMark, u64) {
475    let reading_len = (value & VALUE_READING_LEN_MASK) as u16;
476    let mark = decode_mark(((value >> VALUE_MARK_SHIFT) & 0xff) as u8);
477    let reading_offset = value >> VALUE_OFFSET_SHIFT;
478    (reading_len, mark, reading_offset)
479}
480
481fn decode_mark(encoded: u8) -> MatchMark {
482    MatchMark {
483        require_hanja: encoded & MARK_REQUIRE_HANJA != 0,
484        require_hangul: encoded & MARK_REQUIRE_HANGUL != 0,
485    }
486}
487
/// Reads `key` from the metadata and parses it as a `u64`.
///
/// Returns `None` when the key is absent or its value is not a valid `u64`.
fn parse_u64_metadata(metadata: &BTreeMap<String, String>, key: &str) -> Option<u64> {
    let raw = metadata.get(key)?;
    raw.parse::<u64>().ok()
}
491
/// Reads `key` from the metadata and parses it as a `usize`.
///
/// Returns `None` when the key is absent or its value is not a valid `usize`.
fn parse_usize_metadata(metadata: &BTreeMap<String, String>, key: &str) -> Option<usize> {
    let raw = metadata.get(key)?;
    raw.parse::<usize>().ok()
}
495
496fn max_key_chars_from_map<D>(map: &Map<D>) -> Option<usize>
497where
498    D: AsRef<[u8]>,
499{
500    let mut stream = map.keys();
501    let mut max = None;
502    while let Some(key) = stream.next() {
503        let Ok(key) = std::str::from_utf8(key) else {
504            continue;
505        };
506        let chars = key.chars().count();
507        max = Some(max.map_or(chars, |current: usize| current.max(chars)));
508    }
509    max
510}
511
/// Decodes a little-endian `u32` from a four-byte slice.
///
/// # Panics
///
/// Panics when `bytes` is not exactly four bytes long; callers pass
/// fixed-width header ranges.
fn read_u32(bytes: &[u8]) -> u32 {
    let raw: [u8; 4] = bytes.try_into().expect("slice has exactly four bytes");
    u32::from_le_bytes(raw)
}
515
516fn read_next_u64(cursor: &mut Cursor<&[u8]>) -> Result<u64, Error> {
517    let mut bytes = [0; 8];
518    cursor
519        .read_exact(&mut bytes)
520        .map_err(|source| Error::HeaderRead { source })?;
521    Ok(u64::from_le_bytes(bytes))
522}
523
/// Borrows `len` bytes of `bytes` starting at `offset`.
///
/// Returns `None` when either number does not fit in `usize`, when their sum
/// overflows, or when the range reaches past the end of `bytes`.
fn checked_slice(bytes: &[u8], offset: u64, len: u64) -> Option<&[u8]> {
    match (usize::try_from(offset), usize::try_from(len)) {
        (Ok(start), Ok(span)) => start
            .checked_add(span)
            .and_then(|end| bytes.get(start..end)),
        _ => None,
    }
}
529
#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;
    use std::fs;

    use ciborium::ser::into_writer;
    use fst::MapBuilder;
    use gukhanmun_core::{MapDictionary, RenderMode, convert_plain_text};
    use proptest::prelude::*;
    use tempfile::tempdir;
    use tracing_test::traced_test;

    use super::{FstDictionary, HanjaDictionary, MatchMark};

    // The format constants are restated here rather than imported, so the
    // fixture writer below spells out the on-disk layout on its own.
    const MAGIC: &[u8; 8] = b"GUKHMFST";
    const FORMAT_VERSION: u32 = 1;
    const FIXED_HEADER_LEN: usize = 64;
    const MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
    const MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;
    const VALUE_MARK_SHIFT: u64 = 16;
    const VALUE_OFFSET_SHIFT: u64 = 24;

    #[test]
    fn loads_valid_bytes_metadata_and_lookup() {
        let bytes = fixture_bytes(&[
            entry("天地", "천지", false, false),
            entry("漢字", "한자", true, false),
            entry("色깔論", "색깔론", false, true),
        ]);

        let dictionary = FstDictionary::from_bytes(&bytes).unwrap();

        assert_eq!(dictionary.entry_count(), 3);
        assert_eq!(dictionary.metadata().get("source").unwrap(), "fixture");
        assert_eq!(dictionary.max_word_chars(), Some(3));
        let hanja = dictionary.lookup("漢字").unwrap().unwrap();
        assert_eq!(hanja.reading(), "한자");
        assert!(hanja.mark().require_hanja);
        assert!(!hanja.mark().require_hangul);
        let mixed = dictionary.lookup("色깔論").unwrap().unwrap();
        assert_eq!(mixed.reading(), "색깔론");
        assert!(!mixed.mark().require_hanja);
        assert!(mixed.mark().require_hangul);
    }

    #[test]
    fn open_reads_a_dictionary_file() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukfst");
        fs::write(&path, fixture_bytes(&[entry("天地", "천지", false, false)])).unwrap();

        let dictionary = FstDictionary::open(&path).unwrap();

        assert_eq!(
            dictionary.lookup("天地").unwrap().unwrap().reading(),
            "천지"
        );
    }

    #[test]
    fn from_static_bytes_matches_owned_loading() {
        let bytes = fixture_bytes(&[
            entry("天地", "천지", false, false),
            entry("漢字", "한자", true, false),
            entry("色깔論", "색깔론", false, true),
        ]);
        // Leaking is the simplest way to obtain `'static` bytes in a test.
        let static_bytes = Box::leak(bytes.clone().into_boxed_slice());
        let owned = FstDictionary::from_bytes(&bytes).unwrap();
        let static_dict = FstDictionary::from_static_bytes(static_bytes).unwrap();

        assert_equivalent_dictionaries(&owned, &static_dict);
    }

    #[test]
    fn dictionary_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}

        assert_send_sync::<FstDictionary>();
    }

    #[traced_test]
    #[test]
    fn unsupported_version_emits_error_event() {
        let valid = fixture_bytes(&[entry("天地", "천지", false, false)]);
        let mut bad_version = valid.clone();
        bad_version[8..12].copy_from_slice(&999u32.to_le_bytes());

        let result = FstDictionary::from_bytes(&bad_version);

        assert!(matches!(
            result.unwrap_err(),
            super::Error::UnsupportedVersion { version: 999 }
        ));
        assert!(logs_contain("unsupported FST format version"));
    }

    #[test]
    fn rejects_malformed_headers() {
        let valid = fixture_bytes(&[entry("天地", "천지", false, false)]);
        let mut bad_magic = valid.clone();
        bad_magic[0] = b'X';
        assert!(matches!(
            FstDictionary::from_bytes(&bad_magic).unwrap_err(),
            super::Error::InvalidMagic
        ));

        let mut bad_version = valid.clone();
        bad_version[8..12].copy_from_slice(&999u32.to_le_bytes());
        assert!(matches!(
            FstDictionary::from_bytes(&bad_version).unwrap_err(),
            super::Error::UnsupportedVersion { version: 999 }
        ));

        // The reading table is the last section, so dropping the final byte
        // pushes its range past the end of the input.
        let truncated = &valid[..valid.len() - 1];
        assert!(matches!(
            FstDictionary::from_bytes(truncated).unwrap_err(),
            super::Error::SectionOutOfBounds {
                section: "readings"
            }
        ));
    }

    #[test]
    fn rejects_short_input_and_unknown_header_length() {
        let valid = fixture_bytes(&[entry("天地", "천지", false, false)]);

        // Empty input and input one byte short of the fixed header must both
        // be reported with the number of bytes actually supplied.
        assert!(matches!(
            FstDictionary::from_bytes(&[]).unwrap_err(),
            super::Error::ShortHeader { actual: 0 }
        ));
        let short = &valid[..FIXED_HEADER_LEN - 1];
        assert!(matches!(
            FstDictionary::from_bytes(short).unwrap_err(),
            super::Error::ShortHeader { actual } if actual == FIXED_HEADER_LEN - 1
        ));

        // Bytes 12..16 hold the header-length field; any value other than the
        // fixed header size is rejected.
        let mut bad_header_len = valid.clone();
        bad_header_len[12..16].copy_from_slice(&128u32.to_le_bytes());
        assert!(matches!(
            FstDictionary::from_bytes(&bad_header_len).unwrap_err(),
            super::Error::UnsupportedHeaderLength { header_len: 128 }
        ));
    }

    #[test]
    fn decode_errors_preserve_structured_variants_and_sources() {
        // The metadata section starts right after the fixed header.
        let mut invalid_metadata = fixture_bytes(&[entry("天地", "천지", false, false)]);
        let metadata_offset = FIXED_HEADER_LEN;
        invalid_metadata[metadata_offset] = 0xff;
        let metadata_error = FstDictionary::from_bytes(&invalid_metadata).unwrap_err();
        assert!(matches!(
            metadata_error,
            super::Error::MetadataDecode { .. }
        ));
        assert!(std::error::Error::source(&metadata_error).is_some());

        // The last byte of the file belongs to the only reading; 0xff can
        // never appear in valid UTF-8.
        let mut invalid_reading = fixture_bytes(&[entry("天地", "천지", false, false)]);
        *invalid_reading.last_mut().unwrap() = 0xff;
        let dictionary = FstDictionary::from_bytes(&invalid_reading).unwrap();
        let utf8_error = dictionary.lookup("天地").unwrap_err();
        assert!(matches!(
            utf8_error,
            super::Error::InvalidUtf8 {
                field: "reading",
                ..
            }
        ));
        assert!(std::error::Error::source(&utf8_error).is_some());
    }

    #[test]
    fn matches_at_returns_every_prefix_match() {
        let dictionary = FstDictionary::from_bytes(&fixture_bytes(&[
            entry("行事", "행사", false, false),
            entry("行事場", "행사장", false, false),
            entry("場所", "장소", false, false),
        ]))
        .unwrap();

        let matches = dictionary.matches_at("行事場入口").collect::<Vec<_>>();

        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0].byte_len, "行事".len());
        assert_eq!(matches[0].reading, "행사");
        assert_eq!(matches[1].byte_len, "行事場".len());
        assert_eq!(matches[1].reading, "행사장");
    }

    #[test]
    fn has_homophone_detects_other_forms_with_same_reading() {
        let dictionary = FstDictionary::from_bytes(&fixture_bytes(&[
            entry("漢字", "한자", false, false),
            entry("翰字", "한자", false, false),
            entry("天地", "천지", false, false),
        ]))
        .unwrap();

        assert!(dictionary.has_homophone("漢字", "한자"));
        assert!(!dictionary.has_homophone("天地", "천지"));
    }

    #[test]
    fn lattice_regressions_pass_with_fst_backend() {
        let dictionary = FstDictionary::from_bytes(&fixture_bytes(&[
            entry("行事", "행사", false, false),
            entry("行事場", "행사장", false, false),
            entry("場所", "장소", false, false),
            entry("入口", "입구", false, false),
            entry("汽車길", "기찻길", false, false),
        ]))
        .unwrap();

        assert_eq!(
            convert_plain_text("行事場入口", &dictionary, RenderMode::HangulHanjaParens),
            "행사장(行事場)입구(入口)"
        );
        assert_eq!(
            convert_plain_text("行事場所", &dictionary, RenderMode::HangulHanjaParens),
            "행사(行事)장소(場所)"
        );
        assert_eq!(
            convert_plain_text("汽車길", &dictionary, RenderMode::HangulHanjaParens),
            "기찻길(汽車길)"
        );
    }

    proptest! {
        // The reference map is filled one entry at a time while the FST
        // already holds every entry.  `unique_entries` yields keys in
        // ascending order and a prefix always sorts before the longer key,
        // so by the time a key is queried the map already holds each of its
        // prefixes and both backends see the same candidates.
        #[test]
        fn generated_fst_matches_map_dictionary(entries in unique_entries()) {
            let bytes = fixture_bytes(
                &entries
                    .iter()
                    .map(|(hanja, reading, require_hanja, require_hangul)| {
                        entry(hanja, reading, *require_hanja, *require_hangul)
                    })
                    .collect::<Vec<_>>()
            );
            let fst = FstDictionary::from_bytes(&bytes).unwrap();
            let mut map = MapDictionary::new();
            for (hanja, reading, require_hanja, require_hangul) in entries {
                map.insert_marked(
                    &hanja,
                    &reading,
                    MatchMark {
                        require_hanja,
                        require_hangul,
                    },
                );
                let fst_matches = fst.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
                let map_matches = map.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
                prop_assert_eq!(fst_matches, map_matches);
                let lookup = fst.lookup(&hanja).unwrap().unwrap();
                prop_assert_eq!(lookup.reading(), reading.as_str());
            }
        }
    }

    /// One dictionary entry to be written into a fixture file.
    #[derive(Clone, Copy, Debug)]
    struct TestEntry<'a> {
        hanja: &'a str,
        reading: &'a str,
        mark: MatchMark,
    }

    /// Shorthand constructor for a [`TestEntry`].
    fn entry<'a>(
        hanja: &'a str,
        reading: &'a str,
        require_hanja: bool,
        require_hangul: bool,
    ) -> TestEntry<'a> {
        TestEntry {
            hanja,
            reading,
            mark: MatchMark {
                require_hanja,
                require_hangul,
            },
        }
    }

    /// Serializes `entries` into a complete dictionary file image: fixed
    /// header, CBOR metadata, FST map, then the reading table.
    fn fixture_bytes(entries: &[TestEntry<'_>]) -> Vec<u8> {
        let mut metadata = BTreeMap::new();
        metadata.insert("source".to_owned(), "fixture".to_owned());
        metadata.insert("license".to_owned(), "CC0-1.0".to_owned());
        metadata.insert("build_date".to_owned(), "1970-01-01T00:00:00Z".to_owned());
        metadata.insert("entry_count".to_owned(), entries.len().to_string());
        metadata.insert("version".to_owned(), FORMAT_VERSION.to_string());
        metadata.insert(
            "max_word_chars".to_owned(),
            entries
                .iter()
                .map(|entry| entry.hanja.chars().count())
                .max()
                .unwrap_or(0)
                .to_string(),
        );
        metadata.insert(
            "max_key_bytes".to_owned(),
            entries
                .iter()
                .map(|entry| entry.hanja.len())
                .max()
                .unwrap_or(0)
                .to_string(),
        );
        let mut metadata_bytes = Vec::new();
        into_writer(&metadata, &mut metadata_bytes).unwrap();

        let mut readings = Vec::new();
        let mut builder = MapBuilder::memory();
        // The FST builder requires keys in lexicographic byte order.
        let mut sorted = entries.to_vec();
        sorted.sort_by(|left, right| left.hanja.cmp(right.hanja));
        for entry in sorted {
            // Packed value: reading length in the low 16 bits, mark byte
            // above it, reading-table offset in the remaining high bits.
            let reading_offset = readings.len() as u64;
            let value = (entry.reading.len() as u64)
                | (u64::from(encode_mark(entry.mark)) << VALUE_MARK_SHIFT)
                | (reading_offset << VALUE_OFFSET_SHIFT);
            builder.insert(entry.hanja.as_bytes(), value).unwrap();
            readings.extend_from_slice(entry.reading.as_bytes());
        }
        let fst_bytes = builder.into_inner().unwrap();

        // Sections are laid out back to back immediately after the header.
        let metadata_offset = FIXED_HEADER_LEN as u64;
        let fst_offset = metadata_offset + metadata_bytes.len() as u64;
        let readings_offset = fst_offset + fst_bytes.len() as u64;
        let mut output = Vec::new();
        output.extend_from_slice(MAGIC);
        output.extend_from_slice(&FORMAT_VERSION.to_le_bytes());
        output.extend_from_slice(&(FIXED_HEADER_LEN as u32).to_le_bytes());
        output.extend_from_slice(&metadata_offset.to_le_bytes());
        output.extend_from_slice(&(metadata_bytes.len() as u64).to_le_bytes());
        output.extend_from_slice(&fst_offset.to_le_bytes());
        output.extend_from_slice(&(fst_bytes.len() as u64).to_le_bytes());
        output.extend_from_slice(&readings_offset.to_le_bytes());
        output.extend_from_slice(&(readings.len() as u64).to_le_bytes());
        output.extend(metadata_bytes);
        output.extend(fst_bytes);
        output.extend(readings);
        output
    }

    /// Packs a [`MatchMark`] into the mark byte used by the file format.
    fn encode_mark(mark: MatchMark) -> u8 {
        let mut encoded = 0;
        if mark.require_hanja {
            encoded |= MARK_REQUIRE_HANJA;
        }
        if mark.require_hangul {
            encoded |= MARK_REQUIRE_HANGUL;
        }
        encoded
    }

    /// Asserts that two dictionaries agree on every public query exercised
    /// by the shared three-entry fixture.
    fn assert_equivalent_dictionaries(left: &FstDictionary, right: &FstDictionary) {
        assert_eq!(left.metadata(), right.metadata());
        assert_eq!(left.entry_count(), right.entry_count());
        assert_eq!(left.max_word_chars(), right.max_word_chars());
        for key in ["天地", "漢字", "色깔論"] {
            assert_eq!(left.lookup(key).unwrap(), right.lookup(key).unwrap());
        }
        assert_eq!(
            left.matches_at("色깔論이다").collect::<Vec<_>>(),
            right.matches_at("色깔論이다").collect::<Vec<_>>()
        );
        assert_eq!(
            left.entries().unwrap().collect::<Vec<_>>(),
            right.entries().unwrap().collect::<Vec<_>>()
        );
        assert_eq!(
            left.has_homophone("漢字", "한자"),
            right.has_homophone("漢字", "한자")
        );
    }

    /// Generates 1 to 15 entries with distinct hanja keys, in ascending key
    /// order because they are drawn from a `BTreeMap`.
    fn unique_entries() -> impl Strategy<Value = Vec<(String, String, bool, bool)>> {
        proptest::collection::btree_map(
            "[一-龥]{1,3}",
            ("[가-힣]{1,4}", any::<bool>(), any::<bool>()),
            1..16,
        )
        .prop_map(|entries| {
            entries
                .into_iter()
                .map(|(hanja, (reading, require_hanja, require_hangul))| {
                    (hanja, reading, require_hanja, require_hangul)
                })
                .collect()
        })
    }
}