Skip to main content

gukhanmun_fst/
lib.rs

1// Gukhanmun: FST dictionary backend for Gukhanmun.
2// Copyright (C) 2026  Hong Minhee
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, either version 3 of the License, or
7// (at your option) any later version.
8//
9// This program is distributed in the hope that it will be useful,
10// but WITHOUT ANY WARRANTY; without even the implied warranty of
11// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12// GNU General Public License for more details.
13//
14// You should have received a copy of the GNU General Public License
15// along with this program.  If not, see <https://www.gnu.org/licenses/>.
16
17//! FST dictionary backend for Gukhanmun.
18
19#![deny(missing_docs)]
20#![deny(unsafe_code)]
21#![deny(unsafe_op_in_unsafe_fn)]
22
23use std::collections::BTreeMap;
24use std::fs;
25use std::io::{Cursor, Read};
26use std::ops::Range;
27use std::path::Path;
28use std::sync::Arc;
29
30use ciborium::de::from_reader;
31use fst::automaton::Automaton;
32use fst::{IntoStreamer, Map, Streamer};
33use gukhanmun_core::{DictionaryRecord, HanjaDictionary, Match, MatchMark};
34
/// Magic bytes that must open every dictionary file.
const MAGIC: &[u8; 8] = b"GUKHMFST";
/// The only file format version the header parser accepts.
const FORMAT_VERSION: u32 = 1;
/// Size in bytes of the fixed header; the header's own length field must
/// hold this same value.
const FIXED_HEADER_LEN: usize = 64;
/// Mark bit that turns on `MatchMark::require_hanja`.
const MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
/// Mark bit that turns on `MatchMark::require_hangul`.
const MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;
/// Mask selecting the low 16 bits of a packed FST value, which hold the
/// reading length in bytes.
const VALUE_READING_LEN_MASK: u64 = 0xffff;
/// Right shift that brings the 8-bit mark field of a packed FST value down
/// to the low bits.
const VALUE_MARK_SHIFT: u64 = 16;
/// Right shift that leaves only the reading-table byte offset of a packed
/// FST value.
const VALUE_OFFSET_SHIFT: u64 = 24;
43
/// Dictionary backed by a Gukhanmun FST file.
///
/// The fixed header and CBOR metadata are decoded eagerly.  The FST map bytes
/// and reading table share one backing byte source: owned heap bytes for
/// [`FstDictionary::open`] and [`FstDictionary::from_bytes`], or static bytes
/// for [`FstDictionary::from_static_bytes`].
#[derive(Clone, Debug)]
pub struct FstDictionary {
    // String key/value pairs decoded from the CBOR metadata section.
    metadata: BTreeMap<String, String>,
    // FST map from key bytes to packed values (reading length, mark bits and
    // reading-table offset).
    map: Map<ByteSection>,
    // Reading table; packed values address it by byte offset and length.
    readings: ByteSection,
    // The `entry_count` metadata value, or the FST map length when that
    // value is missing or does not parse.
    entry_count: u64,
    // The `max_word_chars` metadata value, or the longest UTF-8 key (counted
    // in chars) found in the map when that value is missing or does not parse.
    max_word_chars: Option<usize>,
}
58
59impl FstDictionary {
60    /// Opens a dictionary file from disk.
61    pub fn open(path: impl AsRef<Path>) -> Result<Self, Error> {
62        let path = path.as_ref();
63        tracing::info!(path = %path.display(), "opening FST dictionary");
64        let bytes = fs::read(path).map_err(|source| Error::Io {
65            path: path.display().to_string(),
66            source,
67        })?;
68        Self::from_bytes(&bytes)
69    }
70
71    /// Decodes a dictionary from bytes in the Gukhanmun FST file format.
72    pub fn from_bytes(bytes: &[u8]) -> Result<Self, Error> {
73        Self::from_source(ByteSource::Owned(Arc::<[u8]>::from(bytes)))
74    }
75
76    /// Decodes a dictionary from static bytes in the Gukhanmun FST file format.
77    ///
78    /// This is intended for embedded dictionaries built with `include_bytes!`.
79    /// The FST map and reading table borrow from the static byte slice without
80    /// copying either section.
81    pub fn from_static_bytes(bytes: &'static [u8]) -> Result<Self, Error> {
82        Self::from_source(ByteSource::Static(bytes))
83    }
84
85    fn from_source(source: ByteSource) -> Result<Self, Error> {
86        let bytes = source.as_ref();
87        let header = FixedHeader::parse(bytes)?;
88        let metadata_bytes = checked_slice(bytes, header.metadata_offset, header.metadata_len)
89            .ok_or(Error::SectionOutOfBounds {
90                section: "metadata",
91            })?;
92        let metadata = from_reader::<BTreeMap<String, String>, _>(metadata_bytes)
93            .map_err(|source| Error::MetadataDecode { source })?;
94        let fst_bytes = source
95            .section(header.fst_offset, header.fst_len)
96            .ok_or(Error::SectionOutOfBounds { section: "FST" })?;
97        let readings = source
98            .section(header.readings_offset, header.readings_len)
99            .ok_or(Error::SectionOutOfBounds {
100                section: "readings",
101            })?;
102        let map = Map::new(fst_bytes).map_err(|source| Error::FstDecode { source })?;
103        let entry_count = parse_u64_metadata(&metadata, "entry_count")
104            .unwrap_or_else(|| u64::try_from(map.len()).unwrap_or(u64::MAX));
105        let max_word_chars = parse_usize_metadata(&metadata, "max_word_chars")
106            .or_else(|| max_key_chars_from_map(&map));
107
108        tracing::debug!(
109            byte_length = bytes.len(),
110            format_version = FORMAT_VERSION,
111            entry_count,
112            ?max_word_chars,
113            "decoded FST dictionary"
114        );
115        Ok(Self {
116            metadata,
117            map,
118            readings,
119            entry_count,
120            max_word_chars,
121        })
122    }
123
124    /// Returns build metadata embedded in the dictionary file.
125    pub fn metadata(&self) -> &BTreeMap<String, String> {
126        &self.metadata
127    }
128
129    /// Returns the number of entries recorded at build time.
130    pub fn entry_count(&self) -> u64 {
131        self.entry_count
132    }
133
134    /// Returns the exact dictionary entry for `hanja`, if present.
135    pub fn lookup(&self, hanja: &str) -> Result<Option<LookupEntry>, Error> {
136        let Some(encoded) = self.map.get(hanja.as_bytes()) else {
137            return Ok(None);
138        };
139        self.decode_entry(encoded).map(Some)
140    }
141
142    fn decode_entry(&self, encoded: u64) -> Result<LookupEntry, Error> {
143        let (reading_len, mark, reading_offset) = decode_value(encoded);
144        let reading_start =
145            usize::try_from(reading_offset).map_err(|_| Error::ValueOutOfRange {
146                field: "reading offset",
147            })?;
148        let reading_end =
149            reading_start
150                .checked_add(usize::from(reading_len))
151                .ok_or(Error::ValueOverflow {
152                    field: "reading range",
153                })?;
154        let reading_bytes = self
155            .readings
156            .as_ref()
157            .get(reading_start..reading_end)
158            .ok_or(Error::SectionOutOfBounds {
159                section: "reading table entry",
160            })?;
161        let reading = std::str::from_utf8(reading_bytes)
162            .map_err(|source| Error::InvalidUtf8 {
163                field: "reading",
164                source,
165            })?
166            .to_owned();
167
168        Ok(LookupEntry { reading, mark })
169    }
170}
171
/// Backing storage for the bytes of a dictionary file.
#[derive(Clone, Debug)]
enum ByteSource {
    /// Reference-counted heap bytes.
    Owned(Arc<[u8]>),
    /// Bytes borrowed for the `'static` lifetime.
    Static(&'static [u8]),
}

impl ByteSource {
    /// Returns a handle to the `len` bytes starting at `offset`.
    ///
    /// Yields `None` when either number does not fit in `usize`, when their
    /// sum overflows, or when the range ends past the end of the source.  The
    /// returned section holds its own clone of this source.
    fn section(&self, offset: u64, len: u64) -> Option<ByteSection> {
        let start = usize::try_from(offset).ok()?;
        let length = usize::try_from(len).ok()?;
        let stop = start.checked_add(length)?;
        if stop > self.as_ref().len() {
            return None;
        }
        Some(ByteSection {
            source: self.clone(),
            range: start..stop,
        })
    }
}

impl AsRef<[u8]> for ByteSource {
    /// Exposes the full byte contents regardless of how they are stored.
    fn as_ref(&self) -> &[u8] {
        match *self {
            ByteSource::Owned(ref shared) => &shared[..],
            ByteSource::Static(borrowed) => borrowed,
        }
    }
}

/// A sub-range of a [`ByteSource`], paired with the source it points into.
#[derive(Clone, Debug)]
struct ByteSection {
    source: ByteSource,
    range: Range<usize>,
}

impl AsRef<[u8]> for ByteSection {
    /// Exposes only the bytes covered by this section's range.
    fn as_ref(&self) -> &[u8] {
        let (start, end) = (self.range.start, self.range.end);
        &self.source.as_ref()[start..end]
    }
}
210
211impl HanjaDictionary for FstDictionary {
212    fn matches_at<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Match> + 'a> {
213        let mut stream = self
214            .map
215            .search(KeyIsPrefixOf::new(s.as_bytes()))
216            .into_stream();
217        let mut matches = Vec::new();
218        while let Some((key, encoded)) = stream.next() {
219            match self.decode_entry(encoded) {
220                Ok(entry) => {
221                    matches.push(Match {
222                        byte_len: key.len(),
223                        reading: entry.reading,
224                        suffix_reading: None,
225                        mark: entry.mark,
226                    });
227                }
228                Err(error) => {
229                    if let Ok(key_str) = std::str::from_utf8(key) {
230                        tracing::warn!(key = key_str, error = ?error, "skipping FST entry with undecodable value");
231                    } else {
232                        tracing::warn!(key_len = key.len(), error = ?error, "skipping FST entry with non-UTF-8 key and undecodable value");
233                    }
234                }
235            }
236        }
237        matches.sort_by_key(|matched| matched.byte_len);
238        Box::new(matches.into_iter())
239    }
240
241    fn max_word_chars(&self) -> Option<usize> {
242        self.max_word_chars
243    }
244
245    fn entries<'a>(&'a self) -> Option<Box<dyn Iterator<Item = DictionaryRecord> + 'a>> {
246        let mut stream = self.map.stream();
247        let mut records = Vec::new();
248        while let Some((key, encoded)) = stream.next() {
249            let Ok(hanja) = std::str::from_utf8(key) else {
250                continue;
251            };
252            if let Ok(entry) = self.decode_entry(encoded) {
253                records.push(DictionaryRecord {
254                    hanja: hanja.to_owned(),
255                    reading: entry.reading,
256                    mark: entry.mark,
257                });
258            }
259        }
260        Some(Box::new(records.into_iter()))
261    }
262
263    fn has_homophone(&self, hanja: &str, reading: &str) -> bool {
264        let mut stream = self.map.stream();
265        while let Some((key, encoded)) = stream.next() {
266            if key == hanja.as_bytes() {
267                continue;
268            }
269            if self
270                .decode_entry(encoded)
271                .is_ok_and(|entry| entry.reading == reading)
272            {
273                return true;
274            }
275        }
276        false
277    }
278}
279
280/// A decoded exact-match dictionary entry.
281#[derive(Clone, Debug, Eq, PartialEq)]
282pub struct LookupEntry {
283    reading: String,
284    mark: MatchMark,
285}
286
287impl LookupEntry {
288    /// Returns the hangul reading for the entry.
289    pub fn reading(&self) -> &str {
290        &self.reading
291    }
292
293    /// Returns dictionary-provided rendering constraints.
294    pub fn mark(&self) -> MatchMark {
295        self.mark
296    }
297}
298
/// Error returned while opening or decoding an FST dictionary.
///
/// Variants that wrap another error expose it through
/// [`std::error::Error::source`].
#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum Error {
    /// Reading a dictionary file from disk failed.
    #[error("failed to read {path}: {source}")]
    Io {
        /// Path that failed to open or read.
        path: String,
        /// Underlying I/O error.
        #[source]
        source: std::io::Error,
    },

    /// The input is shorter than the fixed FST header.
    #[error("dictionary file is shorter than the fixed header: {actual} bytes")]
    ShortHeader {
        /// Number of bytes supplied by the caller.
        actual: usize,
    },

    /// The fixed header magic bytes do not identify a Gukhanmun FST file.
    #[error("invalid dictionary magic")]
    InvalidMagic,

    /// The file format version is not supported by this crate.
    #[error("unsupported dictionary version {version}")]
    UnsupportedVersion {
        /// Version read from the fixed header.
        version: u32,
    },

    /// The fixed header length field is not supported.
    ///
    /// The header's length field must equal the fixed header size this crate
    /// expects.
    #[error("unsupported dictionary header length {header_len}")]
    UnsupportedHeaderLength {
        /// Header length read from the fixed header.
        header_len: u32,
    },

    /// Reading the fixed header failed.
    ///
    /// Reported when one of the 64-bit section fields cannot be read in full
    /// from the header bytes.
    #[error("failed to read dictionary header: {source}")]
    HeaderRead {
        /// Underlying read error.
        #[source]
        source: std::io::Error,
    },

    /// A section range from the header points outside the file.
    ///
    /// Also reported when a packed dictionary value addresses bytes outside
    /// the reading table.
    #[error("{section} range is outside the file")]
    SectionOutOfBounds {
        /// Name of the section that was out of bounds.
        section: &'static str,
    },

    /// CBOR metadata could not be decoded.
    #[error("failed to decode dictionary metadata: {source}")]
    MetadataDecode {
        /// Underlying CBOR decode error.
        #[source]
        source: ciborium::de::Error<std::io::Error>,
    },

    /// The embedded FST map could not be decoded.
    #[error("failed to decode FST map: {source}")]
    FstDecode {
        /// Underlying FST decode error.
        #[source]
        source: fst::Error,
    },

    /// A packed FST value did not fit the host representation.
    ///
    /// Reported when a reading offset cannot be represented as `usize`.
    #[error("{field} is too large")]
    ValueOutOfRange {
        /// Field that exceeded its valid range.
        field: &'static str,
    },

    /// A packed FST value overflowed while computing a range.
    ///
    /// Reported when the reading offset plus the reading length overflows
    /// `usize`.
    #[error("{field} overflow")]
    ValueOverflow {
        /// Field that overflowed.
        field: &'static str,
    },

    /// A UTF-8 string field was invalid.
    #[error("{field} contains invalid UTF-8: {source}")]
    InvalidUtf8 {
        /// Field that contained invalid UTF-8.
        field: &'static str,
        /// Underlying UTF-8 error.
        #[source]
        source: std::str::Utf8Error,
    },
}
393
394#[derive(Clone, Copy, Debug, Eq, PartialEq)]
395struct FixedHeader {
396    metadata_offset: u64,
397    metadata_len: u64,
398    fst_offset: u64,
399    fst_len: u64,
400    readings_offset: u64,
401    readings_len: u64,
402}
403
404impl FixedHeader {
405    fn parse(bytes: &[u8]) -> Result<Self, Error> {
406        if bytes.len() < FIXED_HEADER_LEN {
407            return Err(Error::ShortHeader {
408                actual: bytes.len(),
409            });
410        }
411        if &bytes[..8] != MAGIC {
412            return Err(Error::InvalidMagic);
413        }
414        let version = read_u32(&bytes[8..12]);
415        if version != FORMAT_VERSION {
416            tracing::error!(
417                version,
418                expected = FORMAT_VERSION,
419                "unsupported FST format version"
420            );
421            return Err(Error::UnsupportedVersion { version });
422        }
423        let header_len = read_u32(&bytes[12..16]);
424        if header_len != FIXED_HEADER_LEN as u32 {
425            return Err(Error::UnsupportedHeaderLength { header_len });
426        }
427        let mut cursor = Cursor::new(&bytes[16..FIXED_HEADER_LEN]);
428        Ok(Self {
429            metadata_offset: read_next_u64(&mut cursor)?,
430            metadata_len: read_next_u64(&mut cursor)?,
431            fst_offset: read_next_u64(&mut cursor)?,
432            fst_len: read_next_u64(&mut cursor)?,
433            readings_offset: read_next_u64(&mut cursor)?,
434            readings_len: read_next_u64(&mut cursor)?,
435        })
436    }
437}
438
439#[derive(Clone, Copy, Debug)]
440struct KeyIsPrefixOf<'a> {
441    bytes: &'a [u8],
442}
443
444impl<'a> KeyIsPrefixOf<'a> {
445    fn new(bytes: &'a [u8]) -> Self {
446        Self { bytes }
447    }
448}
449
450impl Automaton for KeyIsPrefixOf<'_> {
451    type State = Option<usize>;
452
453    fn start(&self) -> Self::State {
454        Some(0)
455    }
456
457    fn is_match(&self, state: &Self::State) -> bool {
458        state.is_some()
459    }
460
461    fn can_match(&self, state: &Self::State) -> bool {
462        state.is_some()
463    }
464
465    fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
466        let position = (*state)?;
467        if self.bytes.get(position).copied() == Some(byte) {
468            Some(position + 1)
469        } else {
470            None
471        }
472    }
473}
474
475fn decode_value(value: u64) -> (u16, MatchMark, u64) {
476    let reading_len = (value & VALUE_READING_LEN_MASK) as u16;
477    let mark = decode_mark(((value >> VALUE_MARK_SHIFT) & 0xff) as u8);
478    let reading_offset = value >> VALUE_OFFSET_SHIFT;
479    (reading_len, mark, reading_offset)
480}
481
482fn decode_mark(encoded: u8) -> MatchMark {
483    MatchMark {
484        require_hanja: encoded & MARK_REQUIRE_HANJA != 0,
485        require_hangul: encoded & MARK_REQUIRE_HANGUL != 0,
486    }
487}
488
/// Looks up `key` in `metadata` and parses its value as a `u64`.
///
/// Returns `None` when the key is absent or the value is not a valid `u64`.
fn parse_u64_metadata(metadata: &BTreeMap<String, String>, key: &str) -> Option<u64> {
    let raw = metadata.get(key)?;
    raw.parse::<u64>().ok()
}
492
/// Looks up `key` in `metadata` and parses its value as a `usize`.
///
/// Returns `None` when the key is absent or the value is not a valid `usize`.
fn parse_usize_metadata(metadata: &BTreeMap<String, String>, key: &str) -> Option<usize> {
    match metadata.get(key) {
        Some(raw) => raw.parse::<usize>().ok(),
        None => None,
    }
}
496
497fn max_key_chars_from_map<D>(map: &Map<D>) -> Option<usize>
498where
499    D: AsRef<[u8]>,
500{
501    let mut stream = map.keys();
502    let mut max = None;
503    while let Some(key) = stream.next() {
504        let Ok(key) = std::str::from_utf8(key) else {
505            continue;
506        };
507        let chars = key.chars().count();
508        max = Some(max.map_or(chars, |current: usize| current.max(chars)));
509    }
510    max
511}
512
/// Decodes a little-endian `u32` from a four-byte slice.
///
/// # Panics
///
/// Panics when `bytes` is not exactly four bytes long.
fn read_u32(bytes: &[u8]) -> u32 {
    let array = <[u8; 4]>::try_from(bytes).expect("slice has exactly four bytes");
    u32::from_le_bytes(array)
}
516
517fn read_next_u64(cursor: &mut Cursor<&[u8]>) -> Result<u64, Error> {
518    let mut bytes = [0; 8];
519    cursor
520        .read_exact(&mut bytes)
521        .map_err(|source| Error::HeaderRead { source })?;
522    Ok(u64::from_le_bytes(bytes))
523}
524
/// Returns the `len` bytes of `bytes` starting at `offset`.
///
/// Yields `None` when either number does not fit in `usize`, when their sum
/// overflows, or when the range reaches past the end of `bytes`.
fn checked_slice(bytes: &[u8], offset: u64, len: u64) -> Option<&[u8]> {
    let start = usize::try_from(offset).ok()?;
    let length = usize::try_from(len).ok()?;
    let stop = start.checked_add(length)?;
    bytes.get(start..stop)
}
530
// Unit and property tests.  Fixtures are assembled byte by byte in
// `fixture_bytes`, so the tests exercise the decoder against a hand-written
// encoder rather than against the crate's own constants.
#[cfg(test)]
mod tests {
    use std::collections::BTreeMap;
    use std::fs;

    use ciborium::ser::into_writer;
    use fst::MapBuilder;
    use gukhanmun_core::{MapDictionary, RenderMode, convert_plain_text};
    use proptest::prelude::*;
    use tempfile::tempdir;
    use tracing_test::traced_test;

    use super::{FstDictionary, HanjaDictionary, MatchMark};

    // Local copies of the format constants used by the fixture encoder.
    const MAGIC: &[u8; 8] = b"GUKHMFST";
    const FORMAT_VERSION: u32 = 1;
    const FIXED_HEADER_LEN: usize = 64;
    const MARK_REQUIRE_HANJA: u8 = 0b0000_0001;
    const MARK_REQUIRE_HANGUL: u8 = 0b0000_0010;
    const VALUE_MARK_SHIFT: u64 = 16;
    const VALUE_OFFSET_SHIFT: u64 = 24;

    // A well-formed fixture exposes its metadata, entry count, maximum key
    // length and per-entry readings and marks.
    #[test]
    fn loads_valid_bytes_metadata_and_lookup() {
        let bytes = fixture_bytes(&[
            entry("天地", "천지", false, false),
            entry("漢字", "한자", true, false),
            entry("色깔論", "색깔론", false, true),
        ]);

        let dictionary = FstDictionary::from_bytes(&bytes).unwrap();

        assert_eq!(dictionary.entry_count(), 3);
        assert_eq!(dictionary.metadata().get("source").unwrap(), "fixture");
        assert_eq!(dictionary.max_word_chars(), Some(3));
        let hanja = dictionary.lookup("漢字").unwrap().unwrap();
        assert_eq!(hanja.reading(), "한자");
        assert!(hanja.mark().require_hanja);
        assert!(!hanja.mark().require_hangul);
        let mixed = dictionary.lookup("色깔論").unwrap().unwrap();
        assert_eq!(mixed.reading(), "색깔론");
        assert!(!mixed.mark().require_hanja);
        assert!(mixed.mark().require_hangul);
    }

    // `open` reads the same format from a file on disk.
    #[test]
    fn open_reads_a_dictionary_file() {
        let temp = tempdir().unwrap();
        let path = temp.path().join("dict.gukfst");
        fs::write(&path, fixture_bytes(&[entry("天地", "천지", false, false)])).unwrap();

        let dictionary = FstDictionary::open(&path).unwrap();

        assert_eq!(
            dictionary.lookup("天地").unwrap().unwrap().reading(),
            "천지"
        );
    }

    // Loading from leaked (`'static`) bytes must behave exactly like loading
    // from an owned copy of the same bytes.
    #[test]
    fn from_static_bytes_matches_owned_loading() {
        let bytes = fixture_bytes(&[
            entry("天地", "천지", false, false),
            entry("漢字", "한자", true, false),
            entry("色깔論", "색깔론", false, true),
        ]);
        let static_bytes = Box::leak(bytes.clone().into_boxed_slice());
        let owned = FstDictionary::from_bytes(&bytes).unwrap();
        let static_dict = FstDictionary::from_static_bytes(static_bytes).unwrap();

        assert_equivalent_dictionaries(&owned, &static_dict);
    }

    // Compile-time check: the dictionary can be shared across threads.
    #[test]
    fn dictionary_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}

        assert_send_sync::<FstDictionary>();
    }

    // An unsupported version is both returned as an error and logged.
    #[traced_test]
    #[test]
    fn unsupported_version_emits_error_event() {
        let valid = fixture_bytes(&[entry("天地", "천지", false, false)]);
        let mut bad_version = valid.clone();
        // Bytes 8..12 of the header hold the format version.
        bad_version[8..12].copy_from_slice(&999u32.to_le_bytes());

        let result = FstDictionary::from_bytes(&bad_version);

        assert!(matches!(
            result.unwrap_err(),
            super::Error::UnsupportedVersion { version: 999 }
        ));
        assert!(logs_contain("unsupported FST format version"));
    }

    // Corrupt magic, an unknown version and a truncated file each map to
    // their own error variant.
    #[test]
    fn rejects_malformed_headers() {
        let valid = fixture_bytes(&[entry("天地", "천지", false, false)]);
        let mut bad_magic = valid.clone();
        bad_magic[0] = b'X';
        assert!(matches!(
            FstDictionary::from_bytes(&bad_magic).unwrap_err(),
            super::Error::InvalidMagic
        ));

        let mut bad_version = valid.clone();
        bad_version[8..12].copy_from_slice(&999u32.to_le_bytes());
        assert!(matches!(
            FstDictionary::from_bytes(&bad_version).unwrap_err(),
            super::Error::UnsupportedVersion { version: 999 }
        ));

        // The fixture writes the readings section last, so dropping the final
        // byte pushes only that section out of bounds.
        let truncated = &valid[..valid.len() - 1];
        assert!(matches!(
            FstDictionary::from_bytes(truncated).unwrap_err(),
            super::Error::SectionOutOfBounds {
                section: "readings"
            }
        ));
    }

    // Decode failures keep their structured variant and expose the
    // underlying error through `Error::source`.
    #[test]
    fn decode_errors_preserve_structured_variants_and_sources() {
        let mut invalid_metadata = fixture_bytes(&[entry("天地", "천지", false, false)]);
        // The fixture places the metadata section right after the header.
        let metadata_offset = FIXED_HEADER_LEN;
        invalid_metadata[metadata_offset] = 0xff;
        let metadata_error = FstDictionary::from_bytes(&invalid_metadata).unwrap_err();
        assert!(matches!(
            metadata_error,
            super::Error::MetadataDecode { .. }
        ));
        assert!(std::error::Error::source(&metadata_error).is_some());

        let mut invalid_reading = fixture_bytes(&[entry("天地", "천지", false, false)]);
        // The last byte of the file is the last byte of the only reading.
        *invalid_reading.last_mut().unwrap() = 0xff;
        // Readings are only validated on access, so loading still succeeds.
        let dictionary = FstDictionary::from_bytes(&invalid_reading).unwrap();
        let utf8_error = dictionary.lookup("天地").unwrap_err();
        assert!(matches!(
            utf8_error,
            super::Error::InvalidUtf8 {
                field: "reading",
                ..
            }
        ));
        assert!(std::error::Error::source(&utf8_error).is_some());
    }

    // Every key that is a prefix of the input is reported, shortest first;
    // keys that merely occur later in the input are not.
    #[test]
    fn matches_at_returns_every_prefix_match() {
        let dictionary = FstDictionary::from_bytes(&fixture_bytes(&[
            entry("行事", "행사", false, false),
            entry("行事場", "행사장", false, false),
            entry("場所", "장소", false, false),
        ]))
        .unwrap();

        let matches = dictionary.matches_at("行事場入口").collect::<Vec<_>>();

        assert_eq!(matches.len(), 2);
        assert_eq!(matches[0].byte_len, "行事".len());
        assert_eq!(matches[0].reading, "행사");
        assert_eq!(matches[1].byte_len, "行事場".len());
        assert_eq!(matches[1].reading, "행사장");
    }

    // A homophone is a different key with the same reading; an entry is not
    // its own homophone.
    #[test]
    fn has_homophone_detects_other_forms_with_same_reading() {
        let dictionary = FstDictionary::from_bytes(&fixture_bytes(&[
            entry("漢字", "한자", false, false),
            entry("翰字", "한자", false, false),
            entry("天地", "천지", false, false),
        ]))
        .unwrap();

        assert!(dictionary.has_homophone("漢字", "한자"));
        assert!(!dictionary.has_homophone("天地", "천지"));
    }

    // End-to-end conversions through `convert_plain_text` using the FST
    // backend as the dictionary.
    #[test]
    fn lattice_regressions_pass_with_fst_backend() {
        let dictionary = FstDictionary::from_bytes(&fixture_bytes(&[
            entry("行事", "행사", false, false),
            entry("行事場", "행사장", false, false),
            entry("場所", "장소", false, false),
            entry("入口", "입구", false, false),
            entry("汽車길", "기찻길", false, false),
        ]))
        .unwrap();

        assert_eq!(
            convert_plain_text("行事場入口", &dictionary, RenderMode::HangulHanjaParens),
            "행사장(行事場)입구(入口)"
        );
        assert_eq!(
            convert_plain_text("行事場所", &dictionary, RenderMode::HangulHanjaParens),
            "행사(行事)장소(場所)"
        );
        assert_eq!(
            convert_plain_text("汽車길", &dictionary, RenderMode::HangulHanjaParens),
            "기찻길(汽車길)"
        );
    }

    proptest! {
        // The FST backend must agree with `MapDictionary` on generated data.
        #[test]
        fn generated_fst_matches_map_dictionary(entries in unique_entries()) {
            let bytes = fixture_bytes(
                &entries
                    .iter()
                    .map(|(hanja, reading, require_hanja, require_hangul)| {
                        entry(hanja, reading, *require_hanja, *require_hangul)
                    })
                    .collect::<Vec<_>>()
            );
            let fst = FstDictionary::from_bytes(&bytes).unwrap();
            let mut map = MapDictionary::new();
            // NOTE(review): the map is filled one entry at a time while the
            // FST already holds every entry.  The comparison presumably stays
            // fair because `unique_entries` yields keys in sorted order, so
            // any key that is a prefix of `hanja` was inserted earlier —
            // confirm before changing the generator.
            for (hanja, reading, require_hanja, require_hangul) in entries {
                map.insert_marked(
                    &hanja,
                    &reading,
                    MatchMark {
                        require_hanja,
                        require_hangul,
                    },
                );
                let fst_matches = fst.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
                let map_matches = map.matches_at(&format!("{hanja}뒤")).collect::<Vec<_>>();
                prop_assert_eq!(fst_matches, map_matches);
                let lookup = fst.lookup(&hanja).unwrap().unwrap();
                prop_assert_eq!(lookup.reading(), reading.as_str());
            }
        }
    }

    // One dictionary entry to be written into a fixture.
    #[derive(Clone, Copy, Debug)]
    struct TestEntry<'a> {
        hanja: &'a str,
        reading: &'a str,
        mark: MatchMark,
    }

    // Shorthand constructor for `TestEntry`.
    fn entry<'a>(
        hanja: &'a str,
        reading: &'a str,
        require_hanja: bool,
        require_hangul: bool,
    ) -> TestEntry<'a> {
        TestEntry {
            hanja,
            reading,
            mark: MatchMark {
                require_hanja,
                require_hangul,
            },
        }
    }

    // Encodes `entries` as a complete dictionary file: fixed header, CBOR
    // metadata, FST map and reading table, in that order.
    fn fixture_bytes(entries: &[TestEntry<'_>]) -> Vec<u8> {
        let mut metadata = BTreeMap::new();
        metadata.insert("source".to_owned(), "fixture".to_owned());
        metadata.insert("license".to_owned(), "CC0-1.0".to_owned());
        metadata.insert("build_date".to_owned(), "1970-01-01T00:00:00Z".to_owned());
        metadata.insert("entry_count".to_owned(), entries.len().to_string());
        metadata.insert("version".to_owned(), FORMAT_VERSION.to_string());
        metadata.insert(
            "max_word_chars".to_owned(),
            entries
                .iter()
                .map(|entry| entry.hanja.chars().count())
                .max()
                .unwrap_or(0)
                .to_string(),
        );
        metadata.insert(
            "max_key_bytes".to_owned(),
            entries
                .iter()
                .map(|entry| entry.hanja.len())
                .max()
                .unwrap_or(0)
                .to_string(),
        );
        let mut metadata_bytes = Vec::new();
        into_writer(&metadata, &mut metadata_bytes).unwrap();

        let mut readings = Vec::new();
        let mut builder = MapBuilder::memory();
        // Keys are sorted before insertion into the map builder.
        let mut sorted = entries.to_vec();
        sorted.sort_by(|left, right| left.hanja.cmp(right.hanja));
        for entry in sorted {
            let reading_offset = readings.len() as u64;
            // Packed value: reading length in the low bits, then the mark
            // byte, then the offset into the reading table.
            let value = (entry.reading.len() as u64)
                | (u64::from(encode_mark(entry.mark)) << VALUE_MARK_SHIFT)
                | (reading_offset << VALUE_OFFSET_SHIFT);
            builder.insert(entry.hanja.as_bytes(), value).unwrap();
            readings.extend_from_slice(entry.reading.as_bytes());
        }
        let fst_bytes = builder.into_inner().unwrap();

        // Sections follow the header back to back.
        let metadata_offset = FIXED_HEADER_LEN as u64;
        let fst_offset = metadata_offset + metadata_bytes.len() as u64;
        let readings_offset = fst_offset + fst_bytes.len() as u64;
        let mut output = Vec::new();
        output.extend_from_slice(MAGIC);
        output.extend_from_slice(&FORMAT_VERSION.to_le_bytes());
        output.extend_from_slice(&(FIXED_HEADER_LEN as u32).to_le_bytes());
        output.extend_from_slice(&metadata_offset.to_le_bytes());
        output.extend_from_slice(&(metadata_bytes.len() as u64).to_le_bytes());
        output.extend_from_slice(&fst_offset.to_le_bytes());
        output.extend_from_slice(&(fst_bytes.len() as u64).to_le_bytes());
        output.extend_from_slice(&readings_offset.to_le_bytes());
        output.extend_from_slice(&(readings.len() as u64).to_le_bytes());
        output.extend(metadata_bytes);
        output.extend(fst_bytes);
        output.extend(readings);
        output
    }

    // Packs a `MatchMark` into the mark byte stored in a fixture value.
    fn encode_mark(mark: MatchMark) -> u8 {
        let mut encoded = 0;
        if mark.require_hanja {
            encoded |= MARK_REQUIRE_HANJA;
        }
        if mark.require_hangul {
            encoded |= MARK_REQUIRE_HANGUL;
        }
        encoded
    }

    // Asserts that two dictionaries agree on every observable query used by
    // these tests.
    fn assert_equivalent_dictionaries(left: &FstDictionary, right: &FstDictionary) {
        assert_eq!(left.metadata(), right.metadata());
        assert_eq!(left.entry_count(), right.entry_count());
        assert_eq!(left.max_word_chars(), right.max_word_chars());
        for key in ["天地", "漢字", "色깔論"] {
            assert_eq!(left.lookup(key).unwrap(), right.lookup(key).unwrap());
        }
        assert_eq!(
            left.matches_at("色깔論이다").collect::<Vec<_>>(),
            right.matches_at("色깔論이다").collect::<Vec<_>>()
        );
        assert_eq!(
            left.entries().unwrap().collect::<Vec<_>>(),
            right.entries().unwrap().collect::<Vec<_>>()
        );
        assert_eq!(
            left.has_homophone("漢字", "한자"),
            right.has_homophone("漢字", "한자")
        );
    }

    // Generates 1 to 15 entries with distinct hanja keys of one to three
    // characters, a hangul reading of one to four characters, and random
    // marks.
    fn unique_entries() -> impl Strategy<Value = Vec<(String, String, bool, bool)>> {
        proptest::collection::btree_map(
            "[一-龥]{1,3}",
            ("[가-힣]{1,4}", any::<bool>(), any::<bool>()),
            1..16,
        )
        .prop_map(|entries| {
            entries
                .into_iter()
                .map(|(hanja, (reading, require_hanja, require_hangul))| {
                    (hanja, reading, require_hanja, require_hangul)
                })
                .collect()
        })
    }
}