Skip to main content

moine_zh/
lib.rs

1//! Chinese pinyin and CC-CEDICT adapters for `moine`.
2//!
3//! The current adapter indexes simplified and traditional written Chinese forms
4//! with Mandarin pinyin readings from CC-CEDICT. The default public artifact
5//! view is no-tone pinyin; `tone3` is an explicit tone-aware artifact view.
6//! Cantonese, Jyutping, and non-Mandarin readings are outside this crate's
7//! current scope.
8//!
//! Dictionary artifacts are external input. At trust boundaries, prefer the
//! `try_*` lookup and expansion APIs: they report indexed-payload decode errors
//! as [`ZhArtifactPayloadError`], whereas the backward-compatible convenience
//! APIs collapse those errors into empty lookup results.
13//!
14//! ```
15//! use moine_zh::{
16//!     compare_with_zh_index, PinyinReadingOptions, ZhReadingIndex, ZhReadingIndexPayload,
17//!     ZhReadingIndexPayloadEntry,
18//! };
19//!
20//! let payload = ZhReadingIndexPayload {
21//!     schema_version: 1,
22//!     payload_type: "moine.zh.reading-index.surface-readings".to_string(),
23//!     pinyin_view: "no-tone".to_string(),
24//!     entries: vec![ZhReadingIndexPayloadEntry {
25//!         surface: "威士忌".to_string(),
26//!         readings: vec!["weishiji".to_string()],
27//!     }],
28//! };
29//! let index = ZhReadingIndex::from_artifact_payload(payload).unwrap();
30//!
31//! assert_eq!(
32//!     compare_with_zh_index("weishiji", "威士忌", &index, PinyinReadingOptions::default())
33//!         .unwrap()
34//!         .lattice,
35//!     0,
36//! );
37//! ```
38//!
39#![deny(missing_docs)]
40
41use std::borrow::Cow;
42use std::collections::{btree_map::Entry, BTreeMap, BTreeSet, HashMap};
43use std::error::Error;
44use std::fmt;
45use std::fmt::Write as _;
46use std::fs::File;
47use std::io::{BufRead, BufReader, Read, Write};
48use std::path::Path;
49use std::string::FromUtf8Error;
50use std::sync::Arc;
51
52use fst::{Map, MapBuilder, Streamer};
53use memmap2::Mmap;
54use moine_core::{
55    damerau_distance, damerau_levenshtein_str, distance, levenshtein_str,
56    normalized_similarity_str, Lattice,
57};
58use serde::{Deserialize, Serialize};
59use sha2::{Digest, Sha256};
60
// --- YAML payload identification -------------------------------------------

// Schema version expected in the `schema_version` field of a YAML payload.
const ARTIFACT_PAYLOAD_SCHEMA_VERSION: u32 = 1;
// Type tag expected in the `payload_type` field of a YAML payload.
const ARTIFACT_PAYLOAD_TYPE: &str = "moine.zh.reading-index.surface-readings";

// --- Indexed (binary) payload layout ---------------------------------------

// Eight magic bytes identifying an indexed Chinese payload.
const INDEXED_ARTIFACT_MAGIC: &[u8; 8] = b"MOINEZ01";
// Indexed payload format version.
const INDEXED_ARTIFACT_VERSION: u32 = 1;
// Size of the indexed payload header (presumably in bytes; confirm against the
// header reader/writer, which is not part of this section).
const INDEXED_ARTIFACT_HEADER_LEN: usize = 40;

// --- Safety limits applied to external artifacts ---------------------------

// Largest accepted payload size in bytes (512 MiB).
const MAX_ARTIFACT_PAYLOAD_BYTES: u64 = 512 * 1024 * 1024;
// Largest accepted number of surface entries in one payload.
const MAX_ARTIFACT_ENTRIES: usize = 2_000_000;
// Largest accepted number of readings for a single surface entry.
const MAX_ARTIFACT_READINGS_PER_ENTRY: usize = 256;
// Largest accepted byte length of a single surface or reading string (16 KiB).
const MAX_ARTIFACT_STRING_BYTES: usize = 16 * 1024;

/// Canonical checksum algorithm currently used for normalized Chinese payload
/// content.
pub const ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM: &str = "sha256-canonical-v1";
/// Digest algorithm applied to the payload file bytes to verify them before
/// loading.
pub const ARTIFACT_PAYLOAD_FILE_DIGEST_ALGORITHM: &str = "sha256-file-v1";
74
/// Selects how pinyin readings are written in a Chinese reading index.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum PinyinView {
    /// Toneless pinyin: neither tone marks nor tone numbers are kept.
    #[default]
    NoTone,
    /// Numbered-tone pinyin, for example `zhong1`.
    Tone3,
}
84
85/// Options used while building a CC-CEDICT reading index.
86#[derive(Clone, Copy, Debug, Eq, PartialEq)]
87pub struct CedictIndexOptions {
88    /// Pinyin representation to store.
89    pub pinyin_view: PinyinView,
90    /// Optional cap on readings stored for each surface form.
91    pub max_readings_per_surface: Option<usize>,
92}
93
/// Tuning knobs for expanding Chinese text into dictionary reading paths.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PinyinReadingOptions {
    /// Longest surface span, in characters, tried as a single dictionary
    /// segment.
    pub max_span_chars: usize,
    /// Upper bound on the number of complete reading paths retained.
    pub max_paths: usize,
    /// When several dictionary spans begin at the same position, prefer the
    /// longest one.
    pub longest_match_only: bool,
    /// Cap on the readings used for each dictionary segment, if any.
    pub max_readings_per_segment: Option<usize>,
}
106
/// A single surface segment paired with the pinyin reading chosen for it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PinyinReadingSegment {
    /// The surface text this segment covers.
    pub surface: String,
    /// The pinyin reading selected for that surface text.
    pub reading: String,
}
115
116/// One complete segmentation and joined pinyin reading for an input string.
117#[derive(Clone, Debug, Eq, PartialEq)]
118pub struct PinyinReadingPath {
119    /// Ordered dictionary/direct segments in the path.
120    pub segments: Vec<PinyinReadingSegment>,
121    /// Segment readings concatenated into one pinyin string.
122    pub joined_reading: String,
123}
124
125/// Reading-path expansion result plus pruning statistics.
126#[derive(Clone, Debug, Default, Eq, PartialEq)]
127pub struct PinyinReadingExpansion {
128    /// Expanded pinyin paths.
129    pub paths: Vec<PinyinReadingPath>,
130    /// Statistics gathered during expansion.
131    pub stats: PinyinReadingStats,
132}
133
/// Bookkeeping counters for one Chinese reading-path expansion.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct PinyinReadingStats {
    /// Number of dictionary spans matched while expanding.
    pub matched_spans: usize,
    /// Number of direct fallback spans taken because no dictionary span
    /// matched.
    pub direct_fallback_spans: usize,
    /// Number of candidate spans discarded by longest-match mode.
    pub longest_match_pruned_spans: usize,
    /// Number of raw readings observed before per-segment pruning.
    pub raw_segment_readings: usize,
    /// Number of readings kept after per-segment pruning.
    pub used_segment_readings: usize,
    /// Number of readings dropped by per-segment pruning.
    pub pruned_segment_readings: usize,
    /// Number of candidate path combinations considered.
    pub candidate_combinations: usize,
    /// Number of unique complete pinyin paths kept.
    pub unique_paths: usize,
    /// Number of duplicate joined readings removed.
    pub duplicate_joined_readings: usize,
    /// How many times the `max_paths` cap was reached.
    pub max_paths_hit_count: usize,
}
158
/// The set of distances produced by one Chinese comparison.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ChineseDistance {
    /// Plain Levenshtein distance between the original strings.
    pub surface_levenshtein: usize,
    /// Plain Damerau-Levenshtein distance between the original strings.
    pub surface_damerau: usize,
    /// Lattice Path Edit Distance computed over the pinyin reading lattices.
    pub lattice: usize,
    /// Lattice-aware Damerau-Levenshtein distance over the reading lattices.
    pub lattice_damerau: usize,
    /// The smaller of surface Damerau-Levenshtein and non-Damerau LPED.
    ///
    /// `lattice_damerau` is deliberately left out of this minimum; read that
    /// field directly when an adjacent transposition on the lattice side
    /// should count as a single edit.
    pub combined: usize,
}
177
178/// Public alias for the Chinese reading index type.
179pub type ZhReadingIndex = CedictReadingIndex;
180
181/// CC-CEDICT-derived surface-to-pinyin reading index.
182#[derive(Clone, Debug)]
183pub struct CedictReadingIndex {
184    storage: ZhReadingStorage,
185    pinyin_view: PinyinView,
186}
187
188#[derive(Clone, Debug)]
189enum ZhReadingStorage {
190    Eager(HashMap<String, Vec<String>>),
191    Indexed(IndexedZhPayload),
192}
193
194impl Default for CedictReadingIndex {
195    fn default() -> Self {
196        Self {
197            storage: ZhReadingStorage::Eager(HashMap::new()),
198            pinyin_view: PinyinView::default(),
199        }
200    }
201}
202
203impl PartialEq for CedictReadingIndex {
204    fn eq(&self, other: &Self) -> bool {
205        self.pinyin_view == other.pinyin_view && self.artifact_payload() == other.artifact_payload()
206    }
207}
208
209impl Eq for CedictReadingIndex {}
210
211#[derive(Clone, Debug)]
212struct IndexedZhPayload {
213    mmap: Arc<Mmap>,
214    map: Map<Vec<u8>>,
215    readings_start: usize,
216    entries: usize,
217}
218
219/// Header for indexed FST Chinese payloads.
220#[derive(Clone, Copy, Debug, Eq, PartialEq)]
221pub struct ZhIndexedArtifactPayloadHeader {
222    /// Indexed payload format version.
223    pub version: u32,
224    /// Pinyin representation stored in the payload.
225    pub pinyin_view: PinyinView,
226    /// Number of entries in the payload.
227    pub entries: usize,
228    /// Length of the embedded FST section in bytes.
229    pub fst_len: usize,
230    /// Length of the reading blob section in bytes.
231    pub readings_len: usize,
232}
233
234/// Metadata stored in a Chinese dictionary bundle.
235#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
236pub struct ZhArtifactMetadata {
237    /// Metadata schema version.
238    pub schema_version: u32,
239    /// Artifact type identifier.
240    pub artifact_type: String,
241    /// Human-readable artifact name.
242    pub artifact_name: String,
243    /// Tool or command that generated the artifact.
244    pub generator: String,
245    /// Payload file metadata.
246    pub payload: ZhArtifactPayload,
247    /// Source dictionary metadata.
248    pub source: ZhArtifactSource,
249    /// Build-time options and counts.
250    pub build: ZhArtifactBuild,
251    /// Default query options for this artifact.
252    pub query_defaults: ZhArtifactQueryDefaults,
253    /// License metadata and references.
254    pub license: ZhArtifactLicense,
255}
256
257/// Payload metadata stored in a Chinese dictionary bundle.
258#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
259pub struct ZhArtifactPayload {
260    /// Relative payload path inside the bundle.
261    pub path: String,
262    /// Payload format identifier.
263    pub format: String,
264    /// Optional digest algorithm for the payload file bytes.
265    #[serde(default, skip_serializing_if = "Option::is_none")]
266    pub file_digest_algorithm: Option<String>,
267    /// Optional digest of the payload file bytes.
268    #[serde(default, skip_serializing_if = "Option::is_none")]
269    pub file_digest: Option<String>,
270    /// Canonical payload checksum algorithm.
271    pub checksum_algorithm: String,
272    /// Canonical payload checksum.
273    pub checksum: String,
274}
275
276/// Source dictionary metadata for a Chinese artifact.
277#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
278pub struct ZhArtifactSource {
279    /// Source dictionary name.
280    pub name: String,
281    /// Source dictionary version or release date.
282    pub version: String,
283    /// Path or label for the CC-CEDICT source file.
284    pub cedict: String,
285}
286
287/// Build-time settings recorded in Chinese artifact metadata.
288#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
289pub struct ZhArtifactBuild {
290    /// Pinyin representation stored in the artifact.
291    pub pinyin_view: String,
292    /// Maximum readings retained for each surface form, if capped.
293    pub max_readings_per_surface: Option<usize>,
294    /// Number of surface entries in the payload.
295    pub entries: usize,
296}
297
298/// Default reading expansion options recorded in Chinese artifact metadata.
299#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)]
300pub struct ZhArtifactQueryDefaults {
301    /// Maximum surface span length considered for one dictionary segment.
302    pub max_span_chars: usize,
303    /// Maximum complete reading paths retained.
304    pub max_paths: usize,
305    /// Whether longest-match mode is enabled by default.
306    pub longest_match_only: bool,
307    /// Optional default cap on readings used per dictionary segment.
308    pub max_readings_per_segment: Option<usize>,
309}
310
311/// License metadata for a Chinese dictionary artifact.
312#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
313pub struct ZhArtifactLicense {
314    /// Selected license expression for the distributed artifact.
315    pub selected_license: String,
316    /// License files or notices bundled with the artifact.
317    pub references: Vec<ZhArtifactLicenseReference>,
318}
319
320/// One license reference stored in Chinese artifact metadata.
321#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
322pub struct ZhArtifactLicenseReference {
323    /// Human-readable license reference label.
324    pub label: String,
325    /// Relative path to the bundled license or notice file.
326    pub path: String,
327}
328
329/// Normalized Chinese reading-index payload.
330#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
331pub struct ZhReadingIndexPayload {
332    /// Payload schema version.
333    pub schema_version: u32,
334    /// Payload type identifier.
335    pub payload_type: String,
336    /// Pinyin representation used by all readings in the payload.
337    pub pinyin_view: String,
338    /// Surface-to-reading entries.
339    pub entries: Vec<ZhReadingIndexPayloadEntry>,
340}
341
342/// One surface form and its normalized pinyin readings.
343#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
344pub struct ZhReadingIndexPayloadEntry {
345    /// Simplified or traditional surface form.
346    pub surface: String,
347    /// Normalized pinyin readings for the surface form.
348    pub readings: Vec<String>,
349}
350
351/// Inputs used to build Chinese artifact metadata from an index.
352#[derive(Clone, Debug, Eq, PartialEq)]
353pub struct ZhArtifactMetadataOptions {
354    /// Human-readable artifact name.
355    pub artifact_name: String,
356    /// Tool or command that generated the artifact.
357    pub generator: String,
358    /// Payload file name recorded in metadata.
359    pub payload_file_name: String,
360    /// Payload format recorded in metadata.
361    pub payload_format: String,
362    /// Source dictionary name.
363    pub source_name: String,
364    /// Source dictionary version or release date.
365    pub source_version: String,
366    /// Path or label for the CC-CEDICT source file.
367    pub source_cedict: String,
368    /// Index build options to record.
369    pub index_options: CedictIndexOptions,
370    /// Default query options to record.
371    pub query_defaults: PinyinReadingOptions,
372    /// License metadata to record.
373    pub license: ZhArtifactLicense,
374}
375
/// Failures that can occur while parsing CC-CEDICT source text.
#[derive(Debug)]
pub enum CedictError {
    /// Reading from the filesystem or the supplied reader failed.
    Io(std::io::Error),
    /// A CC-CEDICT line that is not a comment could not be parsed.
    InvalidEntry {
        /// Line number in the input, counted from one.
        line: usize,
        /// Details of the parse failure.
        message: String,
    },
}
389
390/// Errors returned while loading or validating Chinese artifact payloads.
391#[derive(Debug)]
392pub enum ZhArtifactPayloadError {
393    /// Filesystem or reader access failed.
394    Io(std::io::Error),
395    /// YAML payload deserialization failed.
396    Yaml(serde_yaml::Error),
397    /// Indexed payload magic bytes do not match the Chinese artifact format.
398    InvalidIndexedMagic {
399        /// Magic bytes read from the payload.
400        magic: [u8; 8],
401    },
402    /// Indexed payload version is not supported.
403    UnsupportedIndexedVersion {
404        /// Version read from the payload header.
405        version: u32,
406    },
407    /// Indexed payload pinyin-view tag is not supported.
408    UnsupportedIndexedPinyinView {
409        /// Numeric pinyin-view tag from the payload header.
410        value: u32,
411    },
412    /// An indexed payload section length cannot fit in memory on this target.
413    IndexedSectionTooLarge {
414        /// Section field name.
415        field: &'static str,
416        /// Section length from the payload.
417        len: u64,
418    },
419    /// A configured artifact safety limit was exceeded.
420    ArtifactLimitExceeded {
421        /// Limited field name.
422        field: &'static str,
423        /// Observed length or count.
424        len: u64,
425        /// Maximum accepted length or count.
426        max: u64,
427    },
428    /// Reserved indexed-payload header bytes were non-zero.
429    NonZeroIndexedReserved {
430        /// Reserved header value.
431        value: u32,
432    },
433    /// The indexed payload ended before a required section was complete.
434    TruncatedIndexed {
435        /// Section or field being read.
436        field: &'static str,
437    },
438    /// The embedded FST section is invalid.
439    InvalidIndexedFst {
440        /// FST validation failure detail.
441        message: String,
442    },
443    /// Header entry count and FST entry count disagree.
444    IndexedEntryCountMismatch {
445        /// Entry count recorded in the header.
446        header_entries: usize,
447        /// Entry count observed in the FST.
448        fst_entries: usize,
449    },
450    /// A reading block offset points outside the reading section.
451    InvalidIndexedOffset {
452        /// Invalid offset value.
453        offset: u64,
454    },
455    /// Indexed payload bytes were not valid UTF-8.
456    InvalidIndexedUtf8 {
457        /// Field being decoded.
458        field: &'static str,
459        /// Underlying UTF-8 error.
460        source: FromUtf8Error,
461    },
462    /// YAML payload schema version is not supported.
463    UnsupportedSchemaVersion {
464        /// Schema version read from the payload.
465        version: u32,
466    },
467    /// YAML payload type does not identify a Chinese reading index.
468    UnsupportedPayloadType {
469        /// Payload type read from YAML.
470        payload_type: String,
471    },
472    /// Payload pinyin view is not supported.
473    UnsupportedPinyinView {
474        /// Pinyin-view string read from the payload.
475        pinyin_view: String,
476    },
477    /// A payload entry has an empty surface form.
478    EmptySurface {
479        /// Zero-based entry index.
480        entry_index: usize,
481    },
482    /// The payload contains a duplicate surface form.
483    DuplicateSurface {
484        /// Duplicate surface form.
485        surface: String,
486    },
487    /// A surface form has no readings.
488    EmptyReadings {
489        /// Surface form with no readings.
490        surface: String,
491    },
492    /// A surface form has an empty reading.
493    EmptyReading {
494        /// Surface form containing the empty reading.
495        surface: String,
496        /// Zero-based reading index.
497        reading_index: usize,
498    },
499    /// A surface form has a duplicate reading.
500    DuplicateReading {
501        /// Surface form containing the duplicate.
502        surface: String,
503        /// Duplicate reading.
504        reading: String,
505    },
506    /// A reading was not normalized for the artifact pinyin view.
507    ReadingNotNormalized {
508        /// Surface form containing the invalid reading.
509        surface: String,
510        /// Reading as stored in the payload.
511        reading: String,
512        /// Expected normalized reading.
513        normalized: String,
514    },
515}
516
/// Failures that can occur while building Chinese pinyin lattices.
#[derive(Debug, PartialEq, Eq)]
pub enum CnLatticeError {
    /// The caller supplied no pinyin readings.
    EmptyReadings,
    /// The input is neither usable as direct pinyin nor covered by a
    /// dictionary path.
    UnsupportedDirectInput {
        /// The input surface that is not supported.
        surface: String,
    },
    /// Loading the artifact or decoding its indexed payload failed.
    ArtifactPayload(String),
}
530
531impl PinyinView {
532    /// Returns the stable artifact string for this pinyin view.
533    pub fn as_str(self) -> &'static str {
534        match self {
535            Self::NoTone => "no-tone",
536            Self::Tone3 => "tone3",
537        }
538    }
539}
540
541impl TryFrom<&str> for PinyinView {
542    type Error = ();
543
544    fn try_from(value: &str) -> Result<Self, Self::Error> {
545        match value {
546            "no-tone" | "notone" | "normal" => Ok(Self::NoTone),
547            "tone3" => Ok(Self::Tone3),
548            _ => Err(()),
549        }
550    }
551}
552
553impl Default for CedictIndexOptions {
554    fn default() -> Self {
555        Self {
556            pinyin_view: PinyinView::NoTone,
557            max_readings_per_surface: None,
558        }
559    }
560}
561
562impl Default for PinyinReadingOptions {
563    fn default() -> Self {
564        Self {
565            max_span_chars: 8,
566            max_paths: 1024,
567            longest_match_only: false,
568            max_readings_per_segment: None,
569        }
570    }
571}
572
573impl Default for ZhArtifactLicense {
574    fn default() -> Self {
575        Self {
576            selected_license: "CC BY-SA 4.0".to_string(),
577            references: vec![ZhArtifactLicenseReference {
578                label: "CC-CEDICT".to_string(),
579                path: "license/CC-CEDICT.md".to_string(),
580            }],
581        }
582    }
583}
584
585impl fmt::Display for CedictError {
586    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
587        match self {
588            Self::Io(err) => write!(f, "failed to read CC-CEDICT: {err}"),
589            Self::InvalidEntry { line, message } => {
590                write!(f, "invalid CC-CEDICT entry at line {line}: {message}")
591            }
592        }
593    }
594}
595
596impl Error for CedictError {
597    fn source(&self) -> Option<&(dyn Error + 'static)> {
598        match self {
599            Self::Io(err) => Some(err),
600            Self::InvalidEntry { .. } => None,
601        }
602    }
603}
604
605impl fmt::Display for ZhArtifactPayloadError {
606    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
607        match self {
608            Self::Io(err) => write!(f, "failed to read zh artifact payload: {err}"),
609            Self::Yaml(err) => write!(f, "invalid zh artifact payload YAML: {err}"),
610            Self::InvalidIndexedMagic { magic } => {
611                write!(f, "invalid zh indexed artifact magic {magic:?}")
612            }
613            Self::UnsupportedIndexedVersion { version } => {
614                write!(f, "unsupported zh indexed artifact version {version}")
615            }
616            Self::UnsupportedIndexedPinyinView { value } => {
617                write!(f, "unsupported zh indexed artifact pinyin view {value}")
618            }
619            Self::IndexedSectionTooLarge { field, len } => {
620                write!(f, "zh indexed artifact {field} length {len} exceeds usize::MAX")
621            }
622            Self::ArtifactLimitExceeded { field, len, max } => {
623                write!(f, "zh artifact {field} length/count {len} exceeds limit {max}")
624            }
625            Self::NonZeroIndexedReserved { value } => {
626                write!(f, "zh indexed artifact reserved header field is {value}")
627            }
628            Self::TruncatedIndexed { field } => {
629                write!(f, "truncated zh indexed artifact while reading {field}")
630            }
631            Self::InvalidIndexedFst { message } => {
632                write!(f, "invalid zh indexed artifact FST: {message}")
633            }
634            Self::IndexedEntryCountMismatch {
635                header_entries,
636                fst_entries,
637            } => write!(
638                f,
639                "zh indexed artifact header entry count {header_entries} does not match FST entry count {fst_entries}"
640            ),
641            Self::InvalidIndexedOffset { offset } => {
642                write!(f, "invalid zh indexed artifact readings offset {offset}")
643            }
644            Self::InvalidIndexedUtf8 { field, source } => {
645                write!(f, "invalid UTF-8 in zh indexed artifact {field}: {source}")
646            }
647            Self::UnsupportedSchemaVersion { version } => {
648                write!(f, "unsupported zh artifact payload schema version {version}")
649            }
650            Self::UnsupportedPayloadType { payload_type } => {
651                write!(f, "unsupported zh artifact payload type {payload_type:?}")
652            }
653            Self::UnsupportedPinyinView { pinyin_view } => {
654                write!(f, "unsupported zh artifact pinyin view {pinyin_view:?}")
655            }
656            Self::EmptySurface { entry_index } => {
657                write!(f, "zh artifact payload entry {entry_index} has an empty surface")
658            }
659            Self::DuplicateSurface { surface } => {
660                write!(f, "zh artifact payload has duplicate surface {surface:?}")
661            }
662            Self::EmptyReadings { surface } => {
663                write!(f, "zh artifact payload surface {surface:?} has no readings")
664            }
665            Self::EmptyReading {
666                surface,
667                reading_index,
668            } => write!(
669                f,
670                "zh artifact payload surface {surface:?} has an empty reading at index {reading_index}"
671            ),
672            Self::DuplicateReading { surface, reading } => write!(
673                f,
674                "zh artifact payload surface {surface:?} has duplicate reading {reading:?}"
675            ),
676            Self::ReadingNotNormalized {
677                surface,
678                reading,
679                normalized,
680            } => write!(
681                f,
682                "zh artifact payload surface {surface:?} has non-normalized reading {reading:?}; expected {normalized:?}"
683            ),
684        }
685    }
686}
687
688impl Error for ZhArtifactPayloadError {
689    fn source(&self) -> Option<&(dyn Error + 'static)> {
690        match self {
691            Self::Io(err) => Some(err),
692            Self::Yaml(err) => Some(err),
693            Self::InvalidIndexedUtf8 { source, .. } => Some(source),
694            _ => None,
695        }
696    }
697}
698
699impl From<std::io::Error> for CedictError {
700    fn from(err: std::io::Error) -> Self {
701        Self::Io(err)
702    }
703}
704
705impl From<std::io::Error> for ZhArtifactPayloadError {
706    fn from(err: std::io::Error) -> Self {
707        Self::Io(err)
708    }
709}
710
711impl From<serde_yaml::Error> for ZhArtifactPayloadError {
712    fn from(err: serde_yaml::Error) -> Self {
713        Self::Yaml(err)
714    }
715}
716
717impl fmt::Display for CnLatticeError {
718    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
719        match self {
720            Self::EmptyReadings => write!(f, "at least one pinyin reading is required"),
721            Self::UnsupportedDirectInput { surface } => {
722                write!(f, "unsupported direct pinyin input {surface:?}")
723            }
724            Self::ArtifactPayload(err) => write!(f, "{err}"),
725        }
726    }
727}
728
729impl Error for CnLatticeError {}
730
731impl CedictReadingIndex {
732    /// Builds an index from a CC-CEDICT text file.
733    pub fn from_cedict_path(path: impl AsRef<Path>) -> Result<Self, CedictError> {
734        Self::from_cedict_path_with_options(path, CedictIndexOptions::default())
735    }
736
737    /// Builds an index from a CC-CEDICT text file with custom options.
738    pub fn from_cedict_path_with_options(
739        path: impl AsRef<Path>,
740        options: CedictIndexOptions,
741    ) -> Result<Self, CedictError> {
742        let file = File::open(path)?;
743        Self::from_cedict_reader_with_options(file, options)
744    }
745
746    /// Builds an index from a CC-CEDICT reader.
747    pub fn from_cedict_reader(reader: impl Read) -> Result<Self, CedictError> {
748        Self::from_cedict_reader_with_options(reader, CedictIndexOptions::default())
749    }
750
751    /// Builds an index from a CC-CEDICT reader with custom options.
752    pub fn from_cedict_reader_with_options(
753        reader: impl Read,
754        options: CedictIndexOptions,
755    ) -> Result<Self, CedictError> {
756        let mut by_surface = HashMap::<String, BTreeSet<String>>::new();
757        let reader = BufReader::new(reader);
758        for (line_index, line) in reader.lines().enumerate() {
759            let line_number = line_index + 1;
760            let line = line?;
761            let line = line.trim_end_matches('\r');
762            if line.is_empty() || line.starts_with('#') {
763                continue;
764            }
765
766            let entry = parse_cedict_entry(line, line_number)?;
767            let reading = normalize_pinyin(entry.pinyin, options.pinyin_view);
768            if reading.is_empty() {
769                continue;
770            }
771
772            by_surface
773                .entry(entry.traditional.to_string())
774                .or_default()
775                .insert(reading.clone());
776            by_surface
777                .entry(entry.simplified.to_string())
778                .or_default()
779                .insert(reading);
780        }
781
782        let readings_by_surface = by_surface
783            .into_iter()
784            .map(|(surface, readings)| {
785                let mut readings = readings.into_iter().collect::<Vec<_>>();
786                if let Some(max_readings) = options.max_readings_per_surface {
787                    readings.truncate(max_readings);
788                }
789                (surface, readings)
790            })
791            .filter(|(_, readings)| !readings.is_empty())
792            .collect();
793
794        Ok(Self {
795            storage: ZhReadingStorage::Eager(readings_by_surface),
796            pinyin_view: options.pinyin_view,
797        })
798    }
799
800    /// Loads a YAML artifact payload from a file path.
801    pub fn from_artifact_payload_path(
802        path: impl AsRef<Path>,
803    ) -> Result<Self, ZhArtifactPayloadError> {
804        let path = path.as_ref();
805        check_payload_file_size(path)?;
806        let file = File::open(path)?;
807        Self::from_artifact_payload_reader(file)
808    }
809
810    /// Loads a YAML artifact payload from a reader.
811    pub fn from_artifact_payload_reader(reader: impl Read) -> Result<Self, ZhArtifactPayloadError> {
812        let payload = serde_yaml::from_reader(reader)?;
813        Self::from_artifact_payload(payload)
814    }
815
816    /// Builds an index from a deserialized artifact payload.
817    pub fn from_artifact_payload(
818        payload: ZhReadingIndexPayload,
819    ) -> Result<Self, ZhArtifactPayloadError> {
820        validate_artifact_payload_header(&payload)?;
821        let pinyin_view = PinyinView::try_from(payload.pinyin_view.as_str()).map_err(|()| {
822            ZhArtifactPayloadError::UnsupportedPinyinView {
823                pinyin_view: payload.pinyin_view.clone(),
824            }
825        })?;
826        check_limit("entry_count", payload.entries.len(), MAX_ARTIFACT_ENTRIES)?;
827
828        let mut readings_by_surface = HashMap::new();
829        for (entry_index, entry) in payload.entries.into_iter().enumerate() {
830            check_limit(
831                "surface_bytes",
832                entry.surface.len(),
833                MAX_ARTIFACT_STRING_BYTES,
834            )?;
835            check_limit(
836                "reading_count",
837                entry.readings.len(),
838                MAX_ARTIFACT_READINGS_PER_ENTRY,
839            )?;
840            if entry.surface.is_empty() {
841                return Err(ZhArtifactPayloadError::EmptySurface { entry_index });
842            }
843            if entry.readings.is_empty() {
844                return Err(ZhArtifactPayloadError::EmptyReadings {
845                    surface: entry.surface,
846                });
847            }
848
849            let mut seen_readings = BTreeSet::new();
850            for (reading_index, reading) in entry.readings.iter().enumerate() {
851                check_limit("reading_bytes", reading.len(), MAX_ARTIFACT_STRING_BYTES)?;
852                if reading.is_empty() {
853                    return Err(ZhArtifactPayloadError::EmptyReading {
854                        surface: entry.surface,
855                        reading_index,
856                    });
857                }
858                let normalized = normalize_artifact_reading(reading, pinyin_view);
859                if normalized != *reading {
860                    return Err(ZhArtifactPayloadError::ReadingNotNormalized {
861                        surface: entry.surface,
862                        reading: reading.clone(),
863                        normalized,
864                    });
865                }
866                if !seen_readings.insert(reading) {
867                    return Err(ZhArtifactPayloadError::DuplicateReading {
868                        surface: entry.surface,
869                        reading: reading.clone(),
870                    });
871                }
872            }
873
874            if readings_by_surface
875                .insert(entry.surface.clone(), entry.readings)
876                .is_some()
877            {
878                return Err(ZhArtifactPayloadError::DuplicateSurface {
879                    surface: entry.surface,
880                });
881            }
882        }
883
884        Ok(Self {
885            storage: ZhReadingStorage::Eager(readings_by_surface),
886            pinyin_view,
887        })
888    }
889
890    /// Loads an indexed artifact payload from a file path using mmap-backed
891    /// storage.
892    ///
893    /// The file is validated before the index is returned, but readings remain
894    /// lazy and are decoded from the indexed payload during lookup.
895    pub fn from_indexed_artifact_payload_path(
896        path: impl AsRef<Path>,
897    ) -> Result<Self, ZhArtifactPayloadError> {
898        let path = path.as_ref();
899        check_payload_file_size(path)?;
900        let file = File::open(path)?;
901        // SAFETY: the mmap is kept alive by IndexedZhPayload for as long as
902        // any offsets or slices derived from it can be used.
903        let mmap = unsafe { Mmap::map(&file)? };
904        Self::from_indexed_mmap(mmap)
905    }
906
907    /// Loads an indexed artifact payload from bytes.
908    ///
909    /// This eagerly materializes the indexed payload and is intended for
910    /// environments such as WebAssembly where mmap-backed loading is not
911    /// available.
912    ///
913    /// # Errors
914    ///
915    /// Returns an error when the payload is too large, malformed, truncated,
916    /// has an invalid FST section, or fails canonical artifact validation.
917    pub fn from_indexed_artifact_payload_bytes(
918        bytes: &[u8],
919    ) -> Result<Self, ZhArtifactPayloadError> {
920        if bytes.len() as u64 > MAX_ARTIFACT_PAYLOAD_BYTES {
921            return Err(ZhArtifactPayloadError::ArtifactLimitExceeded {
922                field: "payload_bytes",
923                len: bytes.len() as u64,
924                max: MAX_ARTIFACT_PAYLOAD_BYTES,
925            });
926        }
927        let header = read_indexed_artifact_payload_header_bytes(bytes)?;
928        let fst_start = INDEXED_ARTIFACT_HEADER_LEN;
929        let fst_end = fst_start.checked_add(header.fst_len).ok_or(
930            ZhArtifactPayloadError::TruncatedIndexed {
931                field: "fst_section",
932            },
933        )?;
934        let readings_end = fst_end.checked_add(header.readings_len).ok_or(
935            ZhArtifactPayloadError::TruncatedIndexed {
936                field: "readings_section",
937            },
938        )?;
939        if bytes.len() < readings_end {
940            return Err(ZhArtifactPayloadError::TruncatedIndexed {
941                field: "indexed_payload",
942            });
943        }
944
945        let map = Map::new(bytes[fst_start..fst_end].to_vec()).map_err(|err| {
946            ZhArtifactPayloadError::InvalidIndexedFst {
947                message: err.to_string(),
948            }
949        })?;
950        let fst_entries = map.len();
951        if fst_entries != header.entries {
952            return Err(ZhArtifactPayloadError::IndexedEntryCountMismatch {
953                header_entries: header.entries,
954                fst_entries,
955            });
956        }
957
958        let mut entries = Vec::with_capacity(header.entries);
959        let mut stream = map.stream();
960        while let Some((surface, offset)) = stream.next() {
961            let surface = String::from_utf8(surface.to_vec()).map_err(|source| {
962                ZhArtifactPayloadError::InvalidIndexedUtf8 {
963                    field: "surface",
964                    source,
965                }
966            })?;
967            let readings = read_indexed_readings_at_bytes(bytes, fst_end, offset)?;
968            entries.push(ZhReadingIndexPayloadEntry { surface, readings });
969        }
970
971        Self::from_artifact_payload(ZhReadingIndexPayload {
972            schema_version: ARTIFACT_PAYLOAD_SCHEMA_VERSION,
973            payload_type: ARTIFACT_PAYLOAD_TYPE.to_string(),
974            pinyin_view: header.pinyin_view.as_str().to_string(),
975            entries,
976        })
977    }
978
979    fn from_indexed_mmap(mmap: Mmap) -> Result<Self, ZhArtifactPayloadError> {
980        if mmap.len() as u64 > MAX_ARTIFACT_PAYLOAD_BYTES {
981            return Err(ZhArtifactPayloadError::ArtifactLimitExceeded {
982                field: "payload_bytes",
983                len: mmap.len() as u64,
984                max: MAX_ARTIFACT_PAYLOAD_BYTES,
985            });
986        }
987        let header = read_indexed_artifact_payload_header_bytes(&mmap)?;
988        let fst_start = INDEXED_ARTIFACT_HEADER_LEN;
989        let fst_end = fst_start.checked_add(header.fst_len).ok_or(
990            ZhArtifactPayloadError::TruncatedIndexed {
991                field: "fst_section",
992            },
993        )?;
994        let readings_end = fst_end.checked_add(header.readings_len).ok_or(
995            ZhArtifactPayloadError::TruncatedIndexed {
996                field: "readings_section",
997            },
998        )?;
999        if mmap.len() < readings_end {
1000            return Err(ZhArtifactPayloadError::TruncatedIndexed {
1001                field: "indexed_payload",
1002            });
1003        }
1004
1005        let map = Map::new(mmap[fst_start..fst_end].to_vec()).map_err(|err| {
1006            ZhArtifactPayloadError::InvalidIndexedFst {
1007                message: err.to_string(),
1008            }
1009        })?;
1010        let fst_entries = map.len();
1011        if fst_entries != header.entries {
1012            return Err(ZhArtifactPayloadError::IndexedEntryCountMismatch {
1013                header_entries: header.entries,
1014                fst_entries,
1015            });
1016        }
1017        let indexed = IndexedZhPayload {
1018            mmap: Arc::new(mmap),
1019            map,
1020            readings_start: fst_end,
1021            entries: header.entries,
1022        };
1023        indexed.validate(header.pinyin_view)?;
1024        Ok(Self {
1025            storage: ZhReadingStorage::Indexed(indexed),
1026            pinyin_view: header.pinyin_view,
1027        })
1028    }
1029
1030    /// Returns the pinyin representation stored by this index.
1031    pub fn pinyin_view(&self) -> PinyinView {
1032        self.pinyin_view
1033    }
1034
1035    /// Returns pinyin readings for `surface`, if present.
1036    ///
1037    /// For indexed artifacts, decode errors are treated the same as a missing
1038    /// surface for backward compatibility. Use [`Self::try_readings`] at trust
1039    /// boundaries when artifact corruption must be reported distinctly.
1040    pub fn readings(&self, surface: &str) -> Option<Cow<'_, [String]>> {
1041        self.try_readings(surface).ok().flatten()
1042    }
1043
1044    /// Returns pinyin readings for `surface` and preserves indexed artifact
1045    /// decode errors.
1046    pub fn try_readings(
1047        &self,
1048        surface: &str,
1049    ) -> Result<Option<Cow<'_, [String]>>, ZhArtifactPayloadError> {
1050        match &self.storage {
1051            ZhReadingStorage::Eager(readings_by_surface) => Ok(readings_by_surface
1052                .get(surface)
1053                .map(|readings| Cow::Borrowed(readings.as_slice()))),
1054            ZhReadingStorage::Indexed(indexed) => indexed
1055                .readings(surface)
1056                .map(|readings| readings.map(Cow::Owned)),
1057        }
1058    }
1059
1060    /// Returns the number of indexed Chinese surface forms.
1061    pub fn len(&self) -> usize {
1062        match &self.storage {
1063            ZhReadingStorage::Eager(readings_by_surface) => readings_by_surface.len(),
1064            ZhReadingStorage::Indexed(indexed) => indexed.entries,
1065        }
1066    }
1067
1068    /// Returns `true` when the index contains no surface forms.
1069    pub fn is_empty(&self) -> bool {
1070        self.len() == 0
1071    }
1072
1073    /// Builds bundle metadata for the current index and caller-provided
1074    /// provenance.
1075    ///
1076    /// The returned metadata includes a canonical payload checksum computed
1077    /// from the normalized payload view.
1078    pub fn artifact_metadata(&self, options: ZhArtifactMetadataOptions) -> ZhArtifactMetadata {
1079        ZhArtifactMetadata {
1080            schema_version: 1,
1081            artifact_type: "moine.zh.reading-index".to_string(),
1082            artifact_name: options.artifact_name,
1083            generator: options.generator,
1084            payload: ZhArtifactPayload {
1085                path: options.payload_file_name,
1086                format: options.payload_format,
1087                file_digest_algorithm: None,
1088                file_digest: None,
1089                checksum_algorithm: ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM.to_string(),
1090                checksum: self.artifact_payload_checksum(),
1091            },
1092            source: ZhArtifactSource {
1093                name: options.source_name,
1094                version: options.source_version,
1095                cedict: options.source_cedict,
1096            },
1097            build: ZhArtifactBuild {
1098                pinyin_view: options.index_options.pinyin_view.as_str().to_string(),
1099                max_readings_per_surface: options.index_options.max_readings_per_surface,
1100                entries: self.len(),
1101            },
1102            query_defaults: ZhArtifactQueryDefaults {
1103                max_span_chars: options.query_defaults.max_span_chars,
1104                max_paths: options.query_defaults.max_paths,
1105                longest_match_only: options.query_defaults.longest_match_only,
1106                max_readings_per_segment: options.query_defaults.max_readings_per_segment,
1107            },
1108            license: options.license,
1109        }
1110    }
1111
1112    /// Returns the normalized YAML-compatible payload view for this index.
1113    ///
1114    /// Entries are sorted by surface form so serialization and checksums are
1115    /// deterministic regardless of the index storage backend.
1116    pub fn artifact_payload(&self) -> ZhReadingIndexPayload {
1117        let entries = match &self.storage {
1118            ZhReadingStorage::Eager(readings_by_surface) => {
1119                let mut entries = readings_by_surface
1120                    .iter()
1121                    .map(|(surface, readings)| ZhReadingIndexPayloadEntry {
1122                        surface: surface.clone(),
1123                        readings: readings.clone(),
1124                    })
1125                    .collect::<Vec<_>>();
1126                entries.sort_by(|left, right| left.surface.cmp(&right.surface));
1127                entries
1128            }
1129            ZhReadingStorage::Indexed(indexed) => indexed
1130                .entries()
1131                .expect("validated indexed artifact should decode"),
1132        };
1133
1134        ZhReadingIndexPayload {
1135            schema_version: ARTIFACT_PAYLOAD_SCHEMA_VERSION,
1136            payload_type: ARTIFACT_PAYLOAD_TYPE.to_string(),
1137            pinyin_view: self.pinyin_view.as_str().to_string(),
1138            entries,
1139        }
1140    }
1141
1142    /// Returns the canonical checksum for the normalized payload.
1143    pub fn artifact_payload_checksum(&self) -> String {
1144        self.artifact_payload_checksum_for_algorithm(ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM)
1145            .expect("default artifact checksum algorithm should be supported")
1146    }
1147
1148    /// Returns a canonical payload checksum for `algorithm`.
1149    ///
1150    /// Unknown algorithms return `None`.
1151    pub fn artifact_payload_checksum_for_algorithm(&self, algorithm: &str) -> Option<String> {
1152        let payload = self.artifact_payload();
1153        let bytes = canonical_payload_bytes(&payload);
1154        match algorithm {
1155            ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM => Some(sha256_hex(&bytes)),
1156            _ => None,
1157        }
1158    }
1159
1160    /// Writes the indexed FST-backed artifact payload format.
1161    ///
1162    /// The payload stores a finite-state transducer from surface form to an
1163    /// offset in a compact reading blob and can be loaded with
1164    /// [`Self::from_indexed_artifact_payload_path`].
1165    pub fn write_indexed_artifact_payload(
1166        &self,
1167        mut writer: impl Write,
1168    ) -> Result<(), ZhArtifactPayloadError> {
1169        let payload = self.artifact_payload();
1170        let mut fst_bytes = Vec::new();
1171        let mut readings_bytes = Vec::new();
1172        {
1173            let mut builder = MapBuilder::new(&mut fst_bytes).map_err(|err| {
1174                ZhArtifactPayloadError::InvalidIndexedFst {
1175                    message: err.to_string(),
1176                }
1177            })?;
1178            for entry in &payload.entries {
1179                let offset = readings_bytes.len() as u64;
1180                builder.insert(&entry.surface, offset).map_err(|err| {
1181                    ZhArtifactPayloadError::InvalidIndexedFst {
1182                        message: err.to_string(),
1183                    }
1184                })?;
1185                write_indexed_reading_block(&mut readings_bytes, &entry.readings)?;
1186            }
1187            builder
1188                .finish()
1189                .map_err(|err| ZhArtifactPayloadError::InvalidIndexedFst {
1190                    message: err.to_string(),
1191                })?;
1192        }
1193
1194        writer.write_all(INDEXED_ARTIFACT_MAGIC)?;
1195        writer.write_all(&INDEXED_ARTIFACT_VERSION.to_le_bytes())?;
1196        writer.write_all(&pinyin_view_header_value(self.pinyin_view).to_le_bytes())?;
1197        writer.write_all(&(payload.entries.len() as u64).to_le_bytes())?;
1198        writer.write_all(&(fst_bytes.len() as u64).to_le_bytes())?;
1199        writer.write_all(&(readings_bytes.len() as u64).to_le_bytes())?;
1200        writer.write_all(&fst_bytes)?;
1201        writer.write_all(&readings_bytes)?;
1202        Ok(())
1203    }
1204
1205    /// Expands `text` into joined pinyin reading strings.
1206    ///
1207    /// This is a compatibility helper over [`Self::reading_paths`]. It drops
1208    /// segment boundaries and treats indexed artifact decode errors as an empty
1209    /// expansion.
1210    pub fn reading_sequences(&self, text: &str, options: PinyinReadingOptions) -> Vec<String> {
1211        self.reading_paths(text, options)
1212            .into_iter()
1213            .map(|path| path.joined_reading)
1214            .collect()
1215    }
1216
1217    /// Expands `text` into dictionary-only pinyin reading paths.
1218    ///
1219    /// Every returned path contains surface/reading segment boundaries plus the
1220    /// joined pinyin reading. Use [`Self::try_reading_paths_with_stats`] when
1221    /// indexed artifact corruption must be reported.
1222    pub fn reading_paths(
1223        &self,
1224        text: &str,
1225        options: PinyinReadingOptions,
1226    ) -> Vec<PinyinReadingPath> {
1227        self.reading_paths_with_stats(text, options).paths
1228    }
1229
1230    /// Expands dictionary pinyin paths and treats artifact decode errors as an
1231    /// empty expansion for backward compatibility.
1232    ///
1233    /// Use [`Self::try_reading_paths_with_stats`] when loading indexed
1234    /// artifacts from outside the process trust boundary.
1235    pub fn reading_paths_with_stats(
1236        &self,
1237        text: &str,
1238        options: PinyinReadingOptions,
1239    ) -> PinyinReadingExpansion {
1240        self.try_reading_paths_with_stats(text, options)
1241            .unwrap_or_default()
1242    }
1243
1244    /// Expands dictionary pinyin paths and preserves indexed artifact decode
1245    /// errors.
1246    pub fn try_reading_paths_with_stats(
1247        &self,
1248        text: &str,
1249        options: PinyinReadingOptions,
1250    ) -> Result<PinyinReadingExpansion, ZhArtifactPayloadError> {
1251        self.reading_paths_with_stats_inner(text, options, false)
1252    }
1253
1254    /// Expands `text` into pinyin reading paths with direct fallback segments.
1255    ///
1256    /// Dictionary matches are preferred, but ASCII pinyin spans can pass
1257    /// through directly so mixed dictionary/direct input can still form a full
1258    /// path.
1259    pub fn hybrid_reading_paths(
1260        &self,
1261        text: &str,
1262        options: PinyinReadingOptions,
1263    ) -> Vec<PinyinReadingPath> {
1264        self.hybrid_reading_paths_with_stats(text, options).paths
1265    }
1266
1267    /// Expands hybrid dictionary/direct pinyin paths and treats artifact decode
1268    /// errors as an empty expansion for backward compatibility.
1269    ///
1270    /// Use [`Self::try_hybrid_reading_paths_with_stats`] when loading indexed
1271    /// artifacts from outside the process trust boundary.
1272    pub fn hybrid_reading_paths_with_stats(
1273        &self,
1274        text: &str,
1275        options: PinyinReadingOptions,
1276    ) -> PinyinReadingExpansion {
1277        self.try_hybrid_reading_paths_with_stats(text, options)
1278            .unwrap_or_default()
1279    }
1280
1281    /// Expands hybrid dictionary/direct pinyin paths and preserves indexed
1282    /// artifact decode errors.
1283    pub fn try_hybrid_reading_paths_with_stats(
1284        &self,
1285        text: &str,
1286        options: PinyinReadingOptions,
1287    ) -> Result<PinyinReadingExpansion, ZhArtifactPayloadError> {
1288        self.reading_paths_with_stats_inner(text, options, true)
1289    }
1290
1291    /// Builds a pinyin lattice from dictionary-only readings of `text`.
1292    ///
1293    /// Returns `Ok(None)` when the dictionary cannot cover the entire input.
1294    /// Indexed artifact decode errors are reported as
1295    /// [`CnLatticeError::ArtifactPayload`].
1296    pub fn pinyin_lattice(
1297        &self,
1298        text: &str,
1299        options: PinyinReadingOptions,
1300    ) -> Result<Option<Lattice>, CnLatticeError> {
1301        let paths = self
1302            .try_reading_paths_with_stats(text, options)
1303            .map_err(|err| CnLatticeError::ArtifactPayload(err.to_string()))?
1304            .paths;
1305        if paths.is_empty() {
1306            return Ok(None);
1307        }
1308        pinyin_lattice_from_reading_paths(&paths).map(Some)
1309    }
1310
1311    /// Builds a pinyin lattice with dictionary readings and direct fallback.
1312    ///
1313    /// This is the preferred lattice builder for mixed Chinese text where
1314    /// direct pinyin spans may appear beside CC-CEDICT-backed surfaces.
1315    pub fn hybrid_pinyin_lattice(
1316        &self,
1317        text: &str,
1318        options: PinyinReadingOptions,
1319    ) -> Result<Option<Lattice>, CnLatticeError> {
1320        let paths = self
1321            .try_hybrid_reading_paths_with_stats(text, options)
1322            .map_err(|err| CnLatticeError::ArtifactPayload(err.to_string()))?
1323            .paths;
1324        if paths.is_empty() {
1325            return Ok(None);
1326        }
1327        pinyin_lattice_from_reading_paths(&paths).map(Some)
1328    }
1329
1330    fn reading_paths_with_stats_inner(
1331        &self,
1332        text: &str,
1333        options: PinyinReadingOptions,
1334        allow_direct_fallback: bool,
1335    ) -> Result<PinyinReadingExpansion, ZhArtifactPayloadError> {
1336        if text.is_empty() || options.max_span_chars == 0 || options.max_paths == 0 {
1337            return Ok(PinyinReadingExpansion::default());
1338        }
1339
1340        let mut stats = PinyinReadingStats::default();
1341        let boundaries = char_boundaries(text);
1342        let char_len = boundaries.len() - 1;
1343        let mut suffix_paths = vec![Vec::<PinyinReadingPath>::new(); char_len + 1];
1344        suffix_paths[char_len].push(PinyinReadingPath {
1345            segments: Vec::new(),
1346            joined_reading: String::new(),
1347        });
1348
1349        for start in (0..char_len).rev() {
1350            let mut paths_by_reading = BTreeMap::new();
1351            let end_limit = char_len.min(start + options.max_span_chars);
1352            let mut matching_ends = Vec::new();
1353
1354            for end in start + 1..=end_limit {
1355                let surface = &text[boundaries[start]..boundaries[end]];
1356                if self.try_readings(surface)?.is_some() && !suffix_paths[end].is_empty() {
1357                    matching_ends.push(end);
1358                }
1359            }
1360            stats.matched_spans += matching_ends.len();
1361
1362            if options.longest_match_only {
1363                if let Some(end) = matching_ends.last().copied() {
1364                    stats.longest_match_pruned_spans += matching_ends.len().saturating_sub(1);
1365                    matching_ends.clear();
1366                    matching_ends.push(end);
1367                }
1368            }
1369
1370            for end in matching_ends {
1371                let surface = &text[boundaries[start]..boundaries[end]];
1372                let Some(surface_readings) = self.try_readings(surface)? else {
1373                    continue;
1374                };
1375
1376                stats.raw_segment_readings += surface_readings.len();
1377                let raw_surface_reading_count = surface_readings.len();
1378                let surface_readings = limited_surface_readings(surface_readings.as_ref(), options);
1379                stats.used_segment_readings += surface_readings.len();
1380                stats.pruned_segment_readings += raw_surface_reading_count - surface_readings.len();
1381
1382                for surface_reading in surface_readings {
1383                    for suffix in &suffix_paths[end] {
1384                        stats.candidate_combinations += 1;
1385                        let mut reading = String::with_capacity(
1386                            surface_reading.len() + suffix.joined_reading.len(),
1387                        );
1388                        reading.push_str(surface_reading);
1389                        reading.push_str(&suffix.joined_reading);
1390
1391                        let mut segments = Vec::with_capacity(suffix.segments.len() + 1);
1392                        segments.push(PinyinReadingSegment {
1393                            surface: surface.to_string(),
1394                            reading: surface_reading.to_string(),
1395                        });
1396                        segments.extend(suffix.segments.iter().cloned());
1397
1398                        match paths_by_reading.entry(reading.clone()) {
1399                            Entry::Vacant(entry) => {
1400                                entry.insert(PinyinReadingPath {
1401                                    segments,
1402                                    joined_reading: reading,
1403                                });
1404                                stats.unique_paths += 1;
1405                            }
1406                            Entry::Occupied(_) => {
1407                                stats.duplicate_joined_readings += 1;
1408                            }
1409                        }
1410
1411                        if paths_by_reading.len() >= options.max_paths {
1412                            stats.max_paths_hit_count += 1;
1413                            break;
1414                        }
1415                    }
1416
1417                    if paths_by_reading.len() >= options.max_paths {
1418                        break;
1419                    }
1420                }
1421
1422                if paths_by_reading.len() >= options.max_paths {
1423                    break;
1424                }
1425            }
1426
1427            if allow_direct_fallback && paths_by_reading.len() < options.max_paths {
1428                if let Some(end) = direct_fallback_end(text, &boundaries, start, char_len) {
1429                    if !suffix_paths[end].is_empty() {
1430                        stats.direct_fallback_spans += 1;
1431                        let surface = &text[boundaries[start]..boundaries[end]];
1432                        let reading = normalize_direct_ascii(surface);
1433                        for suffix in &suffix_paths[end] {
1434                            stats.candidate_combinations += 1;
1435                            let mut joined =
1436                                String::with_capacity(reading.len() + suffix.joined_reading.len());
1437                            joined.push_str(&reading);
1438                            joined.push_str(&suffix.joined_reading);
1439
1440                            let mut segments = Vec::with_capacity(suffix.segments.len() + 1);
1441                            segments.push(PinyinReadingSegment {
1442                                surface: surface.to_string(),
1443                                reading: reading.clone(),
1444                            });
1445                            segments.extend(suffix.segments.iter().cloned());
1446
1447                            match paths_by_reading.entry(joined.clone()) {
1448                                Entry::Vacant(entry) => {
1449                                    entry.insert(PinyinReadingPath {
1450                                        segments,
1451                                        joined_reading: joined,
1452                                    });
1453                                    stats.unique_paths += 1;
1454                                }
1455                                Entry::Occupied(_) => {
1456                                    stats.duplicate_joined_readings += 1;
1457                                }
1458                            }
1459
1460                            if paths_by_reading.len() >= options.max_paths {
1461                                stats.max_paths_hit_count += 1;
1462                                break;
1463                            }
1464                        }
1465                    }
1466                }
1467            }
1468
1469            suffix_paths[start] = paths_by_reading.into_values().collect();
1470        }
1471
1472        Ok(PinyinReadingExpansion {
1473            paths: suffix_paths.remove(0),
1474            stats,
1475        })
1476    }
1477}
1478
1479/// Compares two strings using direct pinyin handling and a CC-CEDICT index.
1480pub fn compare_with_cedict_index(
1481    left: &str,
1482    right: &str,
1483    index: &CedictReadingIndex,
1484    options: PinyinReadingOptions,
1485) -> Result<ChineseDistance, CnLatticeError> {
1486    compare_with_zh_index(left, right, index, options)
1487}
1488
1489/// Compares two strings using direct pinyin handling and a Chinese index.
1490pub fn compare_with_zh_index(
1491    left: &str,
1492    right: &str,
1493    index: &ZhReadingIndex,
1494    options: PinyinReadingOptions,
1495) -> Result<ChineseDistance, CnLatticeError> {
1496    let left_lattice = cedict_or_direct_lattice(left, index, options)?;
1497    let right_lattice = cedict_or_direct_lattice(right, index, options)?;
1498    Ok(compare_lattices(left, right, &left_lattice, &right_lattice))
1499}
1500
1501/// Computes the best normalized similarity across Chinese pinyin readings.
1502pub fn normalized_similarity_with_zh_index(
1503    left: &str,
1504    right: &str,
1505    index: &ZhReadingIndex,
1506    options: PinyinReadingOptions,
1507) -> Result<f64, CnLatticeError> {
1508    let left_paths = zh_or_direct_pinyin_paths(left, index, options)?;
1509    let right_paths = zh_or_direct_pinyin_paths(right, index, options)?;
1510    Ok(max_normalized_similarity(&left_paths, &right_paths))
1511}
1512
1513/// Builds a pinyin lattice from direct input, CC-CEDICT readings, or both.
1514pub fn cedict_or_direct_lattice(
1515    input: &str,
1516    index: &CedictReadingIndex,
1517    options: PinyinReadingOptions,
1518) -> Result<Lattice, CnLatticeError> {
1519    zh_or_direct_lattice(input, index, options)
1520}
1521
1522/// Builds a pinyin lattice from direct input, dictionary readings, or both.
1523pub fn zh_or_direct_lattice(
1524    input: &str,
1525    index: &ZhReadingIndex,
1526    options: PinyinReadingOptions,
1527) -> Result<Lattice, CnLatticeError> {
1528    if let Some(lattice) = direct_pinyin_lattice(input) {
1529        return Ok(lattice);
1530    }
1531
1532    if let Some(lattice) = index.pinyin_lattice(input, options)? {
1533        return Ok(lattice);
1534    }
1535
1536    if let Some(lattice) = index.hybrid_pinyin_lattice(input, options)? {
1537        return Ok(lattice);
1538    }
1539
1540    direct_pinyin_lattice(input).ok_or_else(|| CnLatticeError::UnsupportedDirectInput {
1541        surface: input.to_string(),
1542    })
1543}
1544
1545/// Returns pinyin paths from direct input, dictionary readings, or both.
1546pub fn zh_or_direct_pinyin_paths(
1547    input: &str,
1548    index: &ZhReadingIndex,
1549    options: PinyinReadingOptions,
1550) -> Result<Vec<String>, CnLatticeError> {
1551    if can_build_direct_pinyin_path(input) {
1552        return Ok(vec![normalize_direct_ascii(input)]);
1553    }
1554
1555    let paths = index
1556        .try_reading_paths_with_stats(input, options)
1557        .map_err(|err| CnLatticeError::ArtifactPayload(err.to_string()))?
1558        .paths;
1559    if !paths.is_empty() {
1560        return Ok(paths.into_iter().map(|path| path.joined_reading).collect());
1561    }
1562
1563    let paths = index
1564        .try_hybrid_reading_paths_with_stats(input, options)
1565        .map_err(|err| CnLatticeError::ArtifactPayload(err.to_string()))?
1566        .paths;
1567    if !paths.is_empty() {
1568        return Ok(paths.into_iter().map(|path| path.joined_reading).collect());
1569    }
1570
1571    Err(CnLatticeError::UnsupportedDirectInput {
1572        surface: input.to_string(),
1573    })
1574}
1575
1576fn max_normalized_similarity(left_paths: &[String], right_paths: &[String]) -> f64 {
1577    left_paths
1578        .iter()
1579        .flat_map(|left| {
1580            right_paths
1581                .iter()
1582                .map(move |right| normalized_similarity_str(left, right))
1583        })
1584        .fold(0.0, f64::max)
1585}
1586
1587/// Builds a pinyin lattice from expanded reading paths.
1588///
1589/// Each path contributes one complete pinyin string to the compact lattice.
1590/// Segment boundaries are used before this step and are not represented in the
1591/// returned lattice.
1592pub fn pinyin_lattice_from_reading_paths(
1593    paths: &[PinyinReadingPath],
1594) -> Result<Lattice, CnLatticeError> {
1595    if paths.is_empty() {
1596        return Err(CnLatticeError::EmptyReadings);
1597    }
1598
1599    Ok(Lattice::from_symbol_paths_compact(paths.iter().map(
1600        |path| {
1601            path.joined_reading
1602                .chars()
1603                .map(|ch| ch as moine_core::Symbol)
1604                .collect::<Vec<_>>()
1605        },
1606    )))
1607}
1608
1609/// Normalizes a whitespace-separated CC-CEDICT pinyin field.
1610///
1611/// In [`PinyinView::NoTone`], tone digits that follow Latin letters are
1612/// removed while numeric tokens such as `11` are preserved. In
1613/// [`PinyinView::Tone3`], tone digits are retained.
1614pub fn normalize_pinyin(raw: &str, view: PinyinView) -> String {
1615    let mut normalized = String::new();
1616    for token in raw.split_whitespace() {
1617        normalized.push_str(&normalize_pinyin_token(token, view));
1618    }
1619    match view {
1620        PinyinView::NoTone => strip_no_tone_digits(&normalized),
1621        PinyinView::Tone3 => normalized,
1622    }
1623}
1624
1625fn compare_lattices(
1626    left: &str,
1627    right: &str,
1628    left_lattice: &Lattice,
1629    right_lattice: &Lattice,
1630) -> ChineseDistance {
1631    let lattice = distance(left_lattice, right_lattice);
1632    let lattice_damerau = damerau_distance(left_lattice, right_lattice);
1633    let surface_levenshtein = levenshtein_str(left, right);
1634    let surface_damerau = damerau_levenshtein_str(left, right);
1635
1636    ChineseDistance {
1637        surface_levenshtein,
1638        surface_damerau,
1639        lattice,
1640        lattice_damerau,
1641        combined: surface_damerau.min(lattice),
1642    }
1643}
1644
1645fn direct_pinyin_lattice(input: &str) -> Option<Lattice> {
1646    if input.is_empty() || !can_build_direct_pinyin_path(input) {
1647        return None;
1648    }
1649    Some(Lattice::from_paths([normalize_direct_ascii(input)]))
1650}
1651
1652fn normalize_pinyin_token(token: &str, view: PinyinView) -> String {
1653    let lowered = token.to_lowercase().replace("u:", "v").replace('ü', "v");
1654    let contains_letters = lowered.chars().any(|ch| ch.is_ascii_alphabetic());
1655    if view == PinyinView::NoTone && contains_letters {
1656        lowered
1657            .chars()
1658            .filter(|ch| !matches!(ch, '1'..='5'))
1659            .collect()
1660    } else {
1661        lowered
1662    }
1663}
1664
/// Lowercases directly typed input and rewrites the `u:` spelling to `v`.
fn normalize_direct_ascii(input: &str) -> String {
    let lowered = input.to_lowercase();
    lowered.replace("u:", "v")
}
1668
1669fn normalize_artifact_reading(reading: &str, view: PinyinView) -> String {
1670    let lowered = reading
1671        .to_lowercase()
1672        .replace("u:", "v")
1673        .replace('ü', "v")
1674        .chars()
1675        .filter(|ch| !ch.is_whitespace())
1676        .collect::<String>();
1677    match view {
1678        PinyinView::NoTone => strip_no_tone_digits(&lowered),
1679        PinyinView::Tone3 => lowered,
1680    }
1681}
1682
/// Removes digits `1`-`5` that come after an ASCII letter.
///
/// "After" is judged against the last character that was kept, not the last
/// character seen, so a whole run of such digits following a letter is
/// removed. Digits at the start of the string, or following a kept
/// non-letter, are preserved.
fn strip_no_tone_digits(reading: &str) -> String {
    let mut kept = String::with_capacity(reading.len());
    let mut last_kept_is_letter = false;
    for ch in reading.chars() {
        let is_tone_digit = ('1'..='5').contains(&ch);
        if is_tone_digit && last_kept_is_letter {
            // Dropped characters do not update the "last kept" state.
            continue;
        }
        last_kept_is_letter = ch.is_ascii_alphabetic();
        kept.push(ch);
    }
    kept
}
1696
/// Lists the byte offset at which each character of `text` starts, followed
/// by `text.len()` as a final end marker.
///
/// The result therefore has one more element than `text` has characters.
fn char_boundaries(text: &str) -> Vec<usize> {
    let mut boundaries: Vec<usize> = text.char_indices().map(|(offset, _)| offset).collect();
    boundaries.push(text.len());
    boundaries
}
1703
1704fn limited_surface_readings(readings: &[String], options: PinyinReadingOptions) -> &[String] {
1705    if let Some(max_readings) = options.max_readings_per_segment {
1706        &readings[..readings.len().min(max_readings)]
1707    } else {
1708        readings
1709    }
1710}
1711
1712fn direct_fallback_end(
1713    text: &str,
1714    boundaries: &[usize],
1715    start: usize,
1716    char_len: usize,
1717) -> Option<usize> {
1718    let mut end = start;
1719    while end < char_len {
1720        let surface = &text[boundaries[start]..boundaries[end + 1]];
1721        if !can_build_direct_pinyin_path(surface) {
1722            break;
1723        }
1724        end += 1;
1725    }
1726
1727    (end > start).then_some(end)
1728}
1729
1730fn pinyin_view_header_value(view: PinyinView) -> u32 {
1731    match view {
1732        PinyinView::NoTone => 0,
1733        PinyinView::Tone3 => 1,
1734    }
1735}
1736
1737fn pinyin_view_from_header_value(value: u32) -> Result<PinyinView, ZhArtifactPayloadError> {
1738    match value {
1739        0 => Ok(PinyinView::NoTone),
1740        1 => Ok(PinyinView::Tone3),
1741        _ => Err(ZhArtifactPayloadError::UnsupportedIndexedPinyinView { value }),
1742    }
1743}
1744
1745fn write_binary_string(
1746    writer: &mut impl Write,
1747    field: &'static str,
1748    value: &str,
1749) -> Result<(), ZhArtifactPayloadError> {
1750    write_u32_len(writer, field, value.len())?;
1751    writer.write_all(value.as_bytes())?;
1752    Ok(())
1753}
1754
1755fn write_u32_len(
1756    writer: &mut impl Write,
1757    field: &'static str,
1758    len: usize,
1759) -> Result<(), ZhArtifactPayloadError> {
1760    let len = u32::try_from(len).map_err(|_| ZhArtifactPayloadError::IndexedSectionTooLarge {
1761        field,
1762        len: len as u64,
1763    })?;
1764    writer.write_all(&len.to_le_bytes())?;
1765    Ok(())
1766}
1767
1768fn read_indexed_artifact_payload_header_bytes(
1769    bytes: &[u8],
1770) -> Result<ZhIndexedArtifactPayloadHeader, ZhArtifactPayloadError> {
1771    if bytes.len() < INDEXED_ARTIFACT_HEADER_LEN {
1772        return Err(ZhArtifactPayloadError::TruncatedIndexed { field: "header" });
1773    }
1774    let mut magic = [0_u8; 8];
1775    magic.copy_from_slice(&bytes[..8]);
1776    if &magic != INDEXED_ARTIFACT_MAGIC {
1777        return Err(ZhArtifactPayloadError::InvalidIndexedMagic { magic });
1778    }
1779
1780    let version = read_u32_le_bytes(bytes, 8, "version")?;
1781    if version != INDEXED_ARTIFACT_VERSION {
1782        return Err(ZhArtifactPayloadError::UnsupportedIndexedVersion { version });
1783    }
1784    let pinyin_view = pinyin_view_from_header_value(read_u32_le_bytes(bytes, 12, "pinyin_view")?)?;
1785    let entry_count = read_u64_le_bytes(bytes, 16, "entry_count")?;
1786    let fst_len = read_u64_le_bytes(bytes, 24, "fst_len")?;
1787    let readings_len = read_u64_le_bytes(bytes, 32, "readings_len")?;
1788    let entries = checked_indexed_usize("entry_count", entry_count)?;
1789    check_limit("entry_count", entries, MAX_ARTIFACT_ENTRIES)?;
1790    Ok(ZhIndexedArtifactPayloadHeader {
1791        version,
1792        pinyin_view,
1793        entries,
1794        fst_len: checked_indexed_usize("fst_len", fst_len)?,
1795        readings_len: checked_indexed_usize("readings_len", readings_len)?,
1796    })
1797}
1798
1799fn read_u32_le_bytes(
1800    bytes: &[u8],
1801    offset: usize,
1802    field: &'static str,
1803) -> Result<u32, ZhArtifactPayloadError> {
1804    let end = offset
1805        .checked_add(4)
1806        .ok_or(ZhArtifactPayloadError::TruncatedIndexed { field })?;
1807    let chunk = bytes
1808        .get(offset..end)
1809        .ok_or(ZhArtifactPayloadError::TruncatedIndexed { field })?;
1810    Ok(u32::from_le_bytes(
1811        chunk.try_into().expect("slice length is 4"),
1812    ))
1813}
1814
1815fn read_u64_le_bytes(
1816    bytes: &[u8],
1817    offset: usize,
1818    field: &'static str,
1819) -> Result<u64, ZhArtifactPayloadError> {
1820    let end = offset
1821        .checked_add(8)
1822        .ok_or(ZhArtifactPayloadError::TruncatedIndexed { field })?;
1823    let chunk = bytes
1824        .get(offset..end)
1825        .ok_or(ZhArtifactPayloadError::TruncatedIndexed { field })?;
1826    Ok(u64::from_le_bytes(
1827        chunk.try_into().expect("slice length is 8"),
1828    ))
1829}
1830
1831fn checked_indexed_usize(field: &'static str, len: u64) -> Result<usize, ZhArtifactPayloadError> {
1832    usize::try_from(len).map_err(|_| ZhArtifactPayloadError::IndexedSectionTooLarge { field, len })
1833}
1834
1835fn check_payload_file_size(path: &Path) -> Result<(), ZhArtifactPayloadError> {
1836    let len = std::fs::metadata(path)?.len();
1837    if len > MAX_ARTIFACT_PAYLOAD_BYTES {
1838        return Err(ZhArtifactPayloadError::ArtifactLimitExceeded {
1839            field: "payload_bytes",
1840            len,
1841            max: MAX_ARTIFACT_PAYLOAD_BYTES,
1842        });
1843    }
1844    Ok(())
1845}
1846
1847fn check_limit(field: &'static str, len: usize, max: usize) -> Result<(), ZhArtifactPayloadError> {
1848    if len > max {
1849        return Err(ZhArtifactPayloadError::ArtifactLimitExceeded {
1850            field,
1851            len: len as u64,
1852            max: max as u64,
1853        });
1854    }
1855    Ok(())
1856}
1857
1858fn write_indexed_reading_block(
1859    writer: &mut Vec<u8>,
1860    readings: &[String],
1861) -> Result<(), ZhArtifactPayloadError> {
1862    write_u32_len(writer, "reading_count", readings.len())?;
1863    for reading in readings {
1864        write_binary_string(writer, "reading", reading)?;
1865    }
1866    Ok(())
1867}
1868
1869impl IndexedZhPayload {
1870    fn validate(&self, pinyin_view: PinyinView) -> Result<(), ZhArtifactPayloadError> {
1871        let mut stream = self.map.stream();
1872        while let Some((surface, offset)) = stream.next() {
1873            let surface = String::from_utf8(surface.to_vec()).map_err(|source| {
1874                ZhArtifactPayloadError::InvalidIndexedUtf8 {
1875                    field: "surface",
1876                    source,
1877                }
1878            })?;
1879            if surface.is_empty() {
1880                return Err(ZhArtifactPayloadError::EmptySurface { entry_index: 0 });
1881            }
1882            let readings = self.readings_at(offset)?;
1883            if readings.is_empty() {
1884                return Err(ZhArtifactPayloadError::EmptyReadings { surface });
1885            }
1886            let mut seen = BTreeSet::new();
1887            for (reading_index, reading) in readings.iter().enumerate() {
1888                if reading.is_empty() {
1889                    return Err(ZhArtifactPayloadError::EmptyReading {
1890                        surface: surface.clone(),
1891                        reading_index,
1892                    });
1893                }
1894                let normalized = normalize_artifact_reading(reading, pinyin_view);
1895                if normalized != *reading {
1896                    return Err(ZhArtifactPayloadError::ReadingNotNormalized {
1897                        surface: surface.clone(),
1898                        reading: reading.clone(),
1899                        normalized,
1900                    });
1901                }
1902                if !seen.insert(reading) {
1903                    return Err(ZhArtifactPayloadError::DuplicateReading {
1904                        surface: surface.clone(),
1905                        reading: reading.clone(),
1906                    });
1907                }
1908            }
1909        }
1910        Ok(())
1911    }
1912
1913    fn readings(&self, surface: &str) -> Result<Option<Vec<String>>, ZhArtifactPayloadError> {
1914        self.map
1915            .get(surface)
1916            .map(|offset| self.readings_at(offset))
1917            .transpose()
1918    }
1919
1920    fn entries(&self) -> Result<Vec<ZhReadingIndexPayloadEntry>, ZhArtifactPayloadError> {
1921        let mut entries = Vec::with_capacity(self.entries);
1922        let mut stream = self.map.stream();
1923        while let Some((surface, offset)) = stream.next() {
1924            let surface = String::from_utf8(surface.to_vec()).map_err(|source| {
1925                ZhArtifactPayloadError::InvalidIndexedUtf8 {
1926                    field: "surface",
1927                    source,
1928                }
1929            })?;
1930            let readings = self.readings_at(offset)?;
1931            entries.push(ZhReadingIndexPayloadEntry { surface, readings });
1932        }
1933        Ok(entries)
1934    }
1935
1936    fn readings_at(&self, offset: u64) -> Result<Vec<String>, ZhArtifactPayloadError> {
1937        read_indexed_readings_at_bytes(&self.mmap, self.readings_start, offset)
1938    }
1939}
1940
1941fn read_indexed_readings_at_bytes(
1942    bytes: &[u8],
1943    readings_start: usize,
1944    offset: u64,
1945) -> Result<Vec<String>, ZhArtifactPayloadError> {
1946    let offset = usize::try_from(offset)
1947        .map_err(|_| ZhArtifactPayloadError::InvalidIndexedOffset { offset })?;
1948    let start =
1949        readings_start
1950            .checked_add(offset)
1951            .ok_or(ZhArtifactPayloadError::InvalidIndexedOffset {
1952                offset: offset as u64,
1953            })?;
1954    if start >= bytes.len() {
1955        return Err(ZhArtifactPayloadError::InvalidIndexedOffset {
1956            offset: offset as u64,
1957        });
1958    }
1959    let mut cursor = start;
1960    let reading_count = read_u32_le_bytes(bytes, cursor, "reading_count")? as usize;
1961    check_limit(
1962        "reading_count",
1963        reading_count,
1964        MAX_ARTIFACT_READINGS_PER_ENTRY,
1965    )?;
1966    cursor += 4;
1967    let mut readings = Vec::with_capacity(reading_count);
1968    for _ in 0..reading_count {
1969        let len = read_u32_le_bytes(bytes, cursor, "reading_len")? as usize;
1970        check_limit("reading_bytes", len, MAX_ARTIFACT_STRING_BYTES)?;
1971        cursor += 4;
1972        let end = cursor
1973            .checked_add(len)
1974            .ok_or(ZhArtifactPayloadError::TruncatedIndexed { field: "reading" })?;
1975        let reading_bytes = bytes
1976            .get(cursor..end)
1977            .ok_or(ZhArtifactPayloadError::TruncatedIndexed { field: "reading" })?;
1978        let reading = String::from_utf8(reading_bytes.to_vec()).map_err(|source| {
1979            ZhArtifactPayloadError::InvalidIndexedUtf8 {
1980                field: "reading",
1981                source,
1982            }
1983        })?;
1984        readings.push(reading);
1985        cursor = end;
1986    }
1987    Ok(readings)
1988}
1989
/// Reports whether `surface` can be used directly as a pinyin path: it must be
/// non-empty and consist solely of ASCII.
fn can_build_direct_pinyin_path(surface: &str) -> bool {
    match surface.as_bytes() {
        [] => false,
        bytes => bytes.is_ascii(),
    }
}
1993
1994/// Computes the SHA-256 file digest string for a Chinese artifact payload file.
1995pub fn artifact_file_digest_path(path: impl AsRef<Path>) -> Result<String, std::io::Error> {
1996    let file = File::open(path)?;
1997    artifact_file_digest_reader(file)
1998}
1999
2000/// Computes the SHA-256 file digest string from a reader.
2001pub fn artifact_file_digest_reader(mut reader: impl Read) -> Result<String, std::io::Error> {
2002    let mut hasher = Sha256::new();
2003    let mut buffer = [0_u8; 64 * 1024];
2004    loop {
2005        let read = reader.read(&mut buffer)?;
2006        if read == 0 {
2007            break;
2008        }
2009        hasher.update(&buffer[..read]);
2010    }
2011    Ok(sha256_digest_hex(hasher.finalize()))
2012}
2013
2014fn validate_artifact_payload_header(
2015    payload: &ZhReadingIndexPayload,
2016) -> Result<(), ZhArtifactPayloadError> {
2017    if payload.schema_version != ARTIFACT_PAYLOAD_SCHEMA_VERSION {
2018        return Err(ZhArtifactPayloadError::UnsupportedSchemaVersion {
2019            version: payload.schema_version,
2020        });
2021    }
2022    if payload.payload_type != ARTIFACT_PAYLOAD_TYPE {
2023        return Err(ZhArtifactPayloadError::UnsupportedPayloadType {
2024            payload_type: payload.payload_type.clone(),
2025        });
2026    }
2027    Ok(())
2028}
2029
2030fn canonical_payload_bytes(payload: &ZhReadingIndexPayload) -> Vec<u8> {
2031    let mut bytes = Vec::new();
2032    bytes.extend_from_slice(b"moine.zh.reading-index.surface-readings/v1\n");
2033    push_len_prefixed(&mut bytes, b"V", &payload.pinyin_view);
2034    for entry in &payload.entries {
2035        push_len_prefixed(&mut bytes, b"S", &entry.surface);
2036        bytes.extend_from_slice(format!("R{}\n", entry.readings.len()).as_bytes());
2037        for reading in &entry.readings {
2038            push_len_prefixed(&mut bytes, b"r", reading);
2039        }
2040    }
2041    bytes
2042}
2043
/// Appends one `<tag><decimal byte length>\n<value>\n` record to `bytes`.
fn push_len_prefixed(bytes: &mut Vec<u8>, tag: &[u8], value: &str) {
    let length = value.len().to_string();
    let parts: [&[u8]; 5] = [tag, length.as_bytes(), b"\n", value.as_bytes(), b"\n"];
    for part in parts.iter() {
        bytes.extend_from_slice(part);
    }
}
2051
2052fn sha256_hex(bytes: &[u8]) -> String {
2053    sha256_digest_hex(Sha256::digest(bytes))
2054}
2055
/// Renders digest bytes as lowercase hexadecimal, two characters per byte.
///
/// The output buffer is pre-sized for 64 characters, but any number of bytes
/// is accepted.
fn sha256_digest_hex(digest: impl IntoIterator<Item = u8>) -> String {
    digest
        .into_iter()
        .fold(String::with_capacity(64), |mut hex, byte| {
            write!(hex, "{byte:02x}").expect("writing to String should not fail");
            hex
        })
}
2063
/// Borrowed fields of one parsed CC-CEDICT line.
///
/// Every field is a slice of the source line, so an entry cannot outlive it.
struct CedictEntry<'a> {
    /// First whitespace-delimited token of the line.
    traditional: &'a str,
    /// Second whitespace-delimited token of the line.
    simplified: &'a str,
    /// Text between the pinyin brackets, not yet normalized.
    pinyin: &'a str,
}
2069
2070fn parse_cedict_entry(line: &str, line_number: usize) -> Result<CedictEntry<'_>, CedictError> {
2071    let (traditional, rest) = take_token(line)
2072        .ok_or_else(|| invalid_entry(line_number, "missing traditional surface"))?;
2073    let (simplified, rest) = take_token(rest.trim_start())
2074        .ok_or_else(|| invalid_entry(line_number, "missing simplified surface"))?;
2075    let rest = rest.trim_start();
2076
2077    let (pinyin, rest) = if let Some(after_open) = rest.strip_prefix("[[") {
2078        let Some(end) = after_open.find("]]") else {
2079            return Err(invalid_entry(line_number, "missing closing ]] for pinyin"));
2080        };
2081        (&after_open[..end], &after_open[end + 2..])
2082    } else if let Some(after_open) = rest.strip_prefix('[') {
2083        let Some(end) = after_open.find(']') else {
2084            return Err(invalid_entry(line_number, "missing closing ] for pinyin"));
2085        };
2086        (&after_open[..end], &after_open[end + 1..])
2087    } else {
2088        return Err(invalid_entry(line_number, "missing pinyin bracket"));
2089    };
2090
2091    if pinyin.is_empty() {
2092        return Err(invalid_entry(line_number, "empty pinyin field"));
2093    }
2094    if !rest.trim_start().starts_with('/') {
2095        return Err(invalid_entry(line_number, "missing definition slash"));
2096    }
2097
2098    Ok(CedictEntry {
2099        traditional,
2100        simplified,
2101        pinyin,
2102    })
2103}
2104
2105fn invalid_entry(line: usize, message: impl Into<String>) -> CedictError {
2106    CedictError::InvalidEntry {
2107        line,
2108        message: message.into(),
2109    }
2110}
2111
/// Splits the first whitespace-delimited token off `input`.
///
/// Leading whitespace is skipped. Returns the token and everything after it
/// (starting with the whitespace that ended the token, if any), or `None` when
/// `input` is empty or whitespace only.
fn take_token(input: &str) -> Option<(&str, &str)> {
    let trimmed = input.trim_start();
    if trimmed.is_empty() {
        return None;
    }
    let token_end = trimmed
        .find(char::is_whitespace)
        .unwrap_or(trimmed.len());
    Some(trimmed.split_at(token_end))
}
2124
2125#[cfg(test)]
2126mod tests {
2127    use super::*;
2128
2129    #[test]
2130    fn normalizes_pinyin_views() {
2131        assert_eq!(
2132            normalize_pinyin("Wei1 shi4 ji4", PinyinView::NoTone),
2133            "weishiji"
2134        );
2135        assert_eq!(
2136            normalize_pinyin("Wei1 shi4 ji4", PinyinView::Tone3),
2137            "wei1shi4ji4"
2138        );
2139        assert_eq!(normalize_pinyin("nu:3 er2", PinyinView::NoTone), "nver");
2140        assert_eq!(normalize_pinyin("nu:3 er2", PinyinView::Tone3), "nv3er2");
2141        assert_eq!(normalize_pinyin("hua1 r5", PinyinView::NoTone), "huar");
2142        assert_eq!(normalize_pinyin("11 Qu1", PinyinView::NoTone), "11qu");
2143        assert_eq!(normalize_pinyin("Shuang1 11", PinyinView::NoTone), "shuang");
2144        assert_eq!(
2145            normalize_pinyin("D N A jian4 ding4", PinyinView::NoTone),
2146            "dnajianding"
2147        );
2148    }
2149
2150    #[test]
2151    fn builds_no_tone_index_from_cedict() {
2152        let cedict = "\
2153# CC-CEDICT
2154威士忌 威士忌 [Wei1 shi4 ji4] /whisky/
2155布納哈本 布纳哈本 [Bu4 na4 ha1 ben3] /Bunnahabhain/
2156女兒 女儿 [nu:3 er2] /daughter/
2157";
2158        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
2159
2160        assert_eq!(index.pinyin_view(), PinyinView::NoTone);
2161        assert_eq!(
2162            index.readings("威士忌").as_deref(),
2163            Some(&["weishiji".to_string()][..])
2164        );
2165        assert_eq!(
2166            index.readings("布纳哈本").as_deref(),
2167            Some(&["bunahaben".to_string()][..])
2168        );
2169        assert_eq!(
2170            index.readings("女儿").as_deref(),
2171            Some(&["nver".to_string()][..])
2172        );
2173    }
2174
2175    #[test]
2176    fn builds_tone3_index_when_requested() {
2177        let cedict = "威士忌 威士忌 [Wei1 shi4 ji4] /whisky/\n";
2178        let index = CedictReadingIndex::from_cedict_reader_with_options(
2179            cedict.as_bytes(),
2180            CedictIndexOptions {
2181                pinyin_view: PinyinView::Tone3,
2182                ..CedictIndexOptions::default()
2183            },
2184        )
2185        .unwrap();
2186
2187        assert_eq!(index.pinyin_view(), PinyinView::Tone3);
2188        assert_eq!(
2189            index.readings("威士忌").as_deref(),
2190            Some(&["wei1shi4ji4".to_string()][..])
2191        );
2192    }
2193
2194    #[test]
2195    fn deduplicates_after_normalization() {
2196        let cedict = "\
2197樂 乐 [Le4] /surname Le/
2198樂 乐 [le4] /happy/
2199樂 乐 [Yue4] /surname Yue/
2200";
2201        let no_tone = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
2202        let tone3 = CedictReadingIndex::from_cedict_reader_with_options(
2203            cedict.as_bytes(),
2204            CedictIndexOptions {
2205                pinyin_view: PinyinView::Tone3,
2206                ..CedictIndexOptions::default()
2207            },
2208        )
2209        .unwrap();
2210
2211        assert_eq!(
2212            no_tone.readings("乐").as_deref(),
2213            Some(&["le".to_string(), "yue".to_string()][..])
2214        );
2215        assert_eq!(
2216            tone3.readings("乐").as_deref(),
2217            Some(&["le4".to_string(), "yue4".to_string()][..])
2218        );
2219    }
2220
2221    #[test]
2222    fn rejects_malformed_entries() {
2223        let err = CedictReadingIndex::from_cedict_reader(
2224            "威士忌 威士忌 Wei1 shi4 ji4 /whisky/\n".as_bytes(),
2225        )
2226        .unwrap_err();
2227
2228        assert!(matches!(err, CedictError::InvalidEntry { line: 1, .. }));
2229    }
2230
2231    #[test]
2232    fn computes_dictionary_paths_and_stats() {
2233        let cedict = "\
2234威 威 [wei1] /power/
2235士忌 士忌 [shi4 ji4] /whisky transcription tail/
2236威士忌 威士忌 [Wei1 shi4 ji4] /whisky/
2237";
2238        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
2239        let expansion = index.reading_paths_with_stats(
2240            "威士忌",
2241            PinyinReadingOptions {
2242                longest_match_only: true,
2243                ..PinyinReadingOptions::default()
2244            },
2245        );
2246
2247        assert_eq!(expansion.paths.len(), 1);
2248        assert_eq!(expansion.paths[0].joined_reading, "weishiji");
2249        assert_eq!(
2250            expansion.paths[0].segments,
2251            vec![PinyinReadingSegment {
2252                surface: "威士忌".to_string(),
2253                reading: "weishiji".to_string(),
2254            }]
2255        );
2256        assert_eq!(expansion.stats.longest_match_pruned_spans, 1);
2257    }
2258
2259    #[test]
2260    fn hybrid_paths_allow_ascii_prefix_and_dictionary_tail() {
2261        let cedict = "忌 忌 [ji4] /whisky transcription character/\n";
2262        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
2263        let paths = index.hybrid_reading_paths("weishi忌", PinyinReadingOptions::default());
2264
2265        assert_eq!(paths.len(), 1);
2266        assert_eq!(paths[0].joined_reading, "weishiji");
2267    }
2268
2269    #[test]
2270    fn compare_matches_pinyin_input_to_chinese_surface() {
2271        let cedict = "威士忌 威士忌 [Wei1 shi4 ji4] /whisky/\n";
2272        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
2273        let distances = compare_with_cedict_index(
2274            "weishiji",
2275            "威士忌",
2276            &index,
2277            PinyinReadingOptions::default(),
2278        )
2279        .unwrap();
2280
2281        assert_eq!(distances.lattice, 0);
2282        assert_eq!(distances.lattice_damerau, 0);
2283        assert!(distances.surface_damerau > distances.lattice);
2284    }
2285
2286    #[test]
2287    fn lattice_damerau_counts_adjacent_pinyin_transposition() {
2288        let distances = compare_with_cedict_index(
2289            "weishiji",
2290            "wieshiji",
2291            &CedictReadingIndex::default(),
2292            PinyinReadingOptions::default(),
2293        )
2294        .unwrap();
2295
2296        assert_eq!(distances.lattice, 2);
2297        assert_eq!(distances.lattice_damerau, 1);
2298    }
2299
2300    #[test]
2301    fn normalized_similarity_matches_pinyin_input_to_chinese_surface() {
2302        let cedict = "威士忌 威士忌 [Wei1 shi4 ji4] /whisky/\n";
2303        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
2304        let similarity = normalized_similarity_with_zh_index(
2305            "weishiji",
2306            "威士忌",
2307            &index,
2308            PinyinReadingOptions::default(),
2309        )
2310        .unwrap();
2311
2312        assert_eq!(similarity, 1.0);
2313    }
2314
2315    #[test]
2316    fn emits_and_loads_artifact_payload() {
2317        let cedict = "\
2318威士忌 威士忌 [Wei1 shi4 ji4] /whisky/
2319布納哈本 布纳哈本 [Bu4 na4 ha1 ben3] /Bunnahabhain/
2320";
2321        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
2322        let payload = index.artifact_payload();
2323        let loaded = ZhReadingIndex::from_artifact_payload(payload).unwrap();
2324
2325        assert_eq!(loaded.pinyin_view(), PinyinView::NoTone);
2326        assert_eq!(
2327            loaded.readings("威士忌").as_deref(),
2328            Some(&["weishiji".to_string()][..])
2329        );
2330        assert_eq!(
2331            loaded.readings("布纳哈本").as_deref(),
2332            Some(&["bunahaben".to_string()][..])
2333        );
2334        assert_eq!(
2335            loaded.artifact_payload_checksum(),
2336            index.artifact_payload_checksum()
2337        );
2338    }
2339
2340    #[test]
2341    fn indexed_artifact_payload_round_trips_and_supports_lookup() {
2342        let cedict = "\
2343威士忌 威士忌 [Wei1 shi4 ji4] /whisky/
2344布納哈本 布纳哈本 [Bu4 na4 ha1 ben3] /Bunnahabhain/
2345";
2346        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
2347        let mut bytes = Vec::new();
2348        index.write_indexed_artifact_payload(&mut bytes).unwrap();
2349        let path = std::env::temp_dir().join(format!(
2350            "moine-zh-indexed-test-{}-{}.moineidx",
2351            std::process::id(),
2352            std::time::SystemTime::now()
2353                .duration_since(std::time::UNIX_EPOCH)
2354                .unwrap()
2355                .as_nanos()
2356        ));
2357        std::fs::write(&path, &bytes).unwrap();
2358        let loaded = ZhReadingIndex::from_indexed_artifact_payload_path(&path).unwrap();
2359        std::fs::remove_file(&path).unwrap();
2360        let loaded_from_bytes = ZhReadingIndex::from_indexed_artifact_payload_bytes(&bytes)
2361            .expect("indexed payload bytes should load");
2362
2363        assert_eq!(loaded.pinyin_view(), PinyinView::NoTone);
2364        assert_eq!(
2365            loaded.readings("威士忌").as_deref(),
2366            Some(&["weishiji".to_string()][..])
2367        );
2368        assert_eq!(
2369            loaded_from_bytes.artifact_payload(),
2370            index.artifact_payload()
2371        );
2372        assert_eq!(
2373            loaded.readings("布纳哈本").as_deref(),
2374            Some(&["bunahaben".to_string()][..])
2375        );
2376        assert_eq!(
2377            loaded.artifact_payload_checksum(),
2378            index.artifact_payload_checksum()
2379        );
2380    }
2381
2382    #[test]
2383    fn artifact_metadata_records_build_and_license() {
2384        let cedict = "威士忌 威士忌 [Wei1 shi4 ji4] /whisky/\n";
2385        let options = CedictIndexOptions {
2386            pinyin_view: PinyinView::Tone3,
2387            max_readings_per_surface: Some(4),
2388        };
2389        let index = CedictReadingIndex::from_cedict_reader_with_options(cedict.as_bytes(), options)
2390            .unwrap();
2391        let metadata = index.artifact_metadata(ZhArtifactMetadataOptions {
2392            artifact_name: "moine-cedict-test".to_string(),
2393            generator: "test".to_string(),
2394            payload_file_name: "payload.yaml".to_string(),
2395            payload_format: "yaml.surface-readings.v1".to_string(),
2396            source_name: "CC-CEDICT".to_string(),
2397            source_version: "2026-05-20".to_string(),
2398            source_cedict: "cedict.txt".to_string(),
2399            index_options: options,
2400            query_defaults: PinyinReadingOptions {
2401                longest_match_only: true,
2402                ..PinyinReadingOptions::default()
2403            },
2404            license: ZhArtifactLicense::default(),
2405        });
2406
2407        assert_eq!(metadata.artifact_type, "moine.zh.reading-index");
2408        assert_eq!(metadata.build.pinyin_view, "tone3");
2409        assert_eq!(metadata.build.max_readings_per_surface, Some(4));
2410        assert!(metadata.query_defaults.longest_match_only);
2411        assert_eq!(metadata.license.selected_license, "CC BY-SA 4.0");
2412    }
2413
2414    #[test]
2415    fn rejects_duplicate_artifact_surface() {
2416        let payload = ZhReadingIndexPayload {
2417            schema_version: 1,
2418            payload_type: "moine.zh.reading-index.surface-readings".to_string(),
2419            pinyin_view: "no-tone".to_string(),
2420            entries: vec![
2421                ZhReadingIndexPayloadEntry {
2422                    surface: "威士忌".to_string(),
2423                    readings: vec!["weishiji".to_string()],
2424                },
2425                ZhReadingIndexPayloadEntry {
2426                    surface: "威士忌".to_string(),
2427                    readings: vec!["weishiji".to_string()],
2428                },
2429            ],
2430        };
2431        let err = ZhReadingIndex::from_artifact_payload(payload).unwrap_err();
2432
2433        assert!(matches!(
2434            err,
2435            ZhArtifactPayloadError::DuplicateSurface { .. }
2436        ));
2437    }
2438
// An entry carrying one reading more than `MAX_ARTIFACT_READINGS_PER_ENTRY`
// must fail with `ArtifactLimitExceeded` reported on the `reading_count` field.
#[test]
fn rejects_artifact_payload_excessive_reading_count() {
    let over_limit = MAX_ARTIFACT_READINGS_PER_ENTRY + 1;
    let oversized_entry = ZhReadingIndexPayloadEntry {
        surface: String::from("威士忌"),
        readings: vec![String::from("weishiji"); over_limit],
    };
    let oversized = ZhReadingIndexPayload {
        schema_version: 1,
        payload_type: String::from("moine.zh.reading-index.surface-readings"),
        pinyin_view: String::from("no-tone"),
        entries: vec![oversized_entry],
    };

    let hit_reading_limit = matches!(
        ZhReadingIndex::from_artifact_payload(oversized).unwrap_err(),
        ZhArtifactPayloadError::ArtifactLimitExceeded {
            field: "reading_count",
            ..
        }
    );

    assert!(hit_reading_limit);
}
2460
// The reading "Wei1shi4ji4" inside a "no-tone" payload must be rejected with
// `ZhArtifactPayloadError::ReadingNotNormalized`.
#[test]
fn rejects_non_normalized_artifact_reading() {
    let raw_reading = "Wei1shi4ji4".to_owned();
    let entry = ZhReadingIndexPayloadEntry {
        surface: "威士忌".to_owned(),
        readings: vec![raw_reading],
    };
    let artifact = ZhReadingIndexPayload {
        schema_version: 1,
        payload_type: "moine.zh.reading-index.surface-readings".to_owned(),
        pinyin_view: "no-tone".to_owned(),
        entries: vec![entry],
    };

    let rejection = ZhReadingIndex::from_artifact_payload(artifact).unwrap_err();
    let flagged_as_unnormalized = matches!(
        rejection,
        ZhArtifactPayloadError::ReadingNotNormalized { .. }
    );

    assert!(flagged_as_unnormalized);
}
2479
// A lower-case reading that still carries digits after its letters
// ("wei1shi4ji4") must not be accepted by a "no-tone" payload; the expected
// error is `ZhArtifactPayloadError::ReadingNotNormalized`.
#[test]
fn no_tone_artifact_rejects_tone_digits_after_letters() {
    let readings_with_digits = vec![String::from("wei1shi4ji4")];
    let artifact = ZhReadingIndexPayload {
        schema_version: 1,
        payload_type: String::from("moine.zh.reading-index.surface-readings"),
        pinyin_view: String::from("no-tone"),
        entries: vec![ZhReadingIndexPayloadEntry {
            surface: String::from("威士忌"),
            readings: readings_with_digits,
        }],
    };

    let load_result = ZhReadingIndex::from_artifact_payload(artifact);
    let failure = load_result.unwrap_err();

    assert!(matches!(
        failure,
        ZhArtifactPayloadError::ReadingNotNormalized { .. }
    ));
}
2498
// A reading that begins with digits ("11qu" for "11区") must load successfully
// from a "no-tone" payload and be returned unchanged by `readings`.
#[test]
fn artifact_validation_keeps_numeric_tokens_in_no_tone_view() {
    let numeric_entry = ZhReadingIndexPayloadEntry {
        surface: String::from("11区"),
        readings: vec![String::from("11qu")],
    };
    let artifact = ZhReadingIndexPayload {
        schema_version: 1,
        payload_type: String::from("moine.zh.reading-index.surface-readings"),
        pinyin_view: String::from("no-tone"),
        entries: vec![numeric_entry],
    };
    let index = ZhReadingIndex::from_artifact_payload(artifact).unwrap();

    // Compare as slices so the lookup's owning container type does not matter.
    let expected = [String::from("11qu")];
    let found = index.readings("11区");

    assert_eq!(found.as_deref(), Some(&expected[..]));
}
2517
// With the `Tone3` pinyin view, an index built from two CC-CEDICT lines for
// "重" must match the tone-numbered query "zhong4" at lattice distance 0.
#[test]
fn tone3_view_preserves_tone_digits() {
    let source = "重 重 [chong2] /again/\n重 重 [zhong4] /heavy/\n";
    let tone3_options = CedictIndexOptions {
        pinyin_view: PinyinView::Tone3,
        ..CedictIndexOptions::default()
    };
    let build_result =
        CedictReadingIndex::from_cedict_reader_with_options(source.as_bytes(), tone3_options);
    let index = build_result.unwrap();

    let query_options = PinyinReadingOptions::default();
    let comparison = compare_with_cedict_index("zhong4", "重", &index, query_options);
    let lattice_distance = comparison.unwrap().lattice;

    assert_eq!(lattice_distance, 0);
}
2535
// Looking up "印" against a default-constructed index (presumably empty —
// confirm) must fail with `CnLatticeError::UnsupportedDirectInput`.
#[test]
fn unknown_han_without_dictionary_path_is_rejected() {
    let default_index = CedictReadingIndex::default();
    let query_options = PinyinReadingOptions::default();

    let outcome = cedict_or_direct_lattice("印", &default_index, query_options);
    let is_unsupported = matches!(
        outcome.unwrap_err(),
        CnLatticeError::UnsupportedDirectInput { .. }
    );

    assert!(is_unsupported);
}
2544}