1#![deny(missing_docs)]
40
41use std::borrow::Cow;
42use std::collections::{btree_map::Entry, BTreeMap, BTreeSet, HashMap};
43use std::error::Error;
44use std::fmt;
45use std::fmt::Write as _;
46use std::fs::File;
47use std::io::{BufRead, BufReader, Read, Write};
48use std::path::Path;
49use std::string::FromUtf8Error;
50use std::sync::Arc;
51
52use fst::{Map, MapBuilder, Streamer};
53use memmap2::Mmap;
54use moine_core::{
55 damerau_distance, damerau_levenshtein_str, distance, levenshtein_str,
56 normalized_similarity_str, Lattice,
57};
58use serde::{Deserialize, Serialize};
59use sha2::{Digest, Sha256};
60
/// Schema version stamped on reading-index payloads produced by this module.
const ARTIFACT_PAYLOAD_SCHEMA_VERSION: u32 = 1;
/// Payload type label stamped on reading-index payloads produced by this module.
const ARTIFACT_PAYLOAD_TYPE: &str = "moine.zh.reading-index.surface-readings";
/// Magic bytes written at the very start of an indexed (binary) artifact.
const INDEXED_ARTIFACT_MAGIC: &[u8; 8] = b"MOINEZ01";
/// Format version written right after the magic bytes of an indexed artifact.
const INDEXED_ARTIFACT_VERSION: u32 = 1;
/// Size in bytes of the indexed artifact header; the FST section starts at this offset.
const INDEXED_ARTIFACT_HEADER_LEN: usize = 40;
/// Upper bound (512 MiB) on the size of an artifact payload accepted for loading.
const MAX_ARTIFACT_PAYLOAD_BYTES: u64 = 512 * 1024 * 1024;
/// Upper bound on the number of surface entries accepted from a payload.
const MAX_ARTIFACT_ENTRIES: usize = 2_000_000;
/// Upper bound on the number of readings accepted for a single surface.
const MAX_ARTIFACT_READINGS_PER_ENTRY: usize = 256;
/// Upper bound (16 KiB) on the byte length of a single surface or reading string.
const MAX_ARTIFACT_STRING_BYTES: usize = 16 * 1024;
/// Name of the checksum algorithm recorded in artifact metadata: SHA-256 over the
/// canonical byte encoding of the payload.
pub const ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM: &str = "sha256-canonical-v1";
/// Name of the file digest algorithm label for payload files.
// NOTE(review): presumably SHA-256 over the raw payload file bytes — confirm against the
// code that fills `ZhArtifactPayload::file_digest`.
pub const ARTIFACT_PAYLOAD_FILE_DIGEST_ALGORITHM: &str = "sha256-file-v1";
74
/// Pinyin normalization view applied to readings stored in an index.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum PinyinView {
    /// View labelled `"no-tone"`; this is the default.
    #[default]
    NoTone,
    /// View labelled `"tone3"`.
    // NOTE(review): presumably numeric-tone pinyin — confirm against `normalize_pinyin`.
    Tone3,
}
84
/// Options controlling how a CC-CEDICT source is turned into a reading index.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CedictIndexOptions {
    /// Pinyin view used to normalize every reading while indexing.
    pub pinyin_view: PinyinView,
    /// When set, each surface keeps at most this many readings (taken in sorted order).
    pub max_readings_per_surface: Option<usize>,
}
93
/// Options controlling how text is expanded into pinyin reading paths.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct PinyinReadingOptions {
    /// Maximum number of characters in one matched span; `0` yields an empty expansion.
    pub max_span_chars: usize,
    /// Maximum number of distinct joined readings kept per start position; `0` yields an
    /// empty expansion.
    pub max_paths: usize,
    /// When `true`, only the longest matching span at each start position is expanded.
    pub longest_match_only: bool,
    /// Optional limit handed to the per-segment reading limiter.
    // NOTE(review): presumably caps the readings used per segment — confirm against
    // `limited_surface_readings`.
    pub max_readings_per_segment: Option<usize>,
}
106
/// One segment of a reading path: a slice of the input text and the reading chosen for it.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct PinyinReadingSegment {
    /// The span of input text covered by this segment.
    pub surface: String,
    /// The reading selected for `surface`.
    pub reading: String,
}
115
/// A complete segmentation of the input text together with its concatenated reading.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct PinyinReadingPath {
    /// Segments in text order.
    pub segments: Vec<PinyinReadingSegment>,
    /// The segment readings concatenated in order, without any separator.
    pub joined_reading: String,
}
124
/// Result of expanding text into reading paths, with counters describing the work done.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct PinyinReadingExpansion {
    /// The reading paths that were produced.
    pub paths: Vec<PinyinReadingPath>,
    /// Counters collected while producing `paths`.
    pub stats: PinyinReadingStats,
}
133
/// Counters collected while expanding text into reading paths.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct PinyinReadingStats {
    /// Spans whose surface has readings and whose remaining suffix has at least one path.
    pub matched_spans: usize,
    /// Spans handled through the direct (non-dictionary) fallback.
    pub direct_fallback_spans: usize,
    /// Matching spans dropped because only the longest match was kept.
    pub longest_match_pruned_spans: usize,
    /// Total readings found for expanded spans before any per-segment limit.
    pub raw_segment_readings: usize,
    /// Total readings actually used after the per-segment limit.
    pub used_segment_readings: usize,
    /// Readings discarded by the per-segment limit (`raw - used`).
    pub pruned_segment_readings: usize,
    /// Number of (segment reading, suffix path) combinations examined.
    pub candidate_combinations: usize,
    /// Combinations that produced a joined reading not seen before at that start position.
    pub unique_paths: usize,
    /// Combinations whose joined reading was already present at that start position.
    pub duplicate_joined_readings: usize,
    /// Number of times the `max_paths` cap was reached.
    pub max_paths_hit_count: usize,
}
158
/// Bundle of distance scores between two Chinese strings.
// NOTE(review): the code computing these values is outside this view; the field notes
// below are inferred from the field names only — confirm against the producer.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ChineseDistance {
    /// Presumably the Levenshtein distance between the surface strings.
    pub surface_levenshtein: usize,
    /// Presumably the Damerau-Levenshtein distance between the surface strings.
    pub surface_damerau: usize,
    /// Presumably the distance computed over pinyin lattices.
    pub lattice: usize,
    /// Presumably the Damerau variant of the lattice distance.
    pub lattice_damerau: usize,
    /// Presumably a score combining the individual distances — TODO confirm the formula.
    pub combined: usize,
}
177
/// Alias for [`CedictReadingIndex`].
pub type ZhReadingIndex = CedictReadingIndex;
180
/// Index mapping a surface string to its pinyin readings, normalized for one pinyin view.
#[derive(Clone, Debug)]
pub struct CedictReadingIndex {
    // Where the surface -> readings data lives (in-memory map or indexed artifact).
    storage: ZhReadingStorage,
    // View every stored reading was normalized with.
    pinyin_view: PinyinView,
}
187
/// Backing storage of a [`CedictReadingIndex`].
#[derive(Clone, Debug)]
enum ZhReadingStorage {
    /// All entries held in memory as a surface -> readings map.
    Eager(HashMap<String, Vec<String>>),
    /// Entries looked up on demand in an indexed artifact payload.
    Indexed(IndexedZhPayload),
}
193
194impl Default for CedictReadingIndex {
195 fn default() -> Self {
196 Self {
197 storage: ZhReadingStorage::Eager(HashMap::new()),
198 pinyin_view: PinyinView::default(),
199 }
200 }
201}
202
203impl PartialEq for CedictReadingIndex {
204 fn eq(&self, other: &Self) -> bool {
205 self.pinyin_view == other.pinyin_view && self.artifact_payload() == other.artifact_payload()
206 }
207}
208
// Marker impl: the manual `PartialEq` above compares `Eq` data only, so equality is total.
impl Eq for CedictReadingIndex {}
210
/// Indexed artifact payload backed by a memory-mapped file.
#[derive(Clone, Debug)]
struct IndexedZhPayload {
    // The mapped artifact file, shared between clones.
    mmap: Arc<Mmap>,
    // FST built from a copy of the artifact's FST section; maps surface bytes to a `u64`
    // offset value.
    map: Map<Vec<u8>>,
    // Byte offset in `mmap` where the readings section begins (end of the FST section).
    readings_start: usize,
    // Entry count taken from the artifact header.
    entries: usize,
}
218
/// Decoded header of an indexed artifact payload.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct ZhIndexedArtifactPayloadHeader {
    /// Indexed artifact format version.
    pub version: u32,
    /// Pinyin view the stored readings were normalized with.
    pub pinyin_view: PinyinView,
    /// Number of surface entries; must match the FST entry count.
    pub entries: usize,
    /// Length in bytes of the FST section that follows the header.
    pub fst_len: usize,
    /// Length in bytes of the readings section that follows the FST section.
    pub readings_len: usize,
}
233
/// Serializable metadata document describing a zh reading-index artifact.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ZhArtifactMetadata {
    /// Metadata schema version.
    pub schema_version: u32,
    /// Artifact type label (e.g. `"moine.zh.reading-index"`).
    pub artifact_type: String,
    /// Name of the artifact.
    pub artifact_name: String,
    /// Identifier of the tool that generated the artifact.
    pub generator: String,
    /// Location, format and checksums of the payload.
    pub payload: ZhArtifactPayload,
    /// Description of the source data.
    pub source: ZhArtifactSource,
    /// Build options and resulting entry count.
    pub build: ZhArtifactBuild,
    /// Default query options recorded with the artifact.
    pub query_defaults: ZhArtifactQueryDefaults,
    /// License information for the artifact.
    pub license: ZhArtifactLicense,
}
256
/// Payload section of the artifact metadata.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ZhArtifactPayload {
    /// Payload file name or path.
    pub path: String,
    /// Payload format label.
    pub format: String,
    /// Optional file digest algorithm name; omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub file_digest_algorithm: Option<String>,
    /// Optional file digest value; omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub file_digest: Option<String>,
    /// Name of the algorithm used for `checksum`.
    pub checksum_algorithm: String,
    /// Checksum of the payload content.
    pub checksum: String,
}
275
/// Source section of the artifact metadata.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ZhArtifactSource {
    /// Source name.
    pub name: String,
    /// Source version.
    pub version: String,
    /// CC-CEDICT source descriptor supplied by the caller.
    // NOTE(review): exact meaning (path, URL or release id) is not visible here — confirm.
    pub cedict: String,
}
286
/// Build section of the artifact metadata.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ZhArtifactBuild {
    /// String label of the pinyin view used when building.
    pub pinyin_view: String,
    /// Per-surface reading limit used when building, if any.
    pub max_readings_per_surface: Option<usize>,
    /// Number of surface entries in the index.
    pub entries: usize,
}
297
/// Default query options recorded in the artifact metadata; mirrors
/// [`PinyinReadingOptions`] field by field.
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ZhArtifactQueryDefaults {
    /// See [`PinyinReadingOptions::max_span_chars`].
    pub max_span_chars: usize,
    /// See [`PinyinReadingOptions::max_paths`].
    pub max_paths: usize,
    /// See [`PinyinReadingOptions::longest_match_only`].
    pub longest_match_only: bool,
    /// See [`PinyinReadingOptions::max_readings_per_segment`].
    pub max_readings_per_segment: Option<usize>,
}
310
/// License section of the artifact metadata.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ZhArtifactLicense {
    /// Name of the license the artifact is distributed under.
    pub selected_license: String,
    /// License documents referenced by the artifact.
    pub references: Vec<ZhArtifactLicenseReference>,
}
319
/// Reference to one license document.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ZhArtifactLicenseReference {
    /// Human-readable label of the referenced license.
    pub label: String,
    /// Path of the license document.
    pub path: String,
}
328
/// Serializable payload holding every surface and its readings.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ZhReadingIndexPayload {
    /// Payload schema version.
    pub schema_version: u32,
    /// Payload type label.
    pub payload_type: String,
    /// String label of the pinyin view the readings are normalized with.
    pub pinyin_view: String,
    /// Surface entries.
    pub entries: Vec<ZhReadingIndexPayloadEntry>,
}
341
/// One payload entry: a surface and its readings.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct ZhReadingIndexPayloadEntry {
    /// Surface string; must be non-empty and unique within a payload.
    pub surface: String,
    /// Readings for `surface`; must be non-empty, normalized and free of duplicates.
    pub readings: Vec<String>,
}
350
/// Caller-supplied values used to fill a [`ZhArtifactMetadata`] document.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ZhArtifactMetadataOptions {
    /// Value for `ZhArtifactMetadata::artifact_name`.
    pub artifact_name: String,
    /// Value for `ZhArtifactMetadata::generator`.
    pub generator: String,
    /// Value for `ZhArtifactPayload::path`.
    pub payload_file_name: String,
    /// Value for `ZhArtifactPayload::format`.
    pub payload_format: String,
    /// Value for `ZhArtifactSource::name`.
    pub source_name: String,
    /// Value for `ZhArtifactSource::version`.
    pub source_version: String,
    /// Value for `ZhArtifactSource::cedict`.
    pub source_cedict: String,
    /// Index options recorded in the build section.
    pub index_options: CedictIndexOptions,
    /// Query options recorded as the artifact's query defaults.
    pub query_defaults: PinyinReadingOptions,
    /// License information recorded in the metadata.
    pub license: ZhArtifactLicense,
}
375
/// Errors produced while reading a CC-CEDICT source.
#[derive(Debug)]
pub enum CedictError {
    /// The source could not be opened or read.
    Io(std::io::Error),
    /// A dictionary line could not be parsed.
    InvalidEntry {
        /// 1-based line number of the offending entry.
        line: usize,
        /// Description of what is wrong with the entry.
        message: String,
    },
}
389
/// Errors produced while reading, validating or writing zh artifact payloads.
#[derive(Debug)]
pub enum ZhArtifactPayloadError {
    /// An I/O operation on the payload failed.
    Io(std::io::Error),
    /// The payload is not valid YAML for the expected schema.
    Yaml(serde_yaml::Error),
    /// The indexed artifact does not start with the expected magic bytes.
    InvalidIndexedMagic {
        /// The magic bytes that were found.
        magic: [u8; 8],
    },
    /// The indexed artifact declares an unsupported format version.
    UnsupportedIndexedVersion {
        /// The version that was found.
        version: u32,
    },
    /// The indexed artifact header carries an unknown pinyin view value.
    UnsupportedIndexedPinyinView {
        /// The raw header value that was found.
        value: u32,
    },
    /// A section length in the indexed artifact does not fit in `usize`.
    IndexedSectionTooLarge {
        /// Name of the offending header field.
        field: &'static str,
        /// The declared length.
        len: u64,
    },
    /// A length or count exceeds one of the artifact safety limits.
    ArtifactLimitExceeded {
        /// Name of the limited quantity.
        field: &'static str,
        /// The observed length or count.
        len: u64,
        /// The maximum allowed value.
        max: u64,
    },
    /// The reserved header field of the indexed artifact has an unexpected value.
    NonZeroIndexedReserved {
        /// The value that was found.
        value: u32,
    },
    /// The indexed artifact ended before the named part could be read.
    TruncatedIndexed {
        /// Name of the part that was being read.
        field: &'static str,
    },
    /// The FST section could not be built or parsed.
    InvalidIndexedFst {
        /// Message of the underlying FST error.
        message: String,
    },
    /// The header entry count disagrees with the number of keys in the FST.
    IndexedEntryCountMismatch {
        /// Entry count declared in the header.
        header_entries: usize,
        /// Entry count reported by the FST.
        fst_entries: usize,
    },
    /// A readings offset stored in the FST is invalid.
    InvalidIndexedOffset {
        /// The offending offset.
        offset: u64,
    },
    /// A string stored in the indexed artifact is not valid UTF-8.
    InvalidIndexedUtf8 {
        /// Name of the field that was being decoded.
        field: &'static str,
        /// The underlying UTF-8 conversion error.
        source: FromUtf8Error,
    },
    /// The payload declares an unsupported schema version.
    UnsupportedSchemaVersion {
        /// The version that was found.
        version: u32,
    },
    /// The payload declares an unsupported payload type.
    UnsupportedPayloadType {
        /// The payload type that was found.
        payload_type: String,
    },
    /// The payload declares a pinyin view label that is not recognized.
    UnsupportedPinyinView {
        /// The label that was found.
        pinyin_view: String,
    },
    /// A payload entry has an empty surface.
    EmptySurface {
        /// Zero-based index of the entry in the payload.
        entry_index: usize,
    },
    /// The same surface appears more than once in the payload.
    DuplicateSurface {
        /// The duplicated surface.
        surface: String,
    },
    /// A surface has no readings at all.
    EmptyReadings {
        /// The surface without readings.
        surface: String,
    },
    /// A surface contains an empty reading.
    EmptyReading {
        /// The surface owning the reading.
        surface: String,
        /// Zero-based index of the empty reading.
        reading_index: usize,
    },
    /// A surface lists the same reading more than once.
    DuplicateReading {
        /// The surface owning the reading.
        surface: String,
        /// The duplicated reading.
        reading: String,
    },
    /// A reading is not in the normalized form required by the payload's pinyin view.
    ReadingNotNormalized {
        /// The surface owning the reading.
        surface: String,
        /// The reading as stored in the payload.
        reading: String,
        /// The form the reading was expected to have.
        normalized: String,
    },
}
516
/// Errors produced while building a pinyin lattice.
#[derive(Debug, Eq, PartialEq)]
pub enum CnLatticeError {
    /// No pinyin reading was supplied; at least one is required.
    EmptyReadings,
    /// The given text is not supported as direct pinyin input.
    UnsupportedDirectInput {
        /// The rejected input.
        surface: String,
    },
    /// An artifact payload error, carried as its display string.
    ArtifactPayload(String),
}
530
531impl PinyinView {
532 pub fn as_str(self) -> &'static str {
534 match self {
535 Self::NoTone => "no-tone",
536 Self::Tone3 => "tone3",
537 }
538 }
539}
540
541impl TryFrom<&str> for PinyinView {
542 type Error = ();
543
544 fn try_from(value: &str) -> Result<Self, Self::Error> {
545 match value {
546 "no-tone" | "notone" | "normal" => Ok(Self::NoTone),
547 "tone3" => Ok(Self::Tone3),
548 _ => Err(()),
549 }
550 }
551}
552
553impl Default for CedictIndexOptions {
554 fn default() -> Self {
555 Self {
556 pinyin_view: PinyinView::NoTone,
557 max_readings_per_surface: None,
558 }
559 }
560}
561
562impl Default for PinyinReadingOptions {
563 fn default() -> Self {
564 Self {
565 max_span_chars: 8,
566 max_paths: 1024,
567 longest_match_only: false,
568 max_readings_per_segment: None,
569 }
570 }
571}
572
573impl Default for ZhArtifactLicense {
574 fn default() -> Self {
575 Self {
576 selected_license: "CC BY-SA 4.0".to_string(),
577 references: vec![ZhArtifactLicenseReference {
578 label: "CC-CEDICT".to_string(),
579 path: "license/CC-CEDICT.md".to_string(),
580 }],
581 }
582 }
583}
584
585impl fmt::Display for CedictError {
586 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
587 match self {
588 Self::Io(err) => write!(f, "failed to read CC-CEDICT: {err}"),
589 Self::InvalidEntry { line, message } => {
590 write!(f, "invalid CC-CEDICT entry at line {line}: {message}")
591 }
592 }
593 }
594}
595
596impl Error for CedictError {
597 fn source(&self) -> Option<&(dyn Error + 'static)> {
598 match self {
599 Self::Io(err) => Some(err),
600 Self::InvalidEntry { .. } => None,
601 }
602 }
603}
604
impl fmt::Display for ZhArtifactPayloadError {
    /// Writes a human-readable description of the error. String-like payload values are
    /// rendered with `{:?}` so they appear quoted and escaped.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            // Transport and decoding failures.
            Self::Io(err) => write!(f, "failed to read zh artifact payload: {err}"),
            Self::Yaml(err) => write!(f, "invalid zh artifact payload YAML: {err}"),
            // Indexed (binary) artifact header and section problems.
            Self::InvalidIndexedMagic { magic } => {
                write!(f, "invalid zh indexed artifact magic {magic:?}")
            }
            Self::UnsupportedIndexedVersion { version } => {
                write!(f, "unsupported zh indexed artifact version {version}")
            }
            Self::UnsupportedIndexedPinyinView { value } => {
                write!(f, "unsupported zh indexed artifact pinyin view {value}")
            }
            Self::IndexedSectionTooLarge { field, len } => {
                write!(f, "zh indexed artifact {field} length {len} exceeds usize::MAX")
            }
            Self::ArtifactLimitExceeded { field, len, max } => {
                write!(f, "zh artifact {field} length/count {len} exceeds limit {max}")
            }
            Self::NonZeroIndexedReserved { value } => {
                write!(f, "zh indexed artifact reserved header field is {value}")
            }
            Self::TruncatedIndexed { field } => {
                write!(f, "truncated zh indexed artifact while reading {field}")
            }
            Self::InvalidIndexedFst { message } => {
                write!(f, "invalid zh indexed artifact FST: {message}")
            }
            Self::IndexedEntryCountMismatch {
                header_entries,
                fst_entries,
            } => write!(
                f,
                "zh indexed artifact header entry count {header_entries} does not match FST entry count {fst_entries}"
            ),
            Self::InvalidIndexedOffset { offset } => {
                write!(f, "invalid zh indexed artifact readings offset {offset}")
            }
            Self::InvalidIndexedUtf8 { field, source } => {
                write!(f, "invalid UTF-8 in zh indexed artifact {field}: {source}")
            }
            // Payload header problems.
            Self::UnsupportedSchemaVersion { version } => {
                write!(f, "unsupported zh artifact payload schema version {version}")
            }
            Self::UnsupportedPayloadType { payload_type } => {
                write!(f, "unsupported zh artifact payload type {payload_type:?}")
            }
            Self::UnsupportedPinyinView { pinyin_view } => {
                write!(f, "unsupported zh artifact pinyin view {pinyin_view:?}")
            }
            // Per-entry validation problems.
            Self::EmptySurface { entry_index } => {
                write!(f, "zh artifact payload entry {entry_index} has an empty surface")
            }
            Self::DuplicateSurface { surface } => {
                write!(f, "zh artifact payload has duplicate surface {surface:?}")
            }
            Self::EmptyReadings { surface } => {
                write!(f, "zh artifact payload surface {surface:?} has no readings")
            }
            Self::EmptyReading {
                surface,
                reading_index,
            } => write!(
                f,
                "zh artifact payload surface {surface:?} has an empty reading at index {reading_index}"
            ),
            Self::DuplicateReading { surface, reading } => write!(
                f,
                "zh artifact payload surface {surface:?} has duplicate reading {reading:?}"
            ),
            Self::ReadingNotNormalized {
                surface,
                reading,
                normalized,
            } => write!(
                f,
                "zh artifact payload surface {surface:?} has non-normalized reading {reading:?}; expected {normalized:?}"
            ),
        }
    }
}
687
688impl Error for ZhArtifactPayloadError {
689 fn source(&self) -> Option<&(dyn Error + 'static)> {
690 match self {
691 Self::Io(err) => Some(err),
692 Self::Yaml(err) => Some(err),
693 Self::InvalidIndexedUtf8 { source, .. } => Some(source),
694 _ => None,
695 }
696 }
697}
698
699impl From<std::io::Error> for CedictError {
700 fn from(err: std::io::Error) -> Self {
701 Self::Io(err)
702 }
703}
704
705impl From<std::io::Error> for ZhArtifactPayloadError {
706 fn from(err: std::io::Error) -> Self {
707 Self::Io(err)
708 }
709}
710
711impl From<serde_yaml::Error> for ZhArtifactPayloadError {
712 fn from(err: serde_yaml::Error) -> Self {
713 Self::Yaml(err)
714 }
715}
716
717impl fmt::Display for CnLatticeError {
718 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
719 match self {
720 Self::EmptyReadings => write!(f, "at least one pinyin reading is required"),
721 Self::UnsupportedDirectInput { surface } => {
722 write!(f, "unsupported direct pinyin input {surface:?}")
723 }
724 Self::ArtifactPayload(err) => write!(f, "{err}"),
725 }
726 }
727}
728
// No `source()` override: the payload variant only carries a message string.
impl Error for CnLatticeError {}
730
731impl CedictReadingIndex {
732 pub fn from_cedict_path(path: impl AsRef<Path>) -> Result<Self, CedictError> {
734 Self::from_cedict_path_with_options(path, CedictIndexOptions::default())
735 }
736
737 pub fn from_cedict_path_with_options(
739 path: impl AsRef<Path>,
740 options: CedictIndexOptions,
741 ) -> Result<Self, CedictError> {
742 let file = File::open(path)?;
743 Self::from_cedict_reader_with_options(file, options)
744 }
745
746 pub fn from_cedict_reader(reader: impl Read) -> Result<Self, CedictError> {
748 Self::from_cedict_reader_with_options(reader, CedictIndexOptions::default())
749 }
750
751 pub fn from_cedict_reader_with_options(
753 reader: impl Read,
754 options: CedictIndexOptions,
755 ) -> Result<Self, CedictError> {
756 let mut by_surface = HashMap::<String, BTreeSet<String>>::new();
757 let reader = BufReader::new(reader);
758 for (line_index, line) in reader.lines().enumerate() {
759 let line_number = line_index + 1;
760 let line = line?;
761 let line = line.trim_end_matches('\r');
762 if line.is_empty() || line.starts_with('#') {
763 continue;
764 }
765
766 let entry = parse_cedict_entry(line, line_number)?;
767 let reading = normalize_pinyin(entry.pinyin, options.pinyin_view);
768 if reading.is_empty() {
769 continue;
770 }
771
772 by_surface
773 .entry(entry.traditional.to_string())
774 .or_default()
775 .insert(reading.clone());
776 by_surface
777 .entry(entry.simplified.to_string())
778 .or_default()
779 .insert(reading);
780 }
781
782 let readings_by_surface = by_surface
783 .into_iter()
784 .map(|(surface, readings)| {
785 let mut readings = readings.into_iter().collect::<Vec<_>>();
786 if let Some(max_readings) = options.max_readings_per_surface {
787 readings.truncate(max_readings);
788 }
789 (surface, readings)
790 })
791 .filter(|(_, readings)| !readings.is_empty())
792 .collect();
793
794 Ok(Self {
795 storage: ZhReadingStorage::Eager(readings_by_surface),
796 pinyin_view: options.pinyin_view,
797 })
798 }
799
800 pub fn from_artifact_payload_path(
802 path: impl AsRef<Path>,
803 ) -> Result<Self, ZhArtifactPayloadError> {
804 let path = path.as_ref();
805 check_payload_file_size(path)?;
806 let file = File::open(path)?;
807 Self::from_artifact_payload_reader(file)
808 }
809
810 pub fn from_artifact_payload_reader(reader: impl Read) -> Result<Self, ZhArtifactPayloadError> {
812 let payload = serde_yaml::from_reader(reader)?;
813 Self::from_artifact_payload(payload)
814 }
815
816 pub fn from_artifact_payload(
818 payload: ZhReadingIndexPayload,
819 ) -> Result<Self, ZhArtifactPayloadError> {
820 validate_artifact_payload_header(&payload)?;
821 let pinyin_view = PinyinView::try_from(payload.pinyin_view.as_str()).map_err(|()| {
822 ZhArtifactPayloadError::UnsupportedPinyinView {
823 pinyin_view: payload.pinyin_view.clone(),
824 }
825 })?;
826 check_limit("entry_count", payload.entries.len(), MAX_ARTIFACT_ENTRIES)?;
827
828 let mut readings_by_surface = HashMap::new();
829 for (entry_index, entry) in payload.entries.into_iter().enumerate() {
830 check_limit(
831 "surface_bytes",
832 entry.surface.len(),
833 MAX_ARTIFACT_STRING_BYTES,
834 )?;
835 check_limit(
836 "reading_count",
837 entry.readings.len(),
838 MAX_ARTIFACT_READINGS_PER_ENTRY,
839 )?;
840 if entry.surface.is_empty() {
841 return Err(ZhArtifactPayloadError::EmptySurface { entry_index });
842 }
843 if entry.readings.is_empty() {
844 return Err(ZhArtifactPayloadError::EmptyReadings {
845 surface: entry.surface,
846 });
847 }
848
849 let mut seen_readings = BTreeSet::new();
850 for (reading_index, reading) in entry.readings.iter().enumerate() {
851 check_limit("reading_bytes", reading.len(), MAX_ARTIFACT_STRING_BYTES)?;
852 if reading.is_empty() {
853 return Err(ZhArtifactPayloadError::EmptyReading {
854 surface: entry.surface,
855 reading_index,
856 });
857 }
858 let normalized = normalize_artifact_reading(reading, pinyin_view);
859 if normalized != *reading {
860 return Err(ZhArtifactPayloadError::ReadingNotNormalized {
861 surface: entry.surface,
862 reading: reading.clone(),
863 normalized,
864 });
865 }
866 if !seen_readings.insert(reading) {
867 return Err(ZhArtifactPayloadError::DuplicateReading {
868 surface: entry.surface,
869 reading: reading.clone(),
870 });
871 }
872 }
873
874 if readings_by_surface
875 .insert(entry.surface.clone(), entry.readings)
876 .is_some()
877 {
878 return Err(ZhArtifactPayloadError::DuplicateSurface {
879 surface: entry.surface,
880 });
881 }
882 }
883
884 Ok(Self {
885 storage: ZhReadingStorage::Eager(readings_by_surface),
886 pinyin_view,
887 })
888 }
889
890 pub fn from_indexed_artifact_payload_path(
896 path: impl AsRef<Path>,
897 ) -> Result<Self, ZhArtifactPayloadError> {
898 let path = path.as_ref();
899 check_payload_file_size(path)?;
900 let file = File::open(path)?;
901 let mmap = unsafe { Mmap::map(&file)? };
904 Self::from_indexed_mmap(mmap)
905 }
906
907 pub fn from_indexed_artifact_payload_bytes(
918 bytes: &[u8],
919 ) -> Result<Self, ZhArtifactPayloadError> {
920 if bytes.len() as u64 > MAX_ARTIFACT_PAYLOAD_BYTES {
921 return Err(ZhArtifactPayloadError::ArtifactLimitExceeded {
922 field: "payload_bytes",
923 len: bytes.len() as u64,
924 max: MAX_ARTIFACT_PAYLOAD_BYTES,
925 });
926 }
927 let header = read_indexed_artifact_payload_header_bytes(bytes)?;
928 let fst_start = INDEXED_ARTIFACT_HEADER_LEN;
929 let fst_end = fst_start.checked_add(header.fst_len).ok_or(
930 ZhArtifactPayloadError::TruncatedIndexed {
931 field: "fst_section",
932 },
933 )?;
934 let readings_end = fst_end.checked_add(header.readings_len).ok_or(
935 ZhArtifactPayloadError::TruncatedIndexed {
936 field: "readings_section",
937 },
938 )?;
939 if bytes.len() < readings_end {
940 return Err(ZhArtifactPayloadError::TruncatedIndexed {
941 field: "indexed_payload",
942 });
943 }
944
945 let map = Map::new(bytes[fst_start..fst_end].to_vec()).map_err(|err| {
946 ZhArtifactPayloadError::InvalidIndexedFst {
947 message: err.to_string(),
948 }
949 })?;
950 let fst_entries = map.len();
951 if fst_entries != header.entries {
952 return Err(ZhArtifactPayloadError::IndexedEntryCountMismatch {
953 header_entries: header.entries,
954 fst_entries,
955 });
956 }
957
958 let mut entries = Vec::with_capacity(header.entries);
959 let mut stream = map.stream();
960 while let Some((surface, offset)) = stream.next() {
961 let surface = String::from_utf8(surface.to_vec()).map_err(|source| {
962 ZhArtifactPayloadError::InvalidIndexedUtf8 {
963 field: "surface",
964 source,
965 }
966 })?;
967 let readings = read_indexed_readings_at_bytes(bytes, fst_end, offset)?;
968 entries.push(ZhReadingIndexPayloadEntry { surface, readings });
969 }
970
971 Self::from_artifact_payload(ZhReadingIndexPayload {
972 schema_version: ARTIFACT_PAYLOAD_SCHEMA_VERSION,
973 payload_type: ARTIFACT_PAYLOAD_TYPE.to_string(),
974 pinyin_view: header.pinyin_view.as_str().to_string(),
975 entries,
976 })
977 }
978
979 fn from_indexed_mmap(mmap: Mmap) -> Result<Self, ZhArtifactPayloadError> {
980 if mmap.len() as u64 > MAX_ARTIFACT_PAYLOAD_BYTES {
981 return Err(ZhArtifactPayloadError::ArtifactLimitExceeded {
982 field: "payload_bytes",
983 len: mmap.len() as u64,
984 max: MAX_ARTIFACT_PAYLOAD_BYTES,
985 });
986 }
987 let header = read_indexed_artifact_payload_header_bytes(&mmap)?;
988 let fst_start = INDEXED_ARTIFACT_HEADER_LEN;
989 let fst_end = fst_start.checked_add(header.fst_len).ok_or(
990 ZhArtifactPayloadError::TruncatedIndexed {
991 field: "fst_section",
992 },
993 )?;
994 let readings_end = fst_end.checked_add(header.readings_len).ok_or(
995 ZhArtifactPayloadError::TruncatedIndexed {
996 field: "readings_section",
997 },
998 )?;
999 if mmap.len() < readings_end {
1000 return Err(ZhArtifactPayloadError::TruncatedIndexed {
1001 field: "indexed_payload",
1002 });
1003 }
1004
1005 let map = Map::new(mmap[fst_start..fst_end].to_vec()).map_err(|err| {
1006 ZhArtifactPayloadError::InvalidIndexedFst {
1007 message: err.to_string(),
1008 }
1009 })?;
1010 let fst_entries = map.len();
1011 if fst_entries != header.entries {
1012 return Err(ZhArtifactPayloadError::IndexedEntryCountMismatch {
1013 header_entries: header.entries,
1014 fst_entries,
1015 });
1016 }
1017 let indexed = IndexedZhPayload {
1018 mmap: Arc::new(mmap),
1019 map,
1020 readings_start: fst_end,
1021 entries: header.entries,
1022 };
1023 indexed.validate(header.pinyin_view)?;
1024 Ok(Self {
1025 storage: ZhReadingStorage::Indexed(indexed),
1026 pinyin_view: header.pinyin_view,
1027 })
1028 }
1029
1030 pub fn pinyin_view(&self) -> PinyinView {
1032 self.pinyin_view
1033 }
1034
1035 pub fn readings(&self, surface: &str) -> Option<Cow<'_, [String]>> {
1041 self.try_readings(surface).ok().flatten()
1042 }
1043
1044 pub fn try_readings(
1047 &self,
1048 surface: &str,
1049 ) -> Result<Option<Cow<'_, [String]>>, ZhArtifactPayloadError> {
1050 match &self.storage {
1051 ZhReadingStorage::Eager(readings_by_surface) => Ok(readings_by_surface
1052 .get(surface)
1053 .map(|readings| Cow::Borrowed(readings.as_slice()))),
1054 ZhReadingStorage::Indexed(indexed) => indexed
1055 .readings(surface)
1056 .map(|readings| readings.map(Cow::Owned)),
1057 }
1058 }
1059
1060 pub fn len(&self) -> usize {
1062 match &self.storage {
1063 ZhReadingStorage::Eager(readings_by_surface) => readings_by_surface.len(),
1064 ZhReadingStorage::Indexed(indexed) => indexed.entries,
1065 }
1066 }
1067
1068 pub fn is_empty(&self) -> bool {
1070 self.len() == 0
1071 }
1072
1073 pub fn artifact_metadata(&self, options: ZhArtifactMetadataOptions) -> ZhArtifactMetadata {
1079 ZhArtifactMetadata {
1080 schema_version: 1,
1081 artifact_type: "moine.zh.reading-index".to_string(),
1082 artifact_name: options.artifact_name,
1083 generator: options.generator,
1084 payload: ZhArtifactPayload {
1085 path: options.payload_file_name,
1086 format: options.payload_format,
1087 file_digest_algorithm: None,
1088 file_digest: None,
1089 checksum_algorithm: ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM.to_string(),
1090 checksum: self.artifact_payload_checksum(),
1091 },
1092 source: ZhArtifactSource {
1093 name: options.source_name,
1094 version: options.source_version,
1095 cedict: options.source_cedict,
1096 },
1097 build: ZhArtifactBuild {
1098 pinyin_view: options.index_options.pinyin_view.as_str().to_string(),
1099 max_readings_per_surface: options.index_options.max_readings_per_surface,
1100 entries: self.len(),
1101 },
1102 query_defaults: ZhArtifactQueryDefaults {
1103 max_span_chars: options.query_defaults.max_span_chars,
1104 max_paths: options.query_defaults.max_paths,
1105 longest_match_only: options.query_defaults.longest_match_only,
1106 max_readings_per_segment: options.query_defaults.max_readings_per_segment,
1107 },
1108 license: options.license,
1109 }
1110 }
1111
1112 pub fn artifact_payload(&self) -> ZhReadingIndexPayload {
1117 let entries = match &self.storage {
1118 ZhReadingStorage::Eager(readings_by_surface) => {
1119 let mut entries = readings_by_surface
1120 .iter()
1121 .map(|(surface, readings)| ZhReadingIndexPayloadEntry {
1122 surface: surface.clone(),
1123 readings: readings.clone(),
1124 })
1125 .collect::<Vec<_>>();
1126 entries.sort_by(|left, right| left.surface.cmp(&right.surface));
1127 entries
1128 }
1129 ZhReadingStorage::Indexed(indexed) => indexed
1130 .entries()
1131 .expect("validated indexed artifact should decode"),
1132 };
1133
1134 ZhReadingIndexPayload {
1135 schema_version: ARTIFACT_PAYLOAD_SCHEMA_VERSION,
1136 payload_type: ARTIFACT_PAYLOAD_TYPE.to_string(),
1137 pinyin_view: self.pinyin_view.as_str().to_string(),
1138 entries,
1139 }
1140 }
1141
1142 pub fn artifact_payload_checksum(&self) -> String {
1144 self.artifact_payload_checksum_for_algorithm(ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM)
1145 .expect("default artifact checksum algorithm should be supported")
1146 }
1147
1148 pub fn artifact_payload_checksum_for_algorithm(&self, algorithm: &str) -> Option<String> {
1152 let payload = self.artifact_payload();
1153 let bytes = canonical_payload_bytes(&payload);
1154 match algorithm {
1155 ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM => Some(sha256_hex(&bytes)),
1156 _ => None,
1157 }
1158 }
1159
1160 pub fn write_indexed_artifact_payload(
1166 &self,
1167 mut writer: impl Write,
1168 ) -> Result<(), ZhArtifactPayloadError> {
1169 let payload = self.artifact_payload();
1170 let mut fst_bytes = Vec::new();
1171 let mut readings_bytes = Vec::new();
1172 {
1173 let mut builder = MapBuilder::new(&mut fst_bytes).map_err(|err| {
1174 ZhArtifactPayloadError::InvalidIndexedFst {
1175 message: err.to_string(),
1176 }
1177 })?;
1178 for entry in &payload.entries {
1179 let offset = readings_bytes.len() as u64;
1180 builder.insert(&entry.surface, offset).map_err(|err| {
1181 ZhArtifactPayloadError::InvalidIndexedFst {
1182 message: err.to_string(),
1183 }
1184 })?;
1185 write_indexed_reading_block(&mut readings_bytes, &entry.readings)?;
1186 }
1187 builder
1188 .finish()
1189 .map_err(|err| ZhArtifactPayloadError::InvalidIndexedFst {
1190 message: err.to_string(),
1191 })?;
1192 }
1193
1194 writer.write_all(INDEXED_ARTIFACT_MAGIC)?;
1195 writer.write_all(&INDEXED_ARTIFACT_VERSION.to_le_bytes())?;
1196 writer.write_all(&pinyin_view_header_value(self.pinyin_view).to_le_bytes())?;
1197 writer.write_all(&(payload.entries.len() as u64).to_le_bytes())?;
1198 writer.write_all(&(fst_bytes.len() as u64).to_le_bytes())?;
1199 writer.write_all(&(readings_bytes.len() as u64).to_le_bytes())?;
1200 writer.write_all(&fst_bytes)?;
1201 writer.write_all(&readings_bytes)?;
1202 Ok(())
1203 }
1204
1205 pub fn reading_sequences(&self, text: &str, options: PinyinReadingOptions) -> Vec<String> {
1211 self.reading_paths(text, options)
1212 .into_iter()
1213 .map(|path| path.joined_reading)
1214 .collect()
1215 }
1216
1217 pub fn reading_paths(
1223 &self,
1224 text: &str,
1225 options: PinyinReadingOptions,
1226 ) -> Vec<PinyinReadingPath> {
1227 self.reading_paths_with_stats(text, options).paths
1228 }
1229
1230 pub fn reading_paths_with_stats(
1236 &self,
1237 text: &str,
1238 options: PinyinReadingOptions,
1239 ) -> PinyinReadingExpansion {
1240 self.try_reading_paths_with_stats(text, options)
1241 .unwrap_or_default()
1242 }
1243
1244 pub fn try_reading_paths_with_stats(
1247 &self,
1248 text: &str,
1249 options: PinyinReadingOptions,
1250 ) -> Result<PinyinReadingExpansion, ZhArtifactPayloadError> {
1251 self.reading_paths_with_stats_inner(text, options, false)
1252 }
1253
1254 pub fn hybrid_reading_paths(
1260 &self,
1261 text: &str,
1262 options: PinyinReadingOptions,
1263 ) -> Vec<PinyinReadingPath> {
1264 self.hybrid_reading_paths_with_stats(text, options).paths
1265 }
1266
1267 pub fn hybrid_reading_paths_with_stats(
1273 &self,
1274 text: &str,
1275 options: PinyinReadingOptions,
1276 ) -> PinyinReadingExpansion {
1277 self.try_hybrid_reading_paths_with_stats(text, options)
1278 .unwrap_or_default()
1279 }
1280
1281 pub fn try_hybrid_reading_paths_with_stats(
1284 &self,
1285 text: &str,
1286 options: PinyinReadingOptions,
1287 ) -> Result<PinyinReadingExpansion, ZhArtifactPayloadError> {
1288 self.reading_paths_with_stats_inner(text, options, true)
1289 }
1290
1291 pub fn pinyin_lattice(
1297 &self,
1298 text: &str,
1299 options: PinyinReadingOptions,
1300 ) -> Result<Option<Lattice>, CnLatticeError> {
1301 let paths = self
1302 .try_reading_paths_with_stats(text, options)
1303 .map_err(|err| CnLatticeError::ArtifactPayload(err.to_string()))?
1304 .paths;
1305 if paths.is_empty() {
1306 return Ok(None);
1307 }
1308 pinyin_lattice_from_reading_paths(&paths).map(Some)
1309 }
1310
1311 pub fn hybrid_pinyin_lattice(
1316 &self,
1317 text: &str,
1318 options: PinyinReadingOptions,
1319 ) -> Result<Option<Lattice>, CnLatticeError> {
1320 let paths = self
1321 .try_hybrid_reading_paths_with_stats(text, options)
1322 .map_err(|err| CnLatticeError::ArtifactPayload(err.to_string()))?
1323 .paths;
1324 if paths.is_empty() {
1325 return Ok(None);
1326 }
1327 pinyin_lattice_from_reading_paths(&paths).map(Some)
1328 }
1329
1330 fn reading_paths_with_stats_inner(
1331 &self,
1332 text: &str,
1333 options: PinyinReadingOptions,
1334 allow_direct_fallback: bool,
1335 ) -> Result<PinyinReadingExpansion, ZhArtifactPayloadError> {
1336 if text.is_empty() || options.max_span_chars == 0 || options.max_paths == 0 {
1337 return Ok(PinyinReadingExpansion::default());
1338 }
1339
1340 let mut stats = PinyinReadingStats::default();
1341 let boundaries = char_boundaries(text);
1342 let char_len = boundaries.len() - 1;
1343 let mut suffix_paths = vec![Vec::<PinyinReadingPath>::new(); char_len + 1];
1344 suffix_paths[char_len].push(PinyinReadingPath {
1345 segments: Vec::new(),
1346 joined_reading: String::new(),
1347 });
1348
1349 for start in (0..char_len).rev() {
1350 let mut paths_by_reading = BTreeMap::new();
1351 let end_limit = char_len.min(start + options.max_span_chars);
1352 let mut matching_ends = Vec::new();
1353
1354 for end in start + 1..=end_limit {
1355 let surface = &text[boundaries[start]..boundaries[end]];
1356 if self.try_readings(surface)?.is_some() && !suffix_paths[end].is_empty() {
1357 matching_ends.push(end);
1358 }
1359 }
1360 stats.matched_spans += matching_ends.len();
1361
1362 if options.longest_match_only {
1363 if let Some(end) = matching_ends.last().copied() {
1364 stats.longest_match_pruned_spans += matching_ends.len().saturating_sub(1);
1365 matching_ends.clear();
1366 matching_ends.push(end);
1367 }
1368 }
1369
1370 for end in matching_ends {
1371 let surface = &text[boundaries[start]..boundaries[end]];
1372 let Some(surface_readings) = self.try_readings(surface)? else {
1373 continue;
1374 };
1375
1376 stats.raw_segment_readings += surface_readings.len();
1377 let raw_surface_reading_count = surface_readings.len();
1378 let surface_readings = limited_surface_readings(surface_readings.as_ref(), options);
1379 stats.used_segment_readings += surface_readings.len();
1380 stats.pruned_segment_readings += raw_surface_reading_count - surface_readings.len();
1381
1382 for surface_reading in surface_readings {
1383 for suffix in &suffix_paths[end] {
1384 stats.candidate_combinations += 1;
1385 let mut reading = String::with_capacity(
1386 surface_reading.len() + suffix.joined_reading.len(),
1387 );
1388 reading.push_str(surface_reading);
1389 reading.push_str(&suffix.joined_reading);
1390
1391 let mut segments = Vec::with_capacity(suffix.segments.len() + 1);
1392 segments.push(PinyinReadingSegment {
1393 surface: surface.to_string(),
1394 reading: surface_reading.to_string(),
1395 });
1396 segments.extend(suffix.segments.iter().cloned());
1397
1398 match paths_by_reading.entry(reading.clone()) {
1399 Entry::Vacant(entry) => {
1400 entry.insert(PinyinReadingPath {
1401 segments,
1402 joined_reading: reading,
1403 });
1404 stats.unique_paths += 1;
1405 }
1406 Entry::Occupied(_) => {
1407 stats.duplicate_joined_readings += 1;
1408 }
1409 }
1410
1411 if paths_by_reading.len() >= options.max_paths {
1412 stats.max_paths_hit_count += 1;
1413 break;
1414 }
1415 }
1416
1417 if paths_by_reading.len() >= options.max_paths {
1418 break;
1419 }
1420 }
1421
1422 if paths_by_reading.len() >= options.max_paths {
1423 break;
1424 }
1425 }
1426
1427 if allow_direct_fallback && paths_by_reading.len() < options.max_paths {
1428 if let Some(end) = direct_fallback_end(text, &boundaries, start, char_len) {
1429 if !suffix_paths[end].is_empty() {
1430 stats.direct_fallback_spans += 1;
1431 let surface = &text[boundaries[start]..boundaries[end]];
1432 let reading = normalize_direct_ascii(surface);
1433 for suffix in &suffix_paths[end] {
1434 stats.candidate_combinations += 1;
1435 let mut joined =
1436 String::with_capacity(reading.len() + suffix.joined_reading.len());
1437 joined.push_str(&reading);
1438 joined.push_str(&suffix.joined_reading);
1439
1440 let mut segments = Vec::with_capacity(suffix.segments.len() + 1);
1441 segments.push(PinyinReadingSegment {
1442 surface: surface.to_string(),
1443 reading: reading.clone(),
1444 });
1445 segments.extend(suffix.segments.iter().cloned());
1446
1447 match paths_by_reading.entry(joined.clone()) {
1448 Entry::Vacant(entry) => {
1449 entry.insert(PinyinReadingPath {
1450 segments,
1451 joined_reading: joined,
1452 });
1453 stats.unique_paths += 1;
1454 }
1455 Entry::Occupied(_) => {
1456 stats.duplicate_joined_readings += 1;
1457 }
1458 }
1459
1460 if paths_by_reading.len() >= options.max_paths {
1461 stats.max_paths_hit_count += 1;
1462 break;
1463 }
1464 }
1465 }
1466 }
1467 }
1468
1469 suffix_paths[start] = paths_by_reading.into_values().collect();
1470 }
1471
1472 Ok(PinyinReadingExpansion {
1473 paths: suffix_paths.remove(0),
1474 stats,
1475 })
1476 }
1477}
1478
/// Compares `left` and `right` using a [`CedictReadingIndex`].
///
/// Thin wrapper: forwards every argument unchanged to [`compare_with_zh_index`].
///
/// # Errors
///
/// Returns whatever error [`compare_with_zh_index`] returns.
pub fn compare_with_cedict_index(
    left: &str,
    right: &str,
    index: &CedictReadingIndex,
    options: PinyinReadingOptions,
) -> Result<ChineseDistance, CnLatticeError> {
    compare_with_zh_index(left, right, index, options)
}
1488
1489pub fn compare_with_zh_index(
1491 left: &str,
1492 right: &str,
1493 index: &ZhReadingIndex,
1494 options: PinyinReadingOptions,
1495) -> Result<ChineseDistance, CnLatticeError> {
1496 let left_lattice = cedict_or_direct_lattice(left, index, options)?;
1497 let right_lattice = cedict_or_direct_lattice(right, index, options)?;
1498 Ok(compare_lattices(left, right, &left_lattice, &right_lattice))
1499}
1500
1501pub fn normalized_similarity_with_zh_index(
1503 left: &str,
1504 right: &str,
1505 index: &ZhReadingIndex,
1506 options: PinyinReadingOptions,
1507) -> Result<f64, CnLatticeError> {
1508 let left_paths = zh_or_direct_pinyin_paths(left, index, options)?;
1509 let right_paths = zh_or_direct_pinyin_paths(right, index, options)?;
1510 Ok(max_normalized_similarity(&left_paths, &right_paths))
1511}
1512
/// Builds a lattice for `input` using a [`CedictReadingIndex`].
///
/// Thin wrapper: forwards every argument unchanged to [`zh_or_direct_lattice`].
///
/// # Errors
///
/// Returns whatever error [`zh_or_direct_lattice`] returns.
pub fn cedict_or_direct_lattice(
    input: &str,
    index: &CedictReadingIndex,
    options: PinyinReadingOptions,
) -> Result<Lattice, CnLatticeError> {
    zh_or_direct_lattice(input, index, options)
}
1521
1522pub fn zh_or_direct_lattice(
1524 input: &str,
1525 index: &ZhReadingIndex,
1526 options: PinyinReadingOptions,
1527) -> Result<Lattice, CnLatticeError> {
1528 if let Some(lattice) = direct_pinyin_lattice(input) {
1529 return Ok(lattice);
1530 }
1531
1532 if let Some(lattice) = index.pinyin_lattice(input, options)? {
1533 return Ok(lattice);
1534 }
1535
1536 if let Some(lattice) = index.hybrid_pinyin_lattice(input, options)? {
1537 return Ok(lattice);
1538 }
1539
1540 direct_pinyin_lattice(input).ok_or_else(|| CnLatticeError::UnsupportedDirectInput {
1541 surface: input.to_string(),
1542 })
1543}
1544
1545pub fn zh_or_direct_pinyin_paths(
1547 input: &str,
1548 index: &ZhReadingIndex,
1549 options: PinyinReadingOptions,
1550) -> Result<Vec<String>, CnLatticeError> {
1551 if can_build_direct_pinyin_path(input) {
1552 return Ok(vec![normalize_direct_ascii(input)]);
1553 }
1554
1555 let paths = index
1556 .try_reading_paths_with_stats(input, options)
1557 .map_err(|err| CnLatticeError::ArtifactPayload(err.to_string()))?
1558 .paths;
1559 if !paths.is_empty() {
1560 return Ok(paths.into_iter().map(|path| path.joined_reading).collect());
1561 }
1562
1563 let paths = index
1564 .try_hybrid_reading_paths_with_stats(input, options)
1565 .map_err(|err| CnLatticeError::ArtifactPayload(err.to_string()))?
1566 .paths;
1567 if !paths.is_empty() {
1568 return Ok(paths.into_iter().map(|path| path.joined_reading).collect());
1569 }
1570
1571 Err(CnLatticeError::UnsupportedDirectInput {
1572 surface: input.to_string(),
1573 })
1574}
1575
1576fn max_normalized_similarity(left_paths: &[String], right_paths: &[String]) -> f64 {
1577 left_paths
1578 .iter()
1579 .flat_map(|left| {
1580 right_paths
1581 .iter()
1582 .map(move |right| normalized_similarity_str(left, right))
1583 })
1584 .fold(0.0, f64::max)
1585}
1586
1587pub fn pinyin_lattice_from_reading_paths(
1593 paths: &[PinyinReadingPath],
1594) -> Result<Lattice, CnLatticeError> {
1595 if paths.is_empty() {
1596 return Err(CnLatticeError::EmptyReadings);
1597 }
1598
1599 Ok(Lattice::from_symbol_paths_compact(paths.iter().map(
1600 |path| {
1601 path.joined_reading
1602 .chars()
1603 .map(|ch| ch as moine_core::Symbol)
1604 .collect::<Vec<_>>()
1605 },
1606 )))
1607}
1608
1609pub fn normalize_pinyin(raw: &str, view: PinyinView) -> String {
1615 let mut normalized = String::new();
1616 for token in raw.split_whitespace() {
1617 normalized.push_str(&normalize_pinyin_token(token, view));
1618 }
1619 match view {
1620 PinyinView::NoTone => strip_no_tone_digits(&normalized),
1621 PinyinView::Tone3 => normalized,
1622 }
1623}
1624
1625fn compare_lattices(
1626 left: &str,
1627 right: &str,
1628 left_lattice: &Lattice,
1629 right_lattice: &Lattice,
1630) -> ChineseDistance {
1631 let lattice = distance(left_lattice, right_lattice);
1632 let lattice_damerau = damerau_distance(left_lattice, right_lattice);
1633 let surface_levenshtein = levenshtein_str(left, right);
1634 let surface_damerau = damerau_levenshtein_str(left, right);
1635
1636 ChineseDistance {
1637 surface_levenshtein,
1638 surface_damerau,
1639 lattice,
1640 lattice_damerau,
1641 combined: surface_damerau.min(lattice),
1642 }
1643}
1644
1645fn direct_pinyin_lattice(input: &str) -> Option<Lattice> {
1646 if input.is_empty() || !can_build_direct_pinyin_path(input) {
1647 return None;
1648 }
1649 Some(Lattice::from_paths([normalize_direct_ascii(input)]))
1650}
1651
1652fn normalize_pinyin_token(token: &str, view: PinyinView) -> String {
1653 let lowered = token.to_lowercase().replace("u:", "v").replace('ü', "v");
1654 let contains_letters = lowered.chars().any(|ch| ch.is_ascii_alphabetic());
1655 if view == PinyinView::NoTone && contains_letters {
1656 lowered
1657 .chars()
1658 .filter(|ch| !matches!(ch, '1'..='5'))
1659 .collect()
1660 } else {
1661 lowered
1662 }
1663}
1664
/// Lowercases `input` and folds every `u:` to `v`.
fn normalize_direct_ascii(input: &str) -> String {
    let lowered = input.to_lowercase();
    lowered.replace("u:", "v")
}
1668
1669fn normalize_artifact_reading(reading: &str, view: PinyinView) -> String {
1670 let lowered = reading
1671 .to_lowercase()
1672 .replace("u:", "v")
1673 .replace('ü', "v")
1674 .chars()
1675 .filter(|ch| !ch.is_whitespace())
1676 .collect::<String>();
1677 match view {
1678 PinyinView::NoTone => strip_no_tone_digits(&lowered),
1679 PinyinView::Tone3 => lowered,
1680 }
1681}
1682
/// Drops every digit `1` through `5` that follows an ASCII letter.
///
/// "Follows" refers to the last character that was kept, so a run of tone digits
/// after a letter is removed entirely, while digits that follow a kept non-letter
/// (or start the string) are preserved.
fn strip_no_tone_digits(reading: &str) -> String {
    let mut kept = String::with_capacity(reading.len());
    let mut last_kept_is_letter = false;
    for ch in reading.chars() {
        let is_tone_digit = ('1'..='5').contains(&ch);
        if is_tone_digit && last_kept_is_letter {
            continue;
        }
        last_kept_is_letter = ch.is_ascii_alphabetic();
        kept.push(ch);
    }
    kept
}
1696
/// Returns the byte offset at which each character of `text` starts, followed by
/// `text.len()` as a final sentinel.
///
/// The result therefore has one more element than `text` has characters.
fn char_boundaries(text: &str) -> Vec<usize> {
    let mut offsets: Vec<usize> = text.char_indices().map(|(offset, _)| offset).collect();
    offsets.push(text.len());
    offsets
}
1703
1704fn limited_surface_readings(readings: &[String], options: PinyinReadingOptions) -> &[String] {
1705 if let Some(max_readings) = options.max_readings_per_segment {
1706 &readings[..readings.len().min(max_readings)]
1707 } else {
1708 readings
1709 }
1710}
1711
1712fn direct_fallback_end(
1713 text: &str,
1714 boundaries: &[usize],
1715 start: usize,
1716 char_len: usize,
1717) -> Option<usize> {
1718 let mut end = start;
1719 while end < char_len {
1720 let surface = &text[boundaries[start]..boundaries[end + 1]];
1721 if !can_build_direct_pinyin_path(surface) {
1722 break;
1723 }
1724 end += 1;
1725 }
1726
1727 (end > start).then_some(end)
1728}
1729
/// Maps a [`PinyinView`] to its numeric header code: `0` for `NoTone`, `1` for
/// `Tone3`.
fn pinyin_view_header_value(view: PinyinView) -> u32 {
    match view {
        PinyinView::NoTone => 0,
        PinyinView::Tone3 => 1,
    }
}
1736
/// Decodes a numeric header code into a [`PinyinView`] (`0` is `NoTone`, `1` is
/// `Tone3`).
///
/// Any other value yields `ZhArtifactPayloadError::UnsupportedIndexedPinyinView`
/// carrying the rejected value.
fn pinyin_view_from_header_value(value: u32) -> Result<PinyinView, ZhArtifactPayloadError> {
    match value {
        0 => Ok(PinyinView::NoTone),
        1 => Ok(PinyinView::Tone3),
        _ => Err(ZhArtifactPayloadError::UnsupportedIndexedPinyinView { value }),
    }
}
1744
/// Writes `value` as a length-prefixed string: its byte length via `write_u32_len`,
/// followed by its UTF-8 bytes.
///
/// `field` names the value in the error raised when the length does not fit the
/// prefix. Write failures are propagated with `?`.
fn write_binary_string(
    writer: &mut impl Write,
    field: &'static str,
    value: &str,
) -> Result<(), ZhArtifactPayloadError> {
    write_u32_len(writer, field, value.len())?;
    writer.write_all(value.as_bytes())?;
    Ok(())
}
1754
1755fn write_u32_len(
1756 writer: &mut impl Write,
1757 field: &'static str,
1758 len: usize,
1759) -> Result<(), ZhArtifactPayloadError> {
1760 let len = u32::try_from(len).map_err(|_| ZhArtifactPayloadError::IndexedSectionTooLarge {
1761 field,
1762 len: len as u64,
1763 })?;
1764 writer.write_all(&len.to_le_bytes())?;
1765 Ok(())
1766}
1767
1768fn read_indexed_artifact_payload_header_bytes(
1769 bytes: &[u8],
1770) -> Result<ZhIndexedArtifactPayloadHeader, ZhArtifactPayloadError> {
1771 if bytes.len() < INDEXED_ARTIFACT_HEADER_LEN {
1772 return Err(ZhArtifactPayloadError::TruncatedIndexed { field: "header" });
1773 }
1774 let mut magic = [0_u8; 8];
1775 magic.copy_from_slice(&bytes[..8]);
1776 if &magic != INDEXED_ARTIFACT_MAGIC {
1777 return Err(ZhArtifactPayloadError::InvalidIndexedMagic { magic });
1778 }
1779
1780 let version = read_u32_le_bytes(bytes, 8, "version")?;
1781 if version != INDEXED_ARTIFACT_VERSION {
1782 return Err(ZhArtifactPayloadError::UnsupportedIndexedVersion { version });
1783 }
1784 let pinyin_view = pinyin_view_from_header_value(read_u32_le_bytes(bytes, 12, "pinyin_view")?)?;
1785 let entry_count = read_u64_le_bytes(bytes, 16, "entry_count")?;
1786 let fst_len = read_u64_le_bytes(bytes, 24, "fst_len")?;
1787 let readings_len = read_u64_le_bytes(bytes, 32, "readings_len")?;
1788 let entries = checked_indexed_usize("entry_count", entry_count)?;
1789 check_limit("entry_count", entries, MAX_ARTIFACT_ENTRIES)?;
1790 Ok(ZhIndexedArtifactPayloadHeader {
1791 version,
1792 pinyin_view,
1793 entries,
1794 fst_len: checked_indexed_usize("fst_len", fst_len)?,
1795 readings_len: checked_indexed_usize("readings_len", readings_len)?,
1796 })
1797}
1798
1799fn read_u32_le_bytes(
1800 bytes: &[u8],
1801 offset: usize,
1802 field: &'static str,
1803) -> Result<u32, ZhArtifactPayloadError> {
1804 let end = offset
1805 .checked_add(4)
1806 .ok_or(ZhArtifactPayloadError::TruncatedIndexed { field })?;
1807 let chunk = bytes
1808 .get(offset..end)
1809 .ok_or(ZhArtifactPayloadError::TruncatedIndexed { field })?;
1810 Ok(u32::from_le_bytes(
1811 chunk.try_into().expect("slice length is 4"),
1812 ))
1813}
1814
1815fn read_u64_le_bytes(
1816 bytes: &[u8],
1817 offset: usize,
1818 field: &'static str,
1819) -> Result<u64, ZhArtifactPayloadError> {
1820 let end = offset
1821 .checked_add(8)
1822 .ok_or(ZhArtifactPayloadError::TruncatedIndexed { field })?;
1823 let chunk = bytes
1824 .get(offset..end)
1825 .ok_or(ZhArtifactPayloadError::TruncatedIndexed { field })?;
1826 Ok(u64::from_le_bytes(
1827 chunk.try_into().expect("slice length is 8"),
1828 ))
1829}
1830
/// Converts a `u64` length read from an indexed payload into `usize`.
///
/// Returns `ZhArtifactPayloadError::IndexedSectionTooLarge` naming `field` when the
/// value does not fit in `usize` on this platform.
fn checked_indexed_usize(field: &'static str, len: u64) -> Result<usize, ZhArtifactPayloadError> {
    usize::try_from(len).map_err(|_| ZhArtifactPayloadError::IndexedSectionTooLarge { field, len })
}
1834
1835fn check_payload_file_size(path: &Path) -> Result<(), ZhArtifactPayloadError> {
1836 let len = std::fs::metadata(path)?.len();
1837 if len > MAX_ARTIFACT_PAYLOAD_BYTES {
1838 return Err(ZhArtifactPayloadError::ArtifactLimitExceeded {
1839 field: "payload_bytes",
1840 len,
1841 max: MAX_ARTIFACT_PAYLOAD_BYTES,
1842 });
1843 }
1844 Ok(())
1845}
1846
1847fn check_limit(field: &'static str, len: usize, max: usize) -> Result<(), ZhArtifactPayloadError> {
1848 if len > max {
1849 return Err(ZhArtifactPayloadError::ArtifactLimitExceeded {
1850 field,
1851 len: len as u64,
1852 max: max as u64,
1853 });
1854 }
1855 Ok(())
1856}
1857
1858fn write_indexed_reading_block(
1859 writer: &mut Vec<u8>,
1860 readings: &[String],
1861) -> Result<(), ZhArtifactPayloadError> {
1862 write_u32_len(writer, "reading_count", readings.len())?;
1863 for reading in readings {
1864 write_binary_string(writer, "reading", reading)?;
1865 }
1866 Ok(())
1867}
1868
1869impl IndexedZhPayload {
1870 fn validate(&self, pinyin_view: PinyinView) -> Result<(), ZhArtifactPayloadError> {
1871 let mut stream = self.map.stream();
1872 while let Some((surface, offset)) = stream.next() {
1873 let surface = String::from_utf8(surface.to_vec()).map_err(|source| {
1874 ZhArtifactPayloadError::InvalidIndexedUtf8 {
1875 field: "surface",
1876 source,
1877 }
1878 })?;
1879 if surface.is_empty() {
1880 return Err(ZhArtifactPayloadError::EmptySurface { entry_index: 0 });
1881 }
1882 let readings = self.readings_at(offset)?;
1883 if readings.is_empty() {
1884 return Err(ZhArtifactPayloadError::EmptyReadings { surface });
1885 }
1886 let mut seen = BTreeSet::new();
1887 for (reading_index, reading) in readings.iter().enumerate() {
1888 if reading.is_empty() {
1889 return Err(ZhArtifactPayloadError::EmptyReading {
1890 surface: surface.clone(),
1891 reading_index,
1892 });
1893 }
1894 let normalized = normalize_artifact_reading(reading, pinyin_view);
1895 if normalized != *reading {
1896 return Err(ZhArtifactPayloadError::ReadingNotNormalized {
1897 surface: surface.clone(),
1898 reading: reading.clone(),
1899 normalized,
1900 });
1901 }
1902 if !seen.insert(reading) {
1903 return Err(ZhArtifactPayloadError::DuplicateReading {
1904 surface: surface.clone(),
1905 reading: reading.clone(),
1906 });
1907 }
1908 }
1909 }
1910 Ok(())
1911 }
1912
1913 fn readings(&self, surface: &str) -> Result<Option<Vec<String>>, ZhArtifactPayloadError> {
1914 self.map
1915 .get(surface)
1916 .map(|offset| self.readings_at(offset))
1917 .transpose()
1918 }
1919
1920 fn entries(&self) -> Result<Vec<ZhReadingIndexPayloadEntry>, ZhArtifactPayloadError> {
1921 let mut entries = Vec::with_capacity(self.entries);
1922 let mut stream = self.map.stream();
1923 while let Some((surface, offset)) = stream.next() {
1924 let surface = String::from_utf8(surface.to_vec()).map_err(|source| {
1925 ZhArtifactPayloadError::InvalidIndexedUtf8 {
1926 field: "surface",
1927 source,
1928 }
1929 })?;
1930 let readings = self.readings_at(offset)?;
1931 entries.push(ZhReadingIndexPayloadEntry { surface, readings });
1932 }
1933 Ok(entries)
1934 }
1935
1936 fn readings_at(&self, offset: u64) -> Result<Vec<String>, ZhArtifactPayloadError> {
1937 read_indexed_readings_at_bytes(&self.mmap, self.readings_start, offset)
1938 }
1939}
1940
1941fn read_indexed_readings_at_bytes(
1942 bytes: &[u8],
1943 readings_start: usize,
1944 offset: u64,
1945) -> Result<Vec<String>, ZhArtifactPayloadError> {
1946 let offset = usize::try_from(offset)
1947 .map_err(|_| ZhArtifactPayloadError::InvalidIndexedOffset { offset })?;
1948 let start =
1949 readings_start
1950 .checked_add(offset)
1951 .ok_or(ZhArtifactPayloadError::InvalidIndexedOffset {
1952 offset: offset as u64,
1953 })?;
1954 if start >= bytes.len() {
1955 return Err(ZhArtifactPayloadError::InvalidIndexedOffset {
1956 offset: offset as u64,
1957 });
1958 }
1959 let mut cursor = start;
1960 let reading_count = read_u32_le_bytes(bytes, cursor, "reading_count")? as usize;
1961 check_limit(
1962 "reading_count",
1963 reading_count,
1964 MAX_ARTIFACT_READINGS_PER_ENTRY,
1965 )?;
1966 cursor += 4;
1967 let mut readings = Vec::with_capacity(reading_count);
1968 for _ in 0..reading_count {
1969 let len = read_u32_le_bytes(bytes, cursor, "reading_len")? as usize;
1970 check_limit("reading_bytes", len, MAX_ARTIFACT_STRING_BYTES)?;
1971 cursor += 4;
1972 let end = cursor
1973 .checked_add(len)
1974 .ok_or(ZhArtifactPayloadError::TruncatedIndexed { field: "reading" })?;
1975 let reading_bytes = bytes
1976 .get(cursor..end)
1977 .ok_or(ZhArtifactPayloadError::TruncatedIndexed { field: "reading" })?;
1978 let reading = String::from_utf8(reading_bytes.to_vec()).map_err(|source| {
1979 ZhArtifactPayloadError::InvalidIndexedUtf8 {
1980 field: "reading",
1981 source,
1982 }
1983 })?;
1984 readings.push(reading);
1985 cursor = end;
1986 }
1987 Ok(readings)
1988}
1989
/// Returns `true` when `surface` is non-empty and consists solely of ASCII
/// characters, which is the condition for treating it as a direct pinyin path.
fn can_build_direct_pinyin_path(surface: &str) -> bool {
    !surface.is_empty() && surface.is_ascii()
}
1993
/// Computes the hex-encoded SHA-256 digest of the file at `path`.
///
/// Opens the file and delegates to [`artifact_file_digest_reader`].
///
/// # Errors
///
/// Returns the I/O error from opening or reading the file.
pub fn artifact_file_digest_path(path: impl AsRef<Path>) -> Result<String, std::io::Error> {
    let file = File::open(path)?;
    artifact_file_digest_reader(file)
}
1999
2000pub fn artifact_file_digest_reader(mut reader: impl Read) -> Result<String, std::io::Error> {
2002 let mut hasher = Sha256::new();
2003 let mut buffer = [0_u8; 64 * 1024];
2004 loop {
2005 let read = reader.read(&mut buffer)?;
2006 if read == 0 {
2007 break;
2008 }
2009 hasher.update(&buffer[..read]);
2010 }
2011 Ok(sha256_digest_hex(hasher.finalize()))
2012}
2013
2014fn validate_artifact_payload_header(
2015 payload: &ZhReadingIndexPayload,
2016) -> Result<(), ZhArtifactPayloadError> {
2017 if payload.schema_version != ARTIFACT_PAYLOAD_SCHEMA_VERSION {
2018 return Err(ZhArtifactPayloadError::UnsupportedSchemaVersion {
2019 version: payload.schema_version,
2020 });
2021 }
2022 if payload.payload_type != ARTIFACT_PAYLOAD_TYPE {
2023 return Err(ZhArtifactPayloadError::UnsupportedPayloadType {
2024 payload_type: payload.payload_type.clone(),
2025 });
2026 }
2027 Ok(())
2028}
2029
2030fn canonical_payload_bytes(payload: &ZhReadingIndexPayload) -> Vec<u8> {
2031 let mut bytes = Vec::new();
2032 bytes.extend_from_slice(b"moine.zh.reading-index.surface-readings/v1\n");
2033 push_len_prefixed(&mut bytes, b"V", &payload.pinyin_view);
2034 for entry in &payload.entries {
2035 push_len_prefixed(&mut bytes, b"S", &entry.surface);
2036 bytes.extend_from_slice(format!("R{}\n", entry.readings.len()).as_bytes());
2037 for reading in &entry.readings {
2038 push_len_prefixed(&mut bytes, b"r", reading);
2039 }
2040 }
2041 bytes
2042}
2043
/// Appends one record to `bytes`: `tag`, the decimal byte length of `value`, a
/// newline, the bytes of `value`, and a final newline.
fn push_len_prefixed(bytes: &mut Vec<u8>, tag: &[u8], value: &str) {
    let decimal_len = value.len().to_string();
    bytes.reserve(tag.len() + decimal_len.len() + value.len() + 2);
    bytes.extend(
        tag.iter()
            .chain(decimal_len.as_bytes())
            .chain(b"\n")
            .chain(value.as_bytes())
            .chain(b"\n"),
    );
}
2051
/// Returns the SHA-256 digest of `bytes`, hex-encoded by `sha256_digest_hex`.
fn sha256_hex(bytes: &[u8]) -> String {
    sha256_digest_hex(Sha256::digest(bytes))
}
2055
/// Formats each byte of `digest` as two lowercase hexadecimal digits and
/// concatenates them.
fn sha256_digest_hex(digest: impl IntoIterator<Item = u8>) -> String {
    digest
        .into_iter()
        .fold(String::with_capacity(64), |mut hex, byte| {
            write!(hex, "{byte:02x}").expect("writing to String should not fail");
            hex
        })
}
2063
// The surface and pinyin fields extracted from one dictionary line by
// `parse_cedict_entry`. All fields borrow from the parsed line.
struct CedictEntry<'a> {
    // First whitespace-delimited token of the line.
    traditional: &'a str,
    // Second whitespace-delimited token of the line.
    simplified: &'a str,
    // Text found between the pinyin brackets, without the brackets themselves.
    pinyin: &'a str,
}
2069
2070fn parse_cedict_entry(line: &str, line_number: usize) -> Result<CedictEntry<'_>, CedictError> {
2071 let (traditional, rest) = take_token(line)
2072 .ok_or_else(|| invalid_entry(line_number, "missing traditional surface"))?;
2073 let (simplified, rest) = take_token(rest.trim_start())
2074 .ok_or_else(|| invalid_entry(line_number, "missing simplified surface"))?;
2075 let rest = rest.trim_start();
2076
2077 let (pinyin, rest) = if let Some(after_open) = rest.strip_prefix("[[") {
2078 let Some(end) = after_open.find("]]") else {
2079 return Err(invalid_entry(line_number, "missing closing ]] for pinyin"));
2080 };
2081 (&after_open[..end], &after_open[end + 2..])
2082 } else if let Some(after_open) = rest.strip_prefix('[') {
2083 let Some(end) = after_open.find(']') else {
2084 return Err(invalid_entry(line_number, "missing closing ] for pinyin"));
2085 };
2086 (&after_open[..end], &after_open[end + 1..])
2087 } else {
2088 return Err(invalid_entry(line_number, "missing pinyin bracket"));
2089 };
2090
2091 if pinyin.is_empty() {
2092 return Err(invalid_entry(line_number, "empty pinyin field"));
2093 }
2094 if !rest.trim_start().starts_with('/') {
2095 return Err(invalid_entry(line_number, "missing definition slash"));
2096 }
2097
2098 Ok(CedictEntry {
2099 traditional,
2100 simplified,
2101 pinyin,
2102 })
2103}
2104
/// Builds a `CedictError::InvalidEntry` for `line` with the given `message`.
fn invalid_entry(line: usize, message: impl Into<String>) -> CedictError {
    CedictError::InvalidEntry {
        line,
        message: message.into(),
    }
}
2111
/// Splits off the first whitespace-delimited token of `input`.
///
/// Leading whitespace is skipped. Returns the token together with the remainder,
/// which begins at the whitespace character that ended the token (or is empty when
/// the token runs to the end). Returns `None` when `input` has no token at all.
fn take_token(input: &str) -> Option<(&str, &str)> {
    let trimmed = input.trim_start();
    if trimmed.is_empty() {
        return None;
    }
    let token_len = trimmed
        .find(char::is_whitespace)
        .unwrap_or(trimmed.len());
    Some(trimmed.split_at(token_len))
}
2124
2125#[cfg(test)]
2126mod tests {
2127 use super::*;
2128
    // Exercises `normalize_pinyin` in both views: tone stripping, `u:` folding,
    // purely numeric tokens, and single-letter tokens.
    #[test]
    fn normalizes_pinyin_views() {
        assert_eq!(
            normalize_pinyin("Wei1 shi4 ji4", PinyinView::NoTone),
            "weishiji"
        );
        assert_eq!(
            normalize_pinyin("Wei1 shi4 ji4", PinyinView::Tone3),
            "wei1shi4ji4"
        );
        assert_eq!(normalize_pinyin("nu:3 er2", PinyinView::NoTone), "nver");
        assert_eq!(normalize_pinyin("nu:3 er2", PinyinView::Tone3), "nv3er2");
        assert_eq!(normalize_pinyin("hua1 r5", PinyinView::NoTone), "huar");
        assert_eq!(normalize_pinyin("11 Qu1", PinyinView::NoTone), "11qu");
        assert_eq!(normalize_pinyin("Shuang1 11", PinyinView::NoTone), "shuang");
        assert_eq!(
            normalize_pinyin("D N A jian4 ding4", PinyinView::NoTone),
            "dnajianding"
        );
    }
2149
    // Builds an index with default options from a small dictionary (including a
    // comment line) and checks the view is `NoTone` and the readings are tone-free.
    #[test]
    fn builds_no_tone_index_from_cedict() {
        let cedict = "\
# CC-CEDICT
威士忌 威士忌 [Wei1 shi4 ji4] /whisky/
布納哈本 布纳哈本 [Bu4 na4 ha1 ben3] /Bunnahabhain/
女兒 女儿 [nu:3 er2] /daughter/
";
        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();

        assert_eq!(index.pinyin_view(), PinyinView::NoTone);
        assert_eq!(
            index.readings("威士忌").as_deref(),
            Some(&["weishiji".to_string()][..])
        );
        assert_eq!(
            index.readings("布纳哈本").as_deref(),
            Some(&["bunahaben".to_string()][..])
        );
        assert_eq!(
            index.readings("女儿").as_deref(),
            Some(&["nver".to_string()][..])
        );
    }
2174
    // Requesting `PinyinView::Tone3` through the options keeps tone digits in the
    // stored reading.
    #[test]
    fn builds_tone3_index_when_requested() {
        let cedict = "威士忌 威士忌 [Wei1 shi4 ji4] /whisky/\n";
        let index = CedictReadingIndex::from_cedict_reader_with_options(
            cedict.as_bytes(),
            CedictIndexOptions {
                pinyin_view: PinyinView::Tone3,
                ..CedictIndexOptions::default()
            },
        )
        .unwrap();

        assert_eq!(index.pinyin_view(), PinyinView::Tone3);
        assert_eq!(
            index.readings("威士忌").as_deref(),
            Some(&["wei1shi4ji4".to_string()][..])
        );
    }
2193
    // Readings that differ only by case (`Le4` / `le4`) collapse into one entry in
    // both views, leaving two distinct readings for the surface.
    #[test]
    fn deduplicates_after_normalization() {
        let cedict = "\
樂 乐 [Le4] /surname Le/
樂 乐 [le4] /happy/
樂 乐 [Yue4] /surname Yue/
";
        let no_tone = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
        let tone3 = CedictReadingIndex::from_cedict_reader_with_options(
            cedict.as_bytes(),
            CedictIndexOptions {
                pinyin_view: PinyinView::Tone3,
                ..CedictIndexOptions::default()
            },
        )
        .unwrap();

        assert_eq!(
            no_tone.readings("乐").as_deref(),
            Some(&["le".to_string(), "yue".to_string()][..])
        );
        assert_eq!(
            tone3.readings("乐").as_deref(),
            Some(&["le4".to_string(), "yue4".to_string()][..])
        );
    }
2220
    // A line whose pinyin is not wrapped in brackets is rejected as an invalid
    // entry reported for line 1.
    #[test]
    fn rejects_malformed_entries() {
        let err = CedictReadingIndex::from_cedict_reader(
            "威士忌 威士忌 Wei1 shi4 ji4 /whisky/\n".as_bytes(),
        )
        .unwrap_err();

        assert!(matches!(err, CedictError::InvalidEntry { line: 1, .. }));
    }
2230
    // With `longest_match_only`, the full three-character entry wins over the
    // shorter split, giving a single one-segment path and one pruned span.
    #[test]
    fn computes_dictionary_paths_and_stats() {
        let cedict = "\
威 威 [wei1] /power/
士忌 士忌 [shi4 ji4] /whisky transcription tail/
威士忌 威士忌 [Wei1 shi4 ji4] /whisky/
";
        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
        let expansion = index.reading_paths_with_stats(
            "威士忌",
            PinyinReadingOptions {
                longest_match_only: true,
                ..PinyinReadingOptions::default()
            },
        );

        assert_eq!(expansion.paths.len(), 1);
        assert_eq!(expansion.paths[0].joined_reading, "weishiji");
        assert_eq!(
            expansion.paths[0].segments,
            vec![PinyinReadingSegment {
                surface: "威士忌".to_string(),
                reading: "weishiji".to_string(),
            }]
        );
        assert_eq!(expansion.stats.longest_match_pruned_spans, 1);
    }
2258
    // Hybrid expansion joins an ASCII prefix with the dictionary reading of the
    // trailing Chinese character into one reading.
    #[test]
    fn hybrid_paths_allow_ascii_prefix_and_dictionary_tail() {
        let cedict = "忌 忌 [ji4] /whisky transcription character/\n";
        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
        let paths = index.hybrid_reading_paths("weishi忌", PinyinReadingOptions::default());

        assert_eq!(paths.len(), 1);
        assert_eq!(paths[0].joined_reading, "weishiji");
    }
2268
    // Typed pinyin and the matching Chinese surface have lattice distance zero,
    // while their surface distance is larger.
    #[test]
    fn compare_matches_pinyin_input_to_chinese_surface() {
        let cedict = "威士忌 威士忌 [Wei1 shi4 ji4] /whisky/\n";
        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
        let distances = compare_with_cedict_index(
            "weishiji",
            "威士忌",
            &index,
            PinyinReadingOptions::default(),
        )
        .unwrap();

        assert_eq!(distances.lattice, 0);
        assert_eq!(distances.lattice_damerau, 0);
        assert!(distances.surface_damerau > distances.lattice);
    }
2285
    // Two ASCII inputs differing by one adjacent swap (`ei` / `ie`) score 2 under
    // the plain lattice distance and 1 under the Damerau variant; an empty index
    // suffices because both inputs take the direct path.
    #[test]
    fn lattice_damerau_counts_adjacent_pinyin_transposition() {
        let distances = compare_with_cedict_index(
            "weishiji",
            "wieshiji",
            &CedictReadingIndex::default(),
            PinyinReadingOptions::default(),
        )
        .unwrap();

        assert_eq!(distances.lattice, 2);
        assert_eq!(distances.lattice_damerau, 1);
    }
2299
    // Typed pinyin and the matching Chinese surface reach the maximum normalized
    // similarity of 1.0.
    #[test]
    fn normalized_similarity_matches_pinyin_input_to_chinese_surface() {
        let cedict = "威士忌 威士忌 [Wei1 shi4 ji4] /whisky/\n";
        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
        let similarity = normalized_similarity_with_zh_index(
            "weishiji",
            "威士忌",
            &index,
            PinyinReadingOptions::default(),
        )
        .unwrap();

        assert_eq!(similarity, 1.0);
    }
2314
    // Round-trips an index through its in-memory artifact payload and checks the
    // view, the readings, and the payload checksum are preserved.
    #[test]
    fn emits_and_loads_artifact_payload() {
        let cedict = "\
威士忌 威士忌 [Wei1 shi4 ji4] /whisky/
布納哈本 布纳哈本 [Bu4 na4 ha1 ben3] /Bunnahabhain/
";
        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
        let payload = index.artifact_payload();
        let loaded = ZhReadingIndex::from_artifact_payload(payload).unwrap();

        assert_eq!(loaded.pinyin_view(), PinyinView::NoTone);
        assert_eq!(
            loaded.readings("威士忌").as_deref(),
            Some(&["weishiji".to_string()][..])
        );
        assert_eq!(
            loaded.readings("布纳哈本").as_deref(),
            Some(&["bunahaben".to_string()][..])
        );
        assert_eq!(
            loaded.artifact_payload_checksum(),
            index.artifact_payload_checksum()
        );
    }
2339
    // Writes the indexed binary payload, loads it back both from a temporary file
    // and from the in-memory bytes, and checks lookups, payload equality, and the
    // checksum.
    #[test]
    fn indexed_artifact_payload_round_trips_and_supports_lookup() {
        let cedict = "\
威士忌 威士忌 [Wei1 shi4 ji4] /whisky/
布納哈本 布纳哈本 [Bu4 na4 ha1 ben3] /Bunnahabhain/
";
        let index = CedictReadingIndex::from_cedict_reader(cedict.as_bytes()).unwrap();
        let mut bytes = Vec::new();
        index.write_indexed_artifact_payload(&mut bytes).unwrap();
        // Process id plus a nanosecond timestamp keeps the temp file name unique.
        let path = std::env::temp_dir().join(format!(
            "moine-zh-indexed-test-{}-{}.moineidx",
            std::process::id(),
            std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .unwrap()
                .as_nanos()
        ));
        std::fs::write(&path, &bytes).unwrap();
        let loaded = ZhReadingIndex::from_indexed_artifact_payload_path(&path).unwrap();
        std::fs::remove_file(&path).unwrap();
        let loaded_from_bytes = ZhReadingIndex::from_indexed_artifact_payload_bytes(&bytes)
            .expect("indexed payload bytes should load");

        assert_eq!(loaded.pinyin_view(), PinyinView::NoTone);
        assert_eq!(
            loaded.readings("威士忌").as_deref(),
            Some(&["weishiji".to_string()][..])
        );
        assert_eq!(
            loaded_from_bytes.artifact_payload(),
            index.artifact_payload()
        );
        assert_eq!(
            loaded.readings("布纳哈本").as_deref(),
            Some(&["bunahaben".to_string()][..])
        );
        assert_eq!(
            loaded.artifact_payload_checksum(),
            index.artifact_payload_checksum()
        );
    }
2381
    // Metadata generated for an index reflects the artifact type, the build
    // options, the query defaults, and the default license selection.
    #[test]
    fn artifact_metadata_records_build_and_license() {
        let cedict = "威士忌 威士忌 [Wei1 shi4 ji4] /whisky/\n";
        let options = CedictIndexOptions {
            pinyin_view: PinyinView::Tone3,
            max_readings_per_surface: Some(4),
        };
        let index = CedictReadingIndex::from_cedict_reader_with_options(cedict.as_bytes(), options)
            .unwrap();
        let metadata = index.artifact_metadata(ZhArtifactMetadataOptions {
            artifact_name: "moine-cedict-test".to_string(),
            generator: "test".to_string(),
            payload_file_name: "payload.yaml".to_string(),
            payload_format: "yaml.surface-readings.v1".to_string(),
            source_name: "CC-CEDICT".to_string(),
            source_version: "2026-05-20".to_string(),
            source_cedict: "cedict.txt".to_string(),
            index_options: options,
            query_defaults: PinyinReadingOptions {
                longest_match_only: true,
                ..PinyinReadingOptions::default()
            },
            license: ZhArtifactLicense::default(),
        });

        assert_eq!(metadata.artifact_type, "moine.zh.reading-index");
        assert_eq!(metadata.build.pinyin_view, "tone3");
        assert_eq!(metadata.build.max_readings_per_surface, Some(4));
        assert!(metadata.query_defaults.longest_match_only);
        assert_eq!(metadata.license.selected_license, "CC BY-SA 4.0");
    }
2413
    // A payload listing the same surface twice is rejected with
    // `DuplicateSurface`.
    #[test]
    fn rejects_duplicate_artifact_surface() {
        let payload = ZhReadingIndexPayload {
            schema_version: 1,
            payload_type: "moine.zh.reading-index.surface-readings".to_string(),
            pinyin_view: "no-tone".to_string(),
            entries: vec![
                ZhReadingIndexPayloadEntry {
                    surface: "威士忌".to_string(),
                    readings: vec!["weishiji".to_string()],
                },
                ZhReadingIndexPayloadEntry {
                    surface: "威士忌".to_string(),
                    readings: vec!["weishiji".to_string()],
                },
            ],
        };
        let err = ZhReadingIndex::from_artifact_payload(payload).unwrap_err();

        assert!(matches!(
            err,
            ZhArtifactPayloadError::DuplicateSurface { .. }
        ));
    }
2438
2439 #[test]
2440 fn rejects_artifact_payload_excessive_reading_count() {
2441 let payload = ZhReadingIndexPayload {
2442 schema_version: 1,
2443 payload_type: "moine.zh.reading-index.surface-readings".to_string(),
2444 pinyin_view: "no-tone".to_string(),
2445 entries: vec![ZhReadingIndexPayloadEntry {
2446 surface: "威士忌".to_string(),
2447 readings: vec!["weishiji".to_string(); MAX_ARTIFACT_READINGS_PER_ENTRY + 1],
2448 }],
2449 };
2450 let err = ZhReadingIndex::from_artifact_payload(payload).unwrap_err();
2451
2452 assert!(matches!(
2453 err,
2454 ZhArtifactPayloadError::ArtifactLimitExceeded {
2455 field: "reading_count",
2456 ..
2457 }
2458 ));
2459 }
2460
2461 #[test]
2462 fn rejects_non_normalized_artifact_reading() {
2463 let payload = ZhReadingIndexPayload {
2464 schema_version: 1,
2465 payload_type: "moine.zh.reading-index.surface-readings".to_string(),
2466 pinyin_view: "no-tone".to_string(),
2467 entries: vec![ZhReadingIndexPayloadEntry {
2468 surface: "威士忌".to_string(),
2469 readings: vec!["Wei1shi4ji4".to_string()],
2470 }],
2471 };
2472 let err = ZhReadingIndex::from_artifact_payload(payload).unwrap_err();
2473
2474 assert!(matches!(
2475 err,
2476 ZhArtifactPayloadError::ReadingNotNormalized { .. }
2477 ));
2478 }
2479
2480 #[test]
2481 fn no_tone_artifact_rejects_tone_digits_after_letters() {
2482 let payload = ZhReadingIndexPayload {
2483 schema_version: 1,
2484 payload_type: "moine.zh.reading-index.surface-readings".to_string(),
2485 pinyin_view: "no-tone".to_string(),
2486 entries: vec![ZhReadingIndexPayloadEntry {
2487 surface: "威士忌".to_string(),
2488 readings: vec!["wei1shi4ji4".to_string()],
2489 }],
2490 };
2491 let err = ZhReadingIndex::from_artifact_payload(payload).unwrap_err();
2492
2493 assert!(matches!(
2494 err,
2495 ZhArtifactPayloadError::ReadingNotNormalized { .. }
2496 ));
2497 }
2498
2499 #[test]
2500 fn artifact_validation_keeps_numeric_tokens_in_no_tone_view() {
2501 let payload = ZhReadingIndexPayload {
2502 schema_version: 1,
2503 payload_type: "moine.zh.reading-index.surface-readings".to_string(),
2504 pinyin_view: "no-tone".to_string(),
2505 entries: vec![ZhReadingIndexPayloadEntry {
2506 surface: "11区".to_string(),
2507 readings: vec!["11qu".to_string()],
2508 }],
2509 };
2510 let index = ZhReadingIndex::from_artifact_payload(payload).unwrap();
2511
2512 assert_eq!(
2513 index.readings("11区").as_deref(),
2514 Some(&["11qu".to_string()][..])
2515 );
2516 }
2517
2518 #[test]
2519 fn tone3_view_preserves_tone_digits() {
2520 let cedict = "重 重 [chong2] /again/\n重 重 [zhong4] /heavy/\n";
2521 let index = CedictReadingIndex::from_cedict_reader_with_options(
2522 cedict.as_bytes(),
2523 CedictIndexOptions {
2524 pinyin_view: PinyinView::Tone3,
2525 ..CedictIndexOptions::default()
2526 },
2527 )
2528 .unwrap();
2529 let distances =
2530 compare_with_cedict_index("zhong4", "重", &index, PinyinReadingOptions::default())
2531 .unwrap();
2532
2533 assert_eq!(distances.lattice, 0);
2534 }
2535
2536 #[test]
2537 fn unknown_han_without_dictionary_path_is_rejected() {
2538 let index = CedictReadingIndex::default();
2539 let err =
2540 cedict_or_direct_lattice("印", &index, PinyinReadingOptions::default()).unwrap_err();
2541
2542 assert!(matches!(err, CnLatticeError::UnsupportedDirectInput { .. }));
2543 }
2544}