1use std::borrow::Cow;
2use std::collections::{btree_map::Entry, BTreeSet, HashMap};
3use std::error::Error;
4use std::fmt;
5use std::fmt::Write as _;
6use std::fs::File;
7use std::io::{Read, Write};
8use std::path::Path;
9use std::string::FromUtf8Error;
10use std::sync::Arc;
11
12use fst::{Map, MapBuilder, Streamer};
13use memmap2::Mmap;
14use moine_core::Lattice;
15use serde::{Deserialize, Serialize};
16use sha2::{Digest, Sha256};
17
18use crate::romaji::{
19 can_build_romaji_paths, romaji_paths_from_reading_segments,
20 romaji_symbol_paths_from_reading_segments, JaLatticeError,
21};
22
// Column positions read from each UniDic lex CSV record.
const SURFACE_COLUMN: usize = 0;
const POS1_COLUMN: usize = 4;
const LFORM_COLUMN: usize = 10;
const PRON_COLUMN: usize = 13;

// Identification of the logical (YAML) payload document.
const ARTIFACT_PAYLOAD_SCHEMA_VERSION: u32 = 1;
const ARTIFACT_PAYLOAD_TYPE: &str = "moine.unidic.reading-index.surface-readings";

// Framing of the flat binary artifact.
const BINARY_ARTIFACT_MAGIC: &[u8; 8] = b"MOINEU01";
const BINARY_ARTIFACT_VERSION: u32 = 1;

// Framing of the indexed (FST + readings) artifact.
const INDEXED_ARTIFACT_MAGIC: &[u8; 8] = b"MOINEI01";
const INDEXED_ARTIFACT_VERSION: u32 = 1;
// magic (8) + version (4) + reserved (4) + entries (8) + fst_len (8) + readings_len (8),
// matching the fields emitted by the indexed artifact writer.
const INDEXED_ARTIFACT_HEADER_LEN: usize = 8 + 4 + 4 + 8 + 8 + 8;

// Hard limits applied while loading artifacts.
const MAX_ARTIFACT_PAYLOAD_BYTES: u64 = 512 << 20; // 512 MiB
const MAX_ARTIFACT_ENTRIES: usize = 2_000_000;
const MAX_ARTIFACT_READINGS_PER_ENTRY: usize = 256;
const MAX_ARTIFACT_STRING_BYTES: usize = 16 << 10; // 16 KiB

/// Name of the checksum algorithm recorded in newly generated artifact metadata.
pub const ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM: &str = "sha256-canonical-v1";
/// Name of the older FNV-1a based checksum algorithm that can still be computed.
pub const LEGACY_ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM: &str = "fnv1a64-canonical-v1";
/// Name of the digest algorithm used for whole-file payload digests.
pub const ARTIFACT_PAYLOAD_FILE_DIGEST_ALGORITHM: &str = "sha256-file-v1";
44
45#[derive(Clone, Debug)]
47pub struct UnidicReadingIndex {
48 storage: UnidicReadingStorage,
49}
50
51#[derive(Clone, Debug)]
52enum UnidicReadingStorage {
53 Eager(HashMap<String, Vec<String>>),
54 Indexed(IndexedUnidicPayload),
55}
56
57impl Default for UnidicReadingIndex {
58 fn default() -> Self {
59 Self {
60 storage: UnidicReadingStorage::Eager(HashMap::new()),
61 }
62 }
63}
64
65impl PartialEq for UnidicReadingIndex {
66 fn eq(&self, other: &Self) -> bool {
67 self.artifact_payload() == other.artifact_payload()
68 }
69}
70
71impl Eq for UnidicReadingIndex {}
72
73#[derive(Clone, Debug)]
74struct IndexedUnidicPayload {
75 mmap: Arc<Mmap>,
76 map: Map<Vec<u8>>,
77 readings_start: usize,
78 entries: usize,
79}
80
/// Decoded header of an indexed artifact payload.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct UnidicIndexedArtifactPayloadHeader {
    /// Format version stored in the header.
    pub version: u32,
    /// Number of entries declared by the header.
    pub entries: usize,
    /// Length in bytes of the FST section that follows the header.
    pub fst_len: usize,
    /// Length in bytes of the readings section that follows the FST section.
    pub readings_len: usize,
}
93
/// Decoded header of a flat binary artifact payload.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct UnidicBinaryArtifactPayloadHeader {
    /// Format version stored in the header.
    pub version: u32,
    /// Number of entries declared by the header.
    pub entries: usize,
}
102
/// Tuning knobs for dictionary-based reading expansion.
///
/// NOTE(review): the expansion routine that interprets these fields is outside
/// this excerpt; the field notes below restate the names only.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct DictionaryReadingOptions {
    /// Maximum span length, in characters.
    pub max_span_chars: usize,
    /// Maximum number of paths.
    pub max_paths: usize,
    /// Whether only the longest match is kept.
    pub longest_match_only: bool,
    /// Optional cap on readings per segment; `None` means no cap is configured.
    pub max_readings_per_segment: Option<usize>,
}
115
/// One segment of a reading path: a surface string paired with a reading.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DictionaryReadingSegment {
    /// Surface text of the segment.
    pub surface: String,
    /// Reading assigned to the segment.
    pub reading: String,
}
124
125#[derive(Clone, Debug, Eq, PartialEq)]
127pub struct DictionaryReadingPath {
128 pub segments: Vec<DictionaryReadingSegment>,
130 pub joined_reading: String,
132}
133
134#[derive(Clone, Debug, Default, Eq, PartialEq)]
136pub struct DictionaryReadingExpansion {
137 pub paths: Vec<DictionaryReadingPath>,
139 pub stats: DictionaryReadingStats,
141}
142
/// Counters describing one reading expansion.
///
/// NOTE(review): the accounting rules live in the expansion routine, which is
/// outside this excerpt; the field notes below restate the names only.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct DictionaryReadingStats {
    /// Count of matched spans.
    pub matched_spans: usize,
    /// Count of spans handled by direct fallback.
    pub direct_fallback_spans: usize,
    /// Count of spans pruned by longest-match selection.
    pub longest_match_pruned_spans: usize,
    /// Count of segment readings before pruning.
    pub raw_segment_readings: usize,
    /// Count of segment readings actually used.
    pub used_segment_readings: usize,
    /// Count of segment readings pruned.
    pub pruned_segment_readings: usize,
    /// Count of candidate combinations.
    pub candidate_combinations: usize,
    /// Count of unique paths.
    pub unique_paths: usize,
    /// Count of duplicate joined readings.
    pub duplicate_joined_readings: usize,
    /// Count of times the path limit was hit.
    pub max_paths_hit_count: usize,
}
167
168pub fn romaji_lattice_from_reading_paths(
170 paths: &[DictionaryReadingPath],
171) -> Result<Lattice, JaLatticeError> {
172 if paths.is_empty() {
173 return Err(JaLatticeError::EmptyReadings);
174 }
175
176 let paths = romaji_symbol_paths_from_reading_segments(
177 paths
178 .iter()
179 .map(|path| path.segments.iter().map(|segment| segment.reading.as_str())),
180 )?;
181 Ok(Lattice::from_symbol_paths_compact(paths))
182}
183
184pub fn romaji_paths_from_reading_paths(
186 paths: &[DictionaryReadingPath],
187) -> Result<Vec<String>, JaLatticeError> {
188 if paths.is_empty() {
189 return Err(JaLatticeError::EmptyReadings);
190 }
191
192 romaji_paths_from_reading_segments(
193 paths
194 .iter()
195 .map(|path| path.segments.iter().map(|segment| segment.reading.as_str())),
196 )
197}
198
/// Selects which UniDic CSV column supplies the reading of an entry.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnidicReadingField {
    /// Use the `lform` column.
    LForm,
    /// Use the `pron` column.
    Pron,
}
207
208#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
210pub struct UnidicArtifactMetadata {
211 pub schema_version: u32,
213 pub artifact_type: String,
215 pub artifact_name: String,
217 pub generator: String,
219 pub payload: UnidicArtifactPayload,
221 pub source: UnidicArtifactSource,
223 pub build: UnidicArtifactBuild,
225 pub query_defaults: UnidicArtifactQueryDefaults,
227 pub license: UnidicArtifactLicense,
229}
230
231#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
233pub struct UnidicArtifactPayload {
234 pub path: String,
236 pub format: String,
238 #[serde(default, skip_serializing_if = "Option::is_none")]
240 pub file_digest_algorithm: Option<String>,
241 #[serde(default, skip_serializing_if = "Option::is_none")]
243 pub file_digest: Option<String>,
244 pub checksum_algorithm: String,
246 pub checksum: String,
248}
249
250#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
252pub struct UnidicArtifactSource {
253 pub name: String,
255 pub version: String,
257 pub lex_csv: String,
259}
260
261#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
263pub struct UnidicArtifactBuild {
264 pub reading_field: String,
266 pub max_readings_per_surface: Option<usize>,
268 pub exclude_ascii_surfaces: bool,
270 pub exclude_symbol_pos: bool,
272 pub entries: usize,
274}
275
276#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)]
278pub struct UnidicArtifactQueryDefaults {
279 pub max_span_chars: usize,
281 pub max_paths: usize,
283 pub longest_match_only: bool,
285 pub max_readings_per_segment: Option<usize>,
287}
288
289#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
291pub struct UnidicArtifactLicense {
292 pub selected_license: String,
294 pub references: Vec<UnidicArtifactLicenseReference>,
296}
297
298#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
300pub struct UnidicArtifactLicenseReference {
301 pub label: String,
303 pub path: String,
305}
306
307#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
309pub struct UnidicReadingIndexPayload {
310 pub schema_version: u32,
312 pub payload_type: String,
314 pub entries: Vec<UnidicReadingIndexPayloadEntry>,
316}
317
318#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
320pub struct UnidicReadingIndexPayloadEntry {
321 pub surface: String,
323 pub readings: Vec<String>,
325}
326
327#[derive(Clone, Debug, Eq, PartialEq)]
329pub struct UnidicArtifactMetadataOptions {
330 pub artifact_name: String,
332 pub generator: String,
334 pub payload_file_name: String,
336 pub payload_format: String,
338 pub source_name: String,
340 pub source_version: String,
342 pub source_lex_csv: String,
344 pub index_options: UnidicIndexOptions,
346 pub query_defaults: DictionaryReadingOptions,
348 pub license: UnidicArtifactLicense,
350}
351
352#[derive(Clone, Copy, Debug, Eq, PartialEq)]
354pub struct UnidicIndexOptions {
355 pub reading_field: UnidicReadingField,
357 pub max_readings_per_surface: Option<usize>,
359 pub exclude_ascii_surfaces: bool,
361 pub exclude_symbol_pos: bool,
363}
364
365impl UnidicReadingField {
366 fn column(self) -> usize {
367 match self {
368 Self::LForm => LFORM_COLUMN,
369 Self::Pron => PRON_COLUMN,
370 }
371 }
372
373 pub fn as_str(self) -> &'static str {
375 match self {
376 Self::LForm => "lform",
377 Self::Pron => "pron",
378 }
379 }
380}
381
382impl Default for UnidicIndexOptions {
383 fn default() -> Self {
384 Self {
385 reading_field: UnidicReadingField::LForm,
386 max_readings_per_surface: None,
387 exclude_ascii_surfaces: true,
388 exclude_symbol_pos: true,
389 }
390 }
391}
392
393impl Default for DictionaryReadingOptions {
394 fn default() -> Self {
395 Self {
396 max_span_chars: 8,
397 max_paths: 1024,
398 longest_match_only: false,
399 max_readings_per_segment: None,
400 }
401 }
402}
403
404impl Default for UnidicArtifactLicense {
405 fn default() -> Self {
406 Self {
407 selected_license: "BSD-3-Clause".to_string(),
408 references: vec![
409 UnidicArtifactLicenseReference {
410 label: "BSD".to_string(),
411 path: "license/BSD".to_string(),
412 },
413 UnidicArtifactLicenseReference {
414 label: "COPYING".to_string(),
415 path: "license/COPYING".to_string(),
416 },
417 ],
418 }
419 }
420}
421
422#[derive(Debug)]
424pub enum UnidicCsvError {
425 Csv(csv::Error),
427 Io(std::io::Error),
429 MissingColumn {
431 record_index: u64,
433 column: usize,
435 len: usize,
437 },
438}
439
440#[derive(Debug)]
442pub enum UnidicArtifactPayloadError {
443 Io(std::io::Error),
445 Yaml(serde_yaml::Error),
447 InvalidBinaryMagic {
449 magic: [u8; 8],
451 },
452 UnsupportedBinaryVersion {
454 version: u32,
456 },
457 NonZeroBinaryReserved {
459 value: u32,
461 },
462 TruncatedBinary {
464 field: &'static str,
466 },
467 InvalidBinaryUtf8 {
469 field: &'static str,
471 source: FromUtf8Error,
473 },
474 BinaryValueTooLarge {
476 field: &'static str,
478 len: usize,
480 },
481 BinaryEntryCountTooLarge {
483 entries: u64,
485 },
486 ArtifactLimitExceeded {
488 field: &'static str,
490 len: u64,
492 max: u64,
494 },
495 InvalidIndexedMagic {
497 magic: [u8; 8],
499 },
500 UnsupportedIndexedVersion {
502 version: u32,
504 },
505 NonZeroIndexedReserved {
507 value: u32,
509 },
510 TruncatedIndexed {
512 field: &'static str,
514 },
515 InvalidIndexedFst {
517 message: String,
519 },
520 IndexedSectionTooLarge {
522 field: &'static str,
524 len: u64,
526 },
527 InvalidIndexedOffset {
529 offset: u64,
531 },
532 InvalidIndexedUtf8 {
534 field: &'static str,
536 source: std::str::Utf8Error,
538 },
539 IndexedEntryCountMismatch {
541 header_entries: usize,
543 fst_entries: usize,
545 },
546 UnsupportedSchemaVersion {
548 version: u32,
550 },
551 UnsupportedPayloadType {
553 payload_type: String,
555 },
556 EmptySurface {
558 entry_index: usize,
560 },
561 DuplicateSurface {
563 surface: String,
565 },
566 EmptyReadings {
568 surface: String,
570 },
571 EmptyReading {
573 surface: String,
575 reading_index: usize,
577 },
578 DuplicateReading {
580 surface: String,
582 reading: String,
584 },
585}
586
587impl fmt::Display for UnidicCsvError {
588 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
589 match self {
590 Self::Csv(err) => write!(f, "invalid UniDic CSV: {err}"),
591 Self::Io(err) => write!(f, "failed to read UniDic CSV: {err}"),
592 Self::MissingColumn {
593 record_index,
594 column,
595 len,
596 } => write!(
597 f,
598 "UniDic CSV record {record_index} has no column {column}; record has {len} columns"
599 ),
600 }
601 }
602}
603
604impl Error for UnidicCsvError {}
605
606impl fmt::Display for UnidicArtifactPayloadError {
607 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
608 match self {
609 Self::Io(err) => write!(f, "failed to read UniDic artifact payload: {err}"),
610 Self::Yaml(err) => write!(f, "invalid UniDic artifact payload YAML: {err}"),
611 Self::InvalidBinaryMagic { magic } => {
612 write!(f, "invalid UniDic binary artifact magic {magic:?}")
613 }
614 Self::UnsupportedBinaryVersion { version } => {
615 write!(f, "unsupported UniDic binary artifact version {version}")
616 }
617 Self::NonZeroBinaryReserved { value } => {
618 write!(f, "UniDic binary artifact reserved header field is {value}")
619 }
620 Self::TruncatedBinary { field } => {
621 write!(f, "truncated UniDic binary artifact while reading {field}")
622 }
623 Self::InvalidBinaryUtf8 { field, source } => {
624 write!(f, "invalid UTF-8 in UniDic binary artifact {field}: {source}")
625 }
626 Self::BinaryValueTooLarge { field, len } => write!(
627 f,
628 "UniDic binary artifact {field} length {len} exceeds u32::MAX"
629 ),
630 Self::BinaryEntryCountTooLarge { entries } => write!(
631 f,
632 "UniDic binary artifact entry count {entries} exceeds usize::MAX"
633 ),
634 Self::ArtifactLimitExceeded { field, len, max } => write!(
635 f,
636 "UniDic artifact {field} length/count {len} exceeds limit {max}"
637 ),
638 Self::InvalidIndexedMagic { magic } => {
639 write!(f, "invalid UniDic indexed artifact magic {magic:?}")
640 }
641 Self::UnsupportedIndexedVersion { version } => {
642 write!(f, "unsupported UniDic indexed artifact version {version}")
643 }
644 Self::NonZeroIndexedReserved { value } => {
645 write!(f, "UniDic indexed artifact reserved header field is {value}")
646 }
647 Self::TruncatedIndexed { field } => {
648 write!(f, "truncated UniDic indexed artifact while reading {field}")
649 }
650 Self::InvalidIndexedFst { message } => {
651 write!(f, "invalid UniDic indexed artifact FST: {message}")
652 }
653 Self::IndexedSectionTooLarge { field, len } => write!(
654 f,
655 "UniDic indexed artifact {field} length {len} exceeds usize::MAX"
656 ),
657 Self::InvalidIndexedOffset { offset } => {
658 write!(f, "invalid UniDic indexed artifact readings offset {offset}")
659 }
660 Self::InvalidIndexedUtf8 { field, source } => {
661 write!(f, "invalid UTF-8 in UniDic indexed artifact {field}: {source}")
662 }
663 Self::IndexedEntryCountMismatch {
664 header_entries,
665 fst_entries,
666 } => write!(
667 f,
668 "UniDic indexed artifact header entry count {header_entries} does not match FST entry count {fst_entries}"
669 ),
670 Self::UnsupportedSchemaVersion { version } => write!(
671 f,
672 "unsupported UniDic artifact payload schema version {version}"
673 ),
674 Self::UnsupportedPayloadType { payload_type } => {
675 write!(f, "unsupported UniDic artifact payload type {payload_type:?}")
676 }
677 Self::EmptySurface { entry_index } => write!(
678 f,
679 "UniDic artifact payload entry {entry_index} has an empty surface"
680 ),
681 Self::DuplicateSurface { surface } => {
682 write!(f, "UniDic artifact payload has duplicate surface {surface:?}")
683 }
684 Self::EmptyReadings { surface } => write!(
685 f,
686 "UniDic artifact payload surface {surface:?} has no readings"
687 ),
688 Self::EmptyReading {
689 surface,
690 reading_index,
691 } => write!(
692 f,
693 "UniDic artifact payload surface {surface:?} has an empty reading at index {reading_index}"
694 ),
695 Self::DuplicateReading { surface, reading } => write!(
696 f,
697 "UniDic artifact payload surface {surface:?} has duplicate reading {reading:?}"
698 ),
699 }
700 }
701}
702
703impl Error for UnidicArtifactPayloadError {
704 fn source(&self) -> Option<&(dyn Error + 'static)> {
705 match self {
706 Self::Io(err) => Some(err),
707 Self::Yaml(err) => Some(err),
708 Self::InvalidBinaryUtf8 { source, .. } => Some(source),
709 Self::InvalidIndexedUtf8 { source, .. } => Some(source),
710 _ => None,
711 }
712 }
713}
714
715impl From<csv::Error> for UnidicCsvError {
716 fn from(err: csv::Error) -> Self {
717 Self::Csv(err)
718 }
719}
720
721impl From<std::io::Error> for UnidicCsvError {
722 fn from(err: std::io::Error) -> Self {
723 Self::Io(err)
724 }
725}
726
727impl From<std::io::Error> for UnidicArtifactPayloadError {
728 fn from(err: std::io::Error) -> Self {
729 Self::Io(err)
730 }
731}
732
733impl From<serde_yaml::Error> for UnidicArtifactPayloadError {
734 fn from(err: serde_yaml::Error) -> Self {
735 Self::Yaml(err)
736 }
737}
738
739impl UnidicReadingIndex {
740 pub fn from_lex_csv_path(path: impl AsRef<Path>) -> Result<Self, UnidicCsvError> {
742 Self::from_lex_csv_path_with_options(path, UnidicIndexOptions::default())
743 }
744
745 pub fn from_lex_csv_path_with_field(
747 path: impl AsRef<Path>,
748 field: UnidicReadingField,
749 ) -> Result<Self, UnidicCsvError> {
750 Self::from_lex_csv_path_with_options(
751 path,
752 UnidicIndexOptions {
753 reading_field: field,
754 ..UnidicIndexOptions::default()
755 },
756 )
757 }
758
759 pub fn from_lex_csv_path_with_options(
761 path: impl AsRef<Path>,
762 options: UnidicIndexOptions,
763 ) -> Result<Self, UnidicCsvError> {
764 let file = File::open(path)?;
765 Self::from_lex_csv_reader_with_options(file, options)
766 }
767
768 pub fn from_lex_csv_reader(reader: impl Read) -> Result<Self, UnidicCsvError> {
770 Self::from_lex_csv_reader_with_options(reader, UnidicIndexOptions::default())
771 }
772
773 pub fn from_lex_csv_reader_with_field(
775 reader: impl Read,
776 reading_field: UnidicReadingField,
777 ) -> Result<Self, UnidicCsvError> {
778 Self::from_lex_csv_reader_with_options(
779 reader,
780 UnidicIndexOptions {
781 reading_field,
782 ..UnidicIndexOptions::default()
783 },
784 )
785 }
786
787 pub fn from_lex_csv_reader_with_options(
789 reader: impl Read,
790 options: UnidicIndexOptions,
791 ) -> Result<Self, UnidicCsvError> {
792 let mut by_surface = HashMap::<String, BTreeSet<String>>::new();
793 for record in lex_csv_reader(reader).records() {
794 let record = record?;
795 let surface = field(&record, SURFACE_COLUMN)?;
796 let reading = field(&record, options.reading_field.column())?;
797
798 if surface == "*" || reading == "*" {
799 continue;
800 }
801 if options.exclude_ascii_surfaces && surface.is_ascii() {
802 continue;
803 }
804 if options.exclude_symbol_pos && is_symbol_pos(field(&record, POS1_COLUMN)?) {
805 continue;
806 }
807
808 by_surface
809 .entry(surface.to_string())
810 .or_default()
811 .insert(reading.to_string());
812 }
813
814 let readings_by_surface = by_surface
815 .into_iter()
816 .map(|(surface, readings)| {
817 let mut readings = readings.into_iter().collect::<Vec<_>>();
818 if let Some(max_readings) = options.max_readings_per_surface {
819 readings.truncate(max_readings);
820 }
821 (surface, readings)
822 })
823 .filter(|(_, readings)| !readings.is_empty())
824 .collect();
825
826 Ok(Self::from_readings_by_surface(readings_by_surface))
827 }
828
829 pub fn from_artifact_payload_path(
831 path: impl AsRef<Path>,
832 ) -> Result<Self, UnidicArtifactPayloadError> {
833 let path = path.as_ref();
834 check_payload_file_size(path)?;
835 let file = File::open(path)?;
836 Self::from_artifact_payload_reader(file)
837 }
838
839 pub fn from_artifact_payload_reader(
841 reader: impl Read,
842 ) -> Result<Self, UnidicArtifactPayloadError> {
843 let payload = serde_yaml::from_reader(reader)?;
844 Self::from_artifact_payload(payload)
845 }
846
847 pub fn from_artifact_payload(
849 payload: UnidicReadingIndexPayload,
850 ) -> Result<Self, UnidicArtifactPayloadError> {
851 validate_artifact_payload_header(&payload)?;
852 check_limit("entry_count", payload.entries.len(), MAX_ARTIFACT_ENTRIES)?;
853
854 let mut readings_by_surface = HashMap::new();
855 for (entry_index, entry) in payload.entries.into_iter().enumerate() {
856 check_limit(
857 "surface_bytes",
858 entry.surface.len(),
859 MAX_ARTIFACT_STRING_BYTES,
860 )?;
861 check_limit(
862 "reading_count",
863 entry.readings.len(),
864 MAX_ARTIFACT_READINGS_PER_ENTRY,
865 )?;
866 if entry.surface.is_empty() {
867 return Err(UnidicArtifactPayloadError::EmptySurface { entry_index });
868 }
869 if entry.readings.is_empty() {
870 return Err(UnidicArtifactPayloadError::EmptyReadings {
871 surface: entry.surface,
872 });
873 }
874
875 let mut seen_readings = BTreeSet::new();
876 for (reading_index, reading) in entry.readings.iter().enumerate() {
877 check_limit("reading_bytes", reading.len(), MAX_ARTIFACT_STRING_BYTES)?;
878 if reading.is_empty() {
879 return Err(UnidicArtifactPayloadError::EmptyReading {
880 surface: entry.surface,
881 reading_index,
882 });
883 }
884 if !seen_readings.insert(reading) {
885 return Err(UnidicArtifactPayloadError::DuplicateReading {
886 surface: entry.surface,
887 reading: reading.clone(),
888 });
889 }
890 }
891
892 if readings_by_surface
893 .insert(entry.surface.clone(), entry.readings)
894 .is_some()
895 {
896 return Err(UnidicArtifactPayloadError::DuplicateSurface {
897 surface: entry.surface,
898 });
899 }
900 }
901
902 Ok(Self::from_readings_by_surface(readings_by_surface))
903 }
904
905 pub fn from_binary_artifact_payload_path(
907 path: impl AsRef<Path>,
908 ) -> Result<Self, UnidicArtifactPayloadError> {
909 let path = path.as_ref();
910 check_payload_file_size(path)?;
911 let file = File::open(path)?;
912 Self::from_binary_artifact_payload_reader(file)
913 }
914
915 pub fn from_binary_artifact_payload_reader(
917 mut reader: impl Read,
918 ) -> Result<Self, UnidicArtifactPayloadError> {
919 let header = read_binary_artifact_payload_header(&mut reader)?;
920 check_limit("entry_count", header.entries, MAX_ARTIFACT_ENTRIES)?;
921 let mut entries = Vec::with_capacity(header.entries);
922 for _ in 0..header.entries {
923 let surface = read_binary_string(&mut reader, "surface")?;
924 let reading_count = read_u32_le(&mut reader, "reading_count")?;
925 let reading_count = usize::try_from(reading_count).expect("u32 fits usize");
926 check_limit(
927 "reading_count",
928 reading_count,
929 MAX_ARTIFACT_READINGS_PER_ENTRY,
930 )?;
931 let mut readings = Vec::with_capacity(reading_count);
932 for _ in 0..reading_count {
933 readings.push(read_binary_string(&mut reader, "reading")?);
934 }
935 entries.push(UnidicReadingIndexPayloadEntry { surface, readings });
936 }
937
938 Self::from_artifact_payload(UnidicReadingIndexPayload {
939 schema_version: ARTIFACT_PAYLOAD_SCHEMA_VERSION,
940 payload_type: ARTIFACT_PAYLOAD_TYPE.to_string(),
941 entries,
942 })
943 }
944
945 pub fn from_indexed_artifact_payload_path(
947 path: impl AsRef<Path>,
948 ) -> Result<Self, UnidicArtifactPayloadError> {
949 let path = path.as_ref();
950 check_payload_file_size(path)?;
951 let file = File::open(path)?;
952 let mmap = unsafe { Mmap::map(&file)? };
955 Self::from_indexed_mmap(mmap)
956 }
957
958 pub fn from_indexed_artifact_payload_bytes(
969 bytes: &[u8],
970 ) -> Result<Self, UnidicArtifactPayloadError> {
971 if bytes.len() as u64 > MAX_ARTIFACT_PAYLOAD_BYTES {
972 return Err(UnidicArtifactPayloadError::ArtifactLimitExceeded {
973 field: "payload_bytes",
974 len: bytes.len() as u64,
975 max: MAX_ARTIFACT_PAYLOAD_BYTES,
976 });
977 }
978 let header = read_indexed_artifact_payload_header_bytes(bytes)?;
979 let fst_start = INDEXED_ARTIFACT_HEADER_LEN;
980 let fst_end = fst_start.checked_add(header.fst_len).ok_or(
981 UnidicArtifactPayloadError::TruncatedIndexed {
982 field: "fst_section",
983 },
984 )?;
985 let readings_end = fst_end.checked_add(header.readings_len).ok_or(
986 UnidicArtifactPayloadError::TruncatedIndexed {
987 field: "readings_section",
988 },
989 )?;
990 if bytes.len() < readings_end {
991 return Err(UnidicArtifactPayloadError::TruncatedIndexed {
992 field: "indexed_payload",
993 });
994 }
995
996 let map = Map::new(bytes[fst_start..fst_end].to_vec()).map_err(|err| {
997 UnidicArtifactPayloadError::InvalidIndexedFst {
998 message: err.to_string(),
999 }
1000 })?;
1001 let fst_entries = map.len();
1002 if fst_entries != header.entries {
1003 return Err(UnidicArtifactPayloadError::IndexedEntryCountMismatch {
1004 header_entries: header.entries,
1005 fst_entries,
1006 });
1007 }
1008
1009 let mut entries = Vec::with_capacity(header.entries);
1010 let mut stream = map.stream();
1011 while let Some((surface, offset)) = stream.next() {
1012 let surface = std::str::from_utf8(surface)
1013 .map_err(|source| UnidicArtifactPayloadError::InvalidIndexedUtf8 {
1014 field: "surface",
1015 source,
1016 })?
1017 .to_string();
1018 let readings = read_indexed_readings_at_bytes(bytes, fst_end, offset)?;
1019 entries.push(UnidicReadingIndexPayloadEntry { surface, readings });
1020 }
1021
1022 Self::from_artifact_payload(UnidicReadingIndexPayload {
1023 schema_version: ARTIFACT_PAYLOAD_SCHEMA_VERSION,
1024 payload_type: ARTIFACT_PAYLOAD_TYPE.to_string(),
1025 entries,
1026 })
1027 }
1028
1029 fn from_indexed_mmap(mmap: Mmap) -> Result<Self, UnidicArtifactPayloadError> {
1030 if mmap.len() as u64 > MAX_ARTIFACT_PAYLOAD_BYTES {
1031 return Err(UnidicArtifactPayloadError::ArtifactLimitExceeded {
1032 field: "payload_bytes",
1033 len: mmap.len() as u64,
1034 max: MAX_ARTIFACT_PAYLOAD_BYTES,
1035 });
1036 }
1037 let header = read_indexed_artifact_payload_header_bytes(&mmap)?;
1038 let fst_start = INDEXED_ARTIFACT_HEADER_LEN;
1039 let fst_end = fst_start.checked_add(header.fst_len).ok_or(
1040 UnidicArtifactPayloadError::TruncatedIndexed {
1041 field: "fst_section",
1042 },
1043 )?;
1044 let readings_end = fst_end.checked_add(header.readings_len).ok_or(
1045 UnidicArtifactPayloadError::TruncatedIndexed {
1046 field: "readings_section",
1047 },
1048 )?;
1049 if mmap.len() < readings_end {
1050 return Err(UnidicArtifactPayloadError::TruncatedIndexed {
1051 field: "indexed_payload",
1052 });
1053 }
1054
1055 let map = Map::new(mmap[fst_start..fst_end].to_vec()).map_err(|err| {
1056 UnidicArtifactPayloadError::InvalidIndexedFst {
1057 message: err.to_string(),
1058 }
1059 })?;
1060 let fst_entries = map.len();
1061 if fst_entries != header.entries {
1062 return Err(UnidicArtifactPayloadError::IndexedEntryCountMismatch {
1063 header_entries: header.entries,
1064 fst_entries,
1065 });
1066 }
1067
1068 let indexed = IndexedUnidicPayload {
1069 mmap: Arc::new(mmap),
1070 map,
1071 readings_start: fst_end,
1072 entries: header.entries,
1073 };
1074 indexed.validate()?;
1075 Ok(Self {
1076 storage: UnidicReadingStorage::Indexed(indexed),
1077 })
1078 }
1079
1080 pub fn binary_artifact_payload_header_path(
1082 path: impl AsRef<Path>,
1083 ) -> Result<UnidicBinaryArtifactPayloadHeader, UnidicArtifactPayloadError> {
1084 let file = File::open(path)?;
1085 Self::binary_artifact_payload_header_reader(file)
1086 }
1087
1088 pub fn binary_artifact_payload_header_reader(
1090 mut reader: impl Read,
1091 ) -> Result<UnidicBinaryArtifactPayloadHeader, UnidicArtifactPayloadError> {
1092 read_binary_artifact_payload_header(&mut reader)
1093 }
1094
1095 fn from_readings_by_surface(readings_by_surface: HashMap<String, Vec<String>>) -> Self {
1096 Self {
1097 storage: UnidicReadingStorage::Eager(readings_by_surface),
1098 }
1099 }
1100
1101 pub fn readings(&self, surface: &str) -> Option<Cow<'_, [String]>> {
1107 self.try_readings(surface).ok().flatten()
1108 }
1109
1110 pub fn try_readings(
1113 &self,
1114 surface: &str,
1115 ) -> Result<Option<Cow<'_, [String]>>, UnidicArtifactPayloadError> {
1116 match &self.storage {
1117 UnidicReadingStorage::Eager(readings_by_surface) => Ok(readings_by_surface
1118 .get(surface)
1119 .map(|readings| Cow::Borrowed(readings.as_slice()))),
1120 UnidicReadingStorage::Indexed(indexed) => indexed
1121 .readings(surface)
1122 .map(|readings| readings.map(Cow::Owned)),
1123 }
1124 }
1125
1126 pub fn len(&self) -> usize {
1128 match &self.storage {
1129 UnidicReadingStorage::Eager(readings_by_surface) => readings_by_surface.len(),
1130 UnidicReadingStorage::Indexed(indexed) => indexed.entries,
1131 }
1132 }
1133
1134 pub fn is_empty(&self) -> bool {
1136 self.len() == 0
1137 }
1138
1139 pub fn artifact_metadata(
1145 &self,
1146 options: UnidicArtifactMetadataOptions,
1147 ) -> UnidicArtifactMetadata {
1148 UnidicArtifactMetadata {
1149 schema_version: 1,
1150 artifact_type: "moine.unidic.reading-index".to_string(),
1151 artifact_name: options.artifact_name,
1152 generator: options.generator,
1153 payload: UnidicArtifactPayload {
1154 path: options.payload_file_name,
1155 format: options.payload_format,
1156 file_digest_algorithm: None,
1157 file_digest: None,
1158 checksum_algorithm: ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM.to_string(),
1159 checksum: self.artifact_payload_checksum(),
1160 },
1161 source: UnidicArtifactSource {
1162 name: options.source_name,
1163 version: options.source_version,
1164 lex_csv: options.source_lex_csv,
1165 },
1166 build: UnidicArtifactBuild {
1167 reading_field: options.index_options.reading_field.as_str().to_string(),
1168 max_readings_per_surface: options.index_options.max_readings_per_surface,
1169 exclude_ascii_surfaces: options.index_options.exclude_ascii_surfaces,
1170 exclude_symbol_pos: options.index_options.exclude_symbol_pos,
1171 entries: self.len(),
1172 },
1173 query_defaults: UnidicArtifactQueryDefaults {
1174 max_span_chars: options.query_defaults.max_span_chars,
1175 max_paths: options.query_defaults.max_paths,
1176 longest_match_only: options.query_defaults.longest_match_only,
1177 max_readings_per_segment: options.query_defaults.max_readings_per_segment,
1178 },
1179 license: options.license,
1180 }
1181 }
1182
1183 pub fn artifact_payload(&self) -> UnidicReadingIndexPayload {
1188 let entries = match &self.storage {
1189 UnidicReadingStorage::Eager(readings_by_surface) => {
1190 let mut entries = readings_by_surface
1191 .iter()
1192 .map(|(surface, readings)| UnidicReadingIndexPayloadEntry {
1193 surface: surface.clone(),
1194 readings: readings.clone(),
1195 })
1196 .collect::<Vec<_>>();
1197 entries.sort_by(|left, right| left.surface.cmp(&right.surface));
1198 entries
1199 }
1200 UnidicReadingStorage::Indexed(indexed) => indexed
1201 .entries()
1202 .expect("validated indexed artifact should decode"),
1203 };
1204
1205 UnidicReadingIndexPayload {
1206 schema_version: ARTIFACT_PAYLOAD_SCHEMA_VERSION,
1207 payload_type: ARTIFACT_PAYLOAD_TYPE.to_string(),
1208 entries,
1209 }
1210 }
1211
1212 pub fn artifact_payload_checksum(&self) -> String {
1214 self.artifact_payload_checksum_for_algorithm(ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM)
1215 .expect("default artifact checksum algorithm should be supported")
1216 }
1217
1218 pub fn artifact_payload_checksum_for_algorithm(&self, algorithm: &str) -> Option<String> {
1224 let payload = self.artifact_payload();
1225 let bytes = canonical_payload_bytes(&payload);
1226 match algorithm {
1227 ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM => Some(sha256_hex(&bytes)),
1228 LEGACY_ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM => Some(format!("{:016x}", fnv1a64(&bytes))),
1229 _ => None,
1230 }
1231 }
1232
1233 pub fn write_artifact_binary_payload(
1238 &self,
1239 mut writer: impl Write,
1240 ) -> Result<(), UnidicArtifactPayloadError> {
1241 let payload = self.artifact_payload();
1242 writer.write_all(BINARY_ARTIFACT_MAGIC)?;
1243 writer.write_all(&BINARY_ARTIFACT_VERSION.to_le_bytes())?;
1244 writer.write_all(&0_u32.to_le_bytes())?;
1245 writer.write_all(&(payload.entries.len() as u64).to_le_bytes())?;
1246
1247 for entry in &payload.entries {
1248 write_binary_string(&mut writer, "surface", &entry.surface)?;
1249 write_u32_len(&mut writer, "reading_count", entry.readings.len())?;
1250 for reading in &entry.readings {
1251 write_binary_string(&mut writer, "reading", reading)?;
1252 }
1253 }
1254
1255 Ok(())
1256 }
1257
1258 pub fn write_indexed_artifact_payload(
1264 &self,
1265 mut writer: impl Write,
1266 ) -> Result<(), UnidicArtifactPayloadError> {
1267 let payload = self.artifact_payload();
1268 let mut fst_bytes = Vec::new();
1269 let mut readings_bytes = Vec::new();
1270 {
1271 let mut builder = MapBuilder::new(&mut fst_bytes).map_err(|err| {
1272 UnidicArtifactPayloadError::InvalidIndexedFst {
1273 message: err.to_string(),
1274 }
1275 })?;
1276 for entry in &payload.entries {
1277 let offset = readings_bytes.len() as u64;
1278 builder.insert(&entry.surface, offset).map_err(|err| {
1279 UnidicArtifactPayloadError::InvalidIndexedFst {
1280 message: err.to_string(),
1281 }
1282 })?;
1283 write_indexed_reading_block(&mut readings_bytes, &entry.readings)?;
1284 }
1285 builder
1286 .finish()
1287 .map_err(|err| UnidicArtifactPayloadError::InvalidIndexedFst {
1288 message: err.to_string(),
1289 })?;
1290 }
1291
1292 writer.write_all(INDEXED_ARTIFACT_MAGIC)?;
1293 writer.write_all(&INDEXED_ARTIFACT_VERSION.to_le_bytes())?;
1294 writer.write_all(&0_u32.to_le_bytes())?;
1295 writer.write_all(&(payload.entries.len() as u64).to_le_bytes())?;
1296 writer.write_all(&(fst_bytes.len() as u64).to_le_bytes())?;
1297 writer.write_all(&(readings_bytes.len() as u64).to_le_bytes())?;
1298 writer.write_all(&fst_bytes)?;
1299 writer.write_all(&readings_bytes)?;
1300 Ok(())
1301 }
1302
1303 pub fn reading_sequences(&self, text: &str, options: DictionaryReadingOptions) -> Vec<String> {
1309 self.reading_sequences_with_stats_inner(text, options, false)
1310 .unwrap_or_default()
1311 .paths
1312 }
1313
1314 pub fn reading_paths(
1320 &self,
1321 text: &str,
1322 options: DictionaryReadingOptions,
1323 ) -> Vec<DictionaryReadingPath> {
1324 self.reading_paths_with_stats(text, options).paths
1325 }
1326
1327 pub fn reading_paths_with_stats(
1333 &self,
1334 text: &str,
1335 options: DictionaryReadingOptions,
1336 ) -> DictionaryReadingExpansion {
1337 self.try_reading_paths_with_stats(text, options)
1338 .unwrap_or_default()
1339 }
1340
1341 pub fn try_reading_paths_with_stats(
1344 &self,
1345 text: &str,
1346 options: DictionaryReadingOptions,
1347 ) -> Result<DictionaryReadingExpansion, UnidicArtifactPayloadError> {
1348 self.reading_paths_with_stats_inner(text, options, false)
1349 }
1350
1351 pub fn hybrid_reading_paths(
1357 &self,
1358 text: &str,
1359 options: DictionaryReadingOptions,
1360 ) -> Vec<DictionaryReadingPath> {
1361 self.hybrid_reading_paths_with_stats(text, options).paths
1362 }
1363
1364 pub fn hybrid_reading_paths_with_stats(
1370 &self,
1371 text: &str,
1372 options: DictionaryReadingOptions,
1373 ) -> DictionaryReadingExpansion {
1374 self.try_hybrid_reading_paths_with_stats(text, options)
1375 .unwrap_or_default()
1376 }
1377
1378 pub fn try_hybrid_reading_paths_with_stats(
1381 &self,
1382 text: &str,
1383 options: DictionaryReadingOptions,
1384 ) -> Result<DictionaryReadingExpansion, UnidicArtifactPayloadError> {
1385 self.reading_paths_with_stats_inner(text, options, true)
1386 }
1387
1388 fn reading_paths_with_stats_inner(
1389 &self,
1390 text: &str,
1391 options: DictionaryReadingOptions,
1392 allow_direct_fallback: bool,
1393 ) -> Result<DictionaryReadingExpansion, UnidicArtifactPayloadError> {
1394 if text.is_empty() || options.max_span_chars == 0 || options.max_paths == 0 {
1395 return Ok(DictionaryReadingExpansion::default());
1396 }
1397
1398 let mut stats = DictionaryReadingStats::default();
1399 let boundaries = char_boundaries(text);
1400 let char_len = boundaries.len() - 1;
1401 let mut suffix_paths = vec![Vec::<DictionaryReadingPath>::new(); char_len + 1];
1402 suffix_paths[char_len].push(DictionaryReadingPath {
1403 segments: Vec::new(),
1404 joined_reading: String::new(),
1405 });
1406
1407 for start in (0..char_len).rev() {
1408 let mut paths_by_reading = std::collections::BTreeMap::new();
1409 let end_limit = char_len.min(start + options.max_span_chars);
1410 let mut matching_ends = Vec::new();
1411
1412 for end in start + 1..=end_limit {
1413 let surface = &text[boundaries[start]..boundaries[end]];
1414 if self.try_readings(surface)?.is_some() && !suffix_paths[end].is_empty() {
1415 matching_ends.push(end);
1416 }
1417 }
1418 stats.matched_spans += matching_ends.len();
1419
1420 if options.longest_match_only && !allow_direct_fallback {
1421 if let Some(end) = matching_ends.last().copied() {
1422 stats.longest_match_pruned_spans += matching_ends.len().saturating_sub(1);
1423 matching_ends.clear();
1424 matching_ends.push(end);
1425 }
1426 }
1427
1428 for end in matching_ends {
1429 let surface = &text[boundaries[start]..boundaries[end]];
1430 let Some(surface_readings) = self.try_readings(surface)? else {
1431 continue;
1432 };
1433
1434 stats.raw_segment_readings += surface_readings.len();
1435 let raw_surface_reading_count = surface_readings.len();
1436 let surface_readings = limited_surface_readings(surface_readings.as_ref(), options);
1437 stats.used_segment_readings += surface_readings.len();
1438 stats.pruned_segment_readings += raw_surface_reading_count - surface_readings.len();
1439 for surface_reading in surface_readings {
1440 for suffix in &suffix_paths[end] {
1441 stats.candidate_combinations += 1;
1442 let mut reading = String::with_capacity(
1443 surface_reading.len() + suffix.joined_reading.len(),
1444 );
1445 reading.push_str(surface_reading);
1446 reading.push_str(&suffix.joined_reading);
1447
1448 let mut segments = Vec::with_capacity(suffix.segments.len() + 1);
1449 segments.push(DictionaryReadingSegment {
1450 surface: surface.to_string(),
1451 reading: surface_reading.to_string(),
1452 });
1453 segments.extend(suffix.segments.iter().cloned());
1454
1455 match paths_by_reading.entry(reading.clone()) {
1456 Entry::Vacant(entry) => {
1457 entry.insert(DictionaryReadingPath {
1458 segments,
1459 joined_reading: reading,
1460 });
1461 stats.unique_paths += 1;
1462 }
1463 Entry::Occupied(_) => {
1464 stats.duplicate_joined_readings += 1;
1465 }
1466 }
1467
1468 if paths_by_reading.len() >= options.max_paths {
1469 stats.max_paths_hit_count += 1;
1470 break;
1471 }
1472 }
1473
1474 if paths_by_reading.len() >= options.max_paths {
1475 break;
1476 }
1477 }
1478
1479 if paths_by_reading.len() >= options.max_paths {
1480 break;
1481 }
1482 }
1483
1484 if allow_direct_fallback && paths_by_reading.len() < options.max_paths {
1485 if let Some(end) = direct_fallback_end(text, &boundaries, start, char_len) {
1486 if !suffix_paths[end].is_empty() {
1487 stats.direct_fallback_spans += 1;
1488 let surface = &text[boundaries[start]..boundaries[end]];
1489 for suffix in &suffix_paths[end] {
1490 stats.candidate_combinations += 1;
1491 let mut reading =
1492 String::with_capacity(surface.len() + suffix.joined_reading.len());
1493 reading.push_str(surface);
1494 reading.push_str(&suffix.joined_reading);
1495
1496 let mut segments = Vec::with_capacity(suffix.segments.len() + 1);
1497 segments.push(DictionaryReadingSegment {
1498 surface: surface.to_string(),
1499 reading: surface.to_string(),
1500 });
1501 segments.extend(suffix.segments.iter().cloned());
1502
1503 match paths_by_reading.entry(reading.clone()) {
1504 Entry::Vacant(entry) => {
1505 entry.insert(DictionaryReadingPath {
1506 segments,
1507 joined_reading: reading,
1508 });
1509 stats.unique_paths += 1;
1510 }
1511 Entry::Occupied(_) => {
1512 stats.duplicate_joined_readings += 1;
1513 }
1514 }
1515
1516 if paths_by_reading.len() >= options.max_paths {
1517 stats.max_paths_hit_count += 1;
1518 break;
1519 }
1520 }
1521 }
1522 }
1523 }
1524
1525 suffix_paths[start] = paths_by_reading.into_values().collect();
1526 }
1527
1528 Ok(DictionaryReadingExpansion {
1529 paths: suffix_paths.remove(0),
1530 stats,
1531 })
1532 }
1533
1534 fn reading_sequences_with_stats_inner(
1535 &self,
1536 text: &str,
1537 options: DictionaryReadingOptions,
1538 allow_direct_fallback: bool,
1539 ) -> Result<DictionaryReadingSequenceExpansion, UnidicArtifactPayloadError> {
1540 if text.is_empty() || options.max_span_chars == 0 || options.max_paths == 0 {
1541 return Ok(DictionaryReadingSequenceExpansion::default());
1542 }
1543
1544 let mut stats = DictionaryReadingStats::default();
1545 let boundaries = char_boundaries(text);
1546 let char_len = boundaries.len() - 1;
1547 let mut suffix_paths = vec![Vec::<String>::new(); char_len + 1];
1548 suffix_paths[char_len].push(String::new());
1549
1550 for start in (0..char_len).rev() {
1551 let mut paths_by_reading = BTreeSet::new();
1552 let end_limit = char_len.min(start + options.max_span_chars);
1553 let mut matching_ends = Vec::new();
1554
1555 for end in start + 1..=end_limit {
1556 let surface = &text[boundaries[start]..boundaries[end]];
1557 if self.try_readings(surface)?.is_some() && !suffix_paths[end].is_empty() {
1558 matching_ends.push(end);
1559 }
1560 }
1561 stats.matched_spans += matching_ends.len();
1562
1563 if options.longest_match_only && !allow_direct_fallback {
1564 if let Some(end) = matching_ends.last().copied() {
1565 stats.longest_match_pruned_spans += matching_ends.len().saturating_sub(1);
1566 matching_ends.clear();
1567 matching_ends.push(end);
1568 }
1569 }
1570
1571 for end in matching_ends {
1572 let surface = &text[boundaries[start]..boundaries[end]];
1573 let Some(surface_readings) = self.try_readings(surface)? else {
1574 continue;
1575 };
1576
1577 stats.raw_segment_readings += surface_readings.len();
1578 let raw_surface_reading_count = surface_readings.len();
1579 let surface_readings = limited_surface_readings(surface_readings.as_ref(), options);
1580 stats.used_segment_readings += surface_readings.len();
1581 stats.pruned_segment_readings += raw_surface_reading_count - surface_readings.len();
1582 for surface_reading in surface_readings {
1583 for suffix in &suffix_paths[end] {
1584 stats.candidate_combinations += 1;
1585 let mut reading =
1586 String::with_capacity(surface_reading.len() + suffix.len());
1587 reading.push_str(surface_reading);
1588 reading.push_str(suffix);
1589
1590 if paths_by_reading.insert(reading) {
1591 stats.unique_paths += 1;
1592 } else {
1593 stats.duplicate_joined_readings += 1;
1594 }
1595
1596 if paths_by_reading.len() >= options.max_paths {
1597 stats.max_paths_hit_count += 1;
1598 break;
1599 }
1600 }
1601
1602 if paths_by_reading.len() >= options.max_paths {
1603 break;
1604 }
1605 }
1606
1607 if paths_by_reading.len() >= options.max_paths {
1608 break;
1609 }
1610 }
1611
1612 if allow_direct_fallback && paths_by_reading.len() < options.max_paths {
1613 if let Some(end) = direct_fallback_end(text, &boundaries, start, char_len) {
1614 if !suffix_paths[end].is_empty() {
1615 stats.direct_fallback_spans += 1;
1616 let surface = &text[boundaries[start]..boundaries[end]];
1617 for suffix in &suffix_paths[end] {
1618 stats.candidate_combinations += 1;
1619 let mut reading = String::with_capacity(surface.len() + suffix.len());
1620 reading.push_str(surface);
1621 reading.push_str(suffix);
1622
1623 if paths_by_reading.insert(reading) {
1624 stats.unique_paths += 1;
1625 } else {
1626 stats.duplicate_joined_readings += 1;
1627 }
1628
1629 if paths_by_reading.len() >= options.max_paths {
1630 stats.max_paths_hit_count += 1;
1631 break;
1632 }
1633 }
1634 }
1635 }
1636 }
1637
1638 suffix_paths[start] = paths_by_reading.into_iter().collect();
1639 }
1640
1641 Ok(DictionaryReadingSequenceExpansion {
1642 paths: suffix_paths.remove(0),
1643 stats,
1644 })
1645 }
1646
1647 pub fn romaji_lattice(
1653 &self,
1654 text: &str,
1655 options: DictionaryReadingOptions,
1656 ) -> Result<Option<Lattice>, JaLatticeError> {
1657 let readings = self
1658 .reading_sequences_with_stats_inner(text, options, false)
1659 .map_err(|err| JaLatticeError::ArtifactPayload(err.to_string()))?;
1660 if readings.paths.is_empty() {
1661 return Ok(None);
1662 }
1663
1664 crate::romaji::romaji_lattice_from_readings(readings.paths).map(Some)
1665 }
1666
1667 pub fn hybrid_romaji_lattice(
1672 &self,
1673 text: &str,
1674 options: DictionaryReadingOptions,
1675 ) -> Result<Option<Lattice>, JaLatticeError> {
1676 let readings = self
1677 .reading_sequences_with_stats_inner(text, options, true)
1678 .map_err(|err| JaLatticeError::ArtifactPayload(err.to_string()))?;
1679 if readings.paths.is_empty() {
1680 return Ok(None);
1681 }
1682
1683 crate::romaji::romaji_lattice_from_readings(readings.paths).map(Some)
1684 }
1685}
1686
1687#[derive(Clone, Debug, Default, Eq, PartialEq)]
1688struct DictionaryReadingSequenceExpansion {
1689 paths: Vec<String>,
1690 stats: DictionaryReadingStats,
1691}
1692
/// Returns the byte offset at which every character of `text` starts, followed by
/// `text.len()` as the final boundary.
fn char_boundaries(text: &str) -> Vec<usize> {
    // Offsets 0 and `text.len()` are always character boundaries.
    (0..=text.len())
        .filter(|&offset| text.is_char_boundary(offset))
        .collect()
}
1699
1700fn lex_csv_reader(reader: impl Read) -> csv::Reader<impl Read> {
1701 csv::ReaderBuilder::new()
1702 .has_headers(false)
1703 .flexible(true)
1704 .from_reader(reader)
1705}
1706
1707fn field(record: &csv::StringRecord, column: usize) -> Result<&str, UnidicCsvError> {
1708 record
1709 .get(column)
1710 .ok_or_else(|| UnidicCsvError::MissingColumn {
1711 record_index: record
1712 .position()
1713 .map(|position| position.record())
1714 .unwrap_or(0),
1715 column,
1716 len: record.len(),
1717 })
1718}
1719
/// Reports whether the part-of-speech label `pos1` contains the symbol marker.
fn is_symbol_pos(pos1: &str) -> bool {
    const SYMBOL_MARKER: &str = "記号";
    pos1.contains(SYMBOL_MARKER)
}
1723
1724fn limited_surface_readings(readings: &[String], options: DictionaryReadingOptions) -> &[String] {
1725 if let Some(max_readings) = options.max_readings_per_segment {
1726 &readings[..readings.len().min(max_readings)]
1727 } else {
1728 readings
1729 }
1730}
1731
1732fn direct_fallback_end(
1733 text: &str,
1734 boundaries: &[usize],
1735 start: usize,
1736 char_len: usize,
1737) -> Option<usize> {
1738 let mut end = start;
1739 while end < char_len {
1740 let surface = &text[boundaries[start]..boundaries[end + 1]];
1741 if !can_build_romaji_paths(surface) {
1742 break;
1743 }
1744 end += 1;
1745 }
1746
1747 (end > start).then_some(end)
1748}
1749
1750fn write_binary_string(
1751 writer: &mut impl Write,
1752 field: &'static str,
1753 value: &str,
1754) -> Result<(), UnidicArtifactPayloadError> {
1755 write_u32_len(writer, field, value.len())?;
1756 writer.write_all(value.as_bytes())?;
1757 Ok(())
1758}
1759
1760fn write_u32_len(
1761 writer: &mut impl Write,
1762 field: &'static str,
1763 len: usize,
1764) -> Result<(), UnidicArtifactPayloadError> {
1765 let len = u32::try_from(len)
1766 .map_err(|_| UnidicArtifactPayloadError::BinaryValueTooLarge { field, len })?;
1767 writer.write_all(&len.to_le_bytes())?;
1768 Ok(())
1769}
1770
1771fn read_binary_string(
1772 reader: &mut impl Read,
1773 field: &'static str,
1774) -> Result<String, UnidicArtifactPayloadError> {
1775 let len = read_u32_le(reader, field)? as usize;
1776 check_limit(field, len, MAX_ARTIFACT_STRING_BYTES)?;
1777 let mut bytes = vec![0_u8; len];
1778 read_exact_binary(reader, &mut bytes, field)?;
1779 String::from_utf8(bytes)
1780 .map_err(|source| UnidicArtifactPayloadError::InvalidBinaryUtf8 { field, source })
1781}
1782
1783fn read_u32_le(
1784 reader: &mut impl Read,
1785 field: &'static str,
1786) -> Result<u32, UnidicArtifactPayloadError> {
1787 let mut bytes = [0_u8; 4];
1788 read_exact_binary(reader, &mut bytes, field)?;
1789 Ok(u32::from_le_bytes(bytes))
1790}
1791
1792fn read_u64_le(
1793 reader: &mut impl Read,
1794 field: &'static str,
1795) -> Result<u64, UnidicArtifactPayloadError> {
1796 let mut bytes = [0_u8; 8];
1797 read_exact_binary(reader, &mut bytes, field)?;
1798 Ok(u64::from_le_bytes(bytes))
1799}
1800
1801fn read_exact_binary(
1802 reader: &mut impl Read,
1803 bytes: &mut [u8],
1804 field: &'static str,
1805) -> Result<(), UnidicArtifactPayloadError> {
1806 match reader.read_exact(bytes) {
1807 Ok(()) => Ok(()),
1808 Err(err) if err.kind() == std::io::ErrorKind::UnexpectedEof => {
1809 Err(UnidicArtifactPayloadError::TruncatedBinary { field })
1810 }
1811 Err(err) => Err(UnidicArtifactPayloadError::Io(err)),
1812 }
1813}
1814
1815fn read_binary_artifact_payload_header(
1816 reader: &mut impl Read,
1817) -> Result<UnidicBinaryArtifactPayloadHeader, UnidicArtifactPayloadError> {
1818 let mut magic = [0_u8; 8];
1819 read_exact_binary(reader, &mut magic, "magic")?;
1820 if &magic != BINARY_ARTIFACT_MAGIC {
1821 return Err(UnidicArtifactPayloadError::InvalidBinaryMagic { magic });
1822 }
1823
1824 let version = read_u32_le(reader, "version")?;
1825 if version != BINARY_ARTIFACT_VERSION {
1826 return Err(UnidicArtifactPayloadError::UnsupportedBinaryVersion { version });
1827 }
1828
1829 let reserved = read_u32_le(reader, "reserved")?;
1830 if reserved != 0 {
1831 return Err(UnidicArtifactPayloadError::NonZeroBinaryReserved { value: reserved });
1832 }
1833
1834 let entry_count = read_u64_le(reader, "entry_count")?;
1835 let entries = usize::try_from(entry_count).map_err(|_| {
1836 UnidicArtifactPayloadError::BinaryEntryCountTooLarge {
1837 entries: entry_count,
1838 }
1839 })?;
1840 check_limit("entry_count", entries, MAX_ARTIFACT_ENTRIES)?;
1841
1842 Ok(UnidicBinaryArtifactPayloadHeader { version, entries })
1843}
1844
1845fn read_indexed_artifact_payload_header_bytes(
1846 bytes: &[u8],
1847) -> Result<UnidicIndexedArtifactPayloadHeader, UnidicArtifactPayloadError> {
1848 if bytes.len() < INDEXED_ARTIFACT_HEADER_LEN {
1849 return Err(UnidicArtifactPayloadError::TruncatedIndexed { field: "header" });
1850 }
1851 let mut magic = [0_u8; 8];
1852 magic.copy_from_slice(&bytes[..8]);
1853 if &magic != INDEXED_ARTIFACT_MAGIC {
1854 return Err(UnidicArtifactPayloadError::InvalidIndexedMagic { magic });
1855 }
1856
1857 let version = read_u32_le_bytes(bytes, 8, "version")?;
1858 if version != INDEXED_ARTIFACT_VERSION {
1859 return Err(UnidicArtifactPayloadError::UnsupportedIndexedVersion { version });
1860 }
1861 let reserved = read_u32_le_bytes(bytes, 12, "reserved")?;
1862 if reserved != 0 {
1863 return Err(UnidicArtifactPayloadError::NonZeroIndexedReserved { value: reserved });
1864 }
1865 let entry_count = read_u64_le_bytes(bytes, 16, "entry_count")?;
1866 let fst_len = read_u64_le_bytes(bytes, 24, "fst_len")?;
1867 let readings_len = read_u64_le_bytes(bytes, 32, "readings_len")?;
1868 let entries = checked_indexed_usize("entry_count", entry_count)?;
1869 check_limit("entry_count", entries, MAX_ARTIFACT_ENTRIES)?;
1870 Ok(UnidicIndexedArtifactPayloadHeader {
1871 version,
1872 entries,
1873 fst_len: checked_indexed_usize("fst_len", fst_len)?,
1874 readings_len: checked_indexed_usize("readings_len", readings_len)?,
1875 })
1876}
1877
1878fn read_u32_le_bytes(
1879 bytes: &[u8],
1880 offset: usize,
1881 field: &'static str,
1882) -> Result<u32, UnidicArtifactPayloadError> {
1883 let end = offset
1884 .checked_add(4)
1885 .ok_or(UnidicArtifactPayloadError::TruncatedIndexed { field })?;
1886 let chunk = bytes
1887 .get(offset..end)
1888 .ok_or(UnidicArtifactPayloadError::TruncatedIndexed { field })?;
1889 Ok(u32::from_le_bytes(
1890 chunk.try_into().expect("slice length is 4"),
1891 ))
1892}
1893
1894fn read_u64_le_bytes(
1895 bytes: &[u8],
1896 offset: usize,
1897 field: &'static str,
1898) -> Result<u64, UnidicArtifactPayloadError> {
1899 let end = offset
1900 .checked_add(8)
1901 .ok_or(UnidicArtifactPayloadError::TruncatedIndexed { field })?;
1902 let chunk = bytes
1903 .get(offset..end)
1904 .ok_or(UnidicArtifactPayloadError::TruncatedIndexed { field })?;
1905 Ok(u64::from_le_bytes(
1906 chunk.try_into().expect("slice length is 8"),
1907 ))
1908}
1909
1910fn checked_indexed_usize(
1911 field: &'static str,
1912 len: u64,
1913) -> Result<usize, UnidicArtifactPayloadError> {
1914 usize::try_from(len)
1915 .map_err(|_| UnidicArtifactPayloadError::IndexedSectionTooLarge { field, len })
1916}
1917
1918fn check_payload_file_size(path: &Path) -> Result<(), UnidicArtifactPayloadError> {
1919 let len = std::fs::metadata(path)?.len();
1920 if len > MAX_ARTIFACT_PAYLOAD_BYTES {
1921 return Err(UnidicArtifactPayloadError::ArtifactLimitExceeded {
1922 field: "payload_bytes",
1923 len,
1924 max: MAX_ARTIFACT_PAYLOAD_BYTES,
1925 });
1926 }
1927 Ok(())
1928}
1929
1930fn check_limit(
1931 field: &'static str,
1932 len: usize,
1933 max: usize,
1934) -> Result<(), UnidicArtifactPayloadError> {
1935 if len > max {
1936 return Err(UnidicArtifactPayloadError::ArtifactLimitExceeded {
1937 field,
1938 len: len as u64,
1939 max: max as u64,
1940 });
1941 }
1942 Ok(())
1943}
1944
1945fn write_indexed_reading_block(
1946 writer: &mut Vec<u8>,
1947 readings: &[String],
1948) -> Result<(), UnidicArtifactPayloadError> {
1949 write_u32_len(writer, "reading_count", readings.len())?;
1950 for reading in readings {
1951 write_binary_string(writer, "reading", reading)?;
1952 }
1953 Ok(())
1954}
1955
1956impl IndexedUnidicPayload {
1957 fn validate(&self) -> Result<(), UnidicArtifactPayloadError> {
1958 let mut stream = self.map.stream();
1959 while let Some((surface, offset)) = stream.next() {
1960 let surface = std::str::from_utf8(surface).map_err(|source| {
1961 UnidicArtifactPayloadError::InvalidIndexedUtf8 {
1962 field: "surface",
1963 source,
1964 }
1965 })?;
1966 if surface.is_empty() {
1967 return Err(UnidicArtifactPayloadError::EmptySurface { entry_index: 0 });
1968 }
1969 let readings = self.readings_at(offset)?;
1970 if readings.is_empty() {
1971 return Err(UnidicArtifactPayloadError::EmptyReadings {
1972 surface: surface.to_string(),
1973 });
1974 }
1975 let mut seen = BTreeSet::new();
1976 for (reading_index, reading) in readings.iter().enumerate() {
1977 if reading.is_empty() {
1978 return Err(UnidicArtifactPayloadError::EmptyReading {
1979 surface: surface.to_string(),
1980 reading_index,
1981 });
1982 }
1983 if !seen.insert(reading) {
1984 return Err(UnidicArtifactPayloadError::DuplicateReading {
1985 surface: surface.to_string(),
1986 reading: reading.clone(),
1987 });
1988 }
1989 }
1990 }
1991 Ok(())
1992 }
1993
1994 fn readings(&self, surface: &str) -> Result<Option<Vec<String>>, UnidicArtifactPayloadError> {
1995 self.map
1996 .get(surface)
1997 .map(|offset| self.readings_at(offset))
1998 .transpose()
1999 }
2000
2001 fn entries(&self) -> Result<Vec<UnidicReadingIndexPayloadEntry>, UnidicArtifactPayloadError> {
2002 let mut entries = Vec::with_capacity(self.entries);
2003 let mut stream = self.map.stream();
2004 while let Some((surface, offset)) = stream.next() {
2005 let surface = std::str::from_utf8(surface)
2006 .map_err(|source| UnidicArtifactPayloadError::InvalidIndexedUtf8 {
2007 field: "surface",
2008 source,
2009 })?
2010 .to_string();
2011 let readings = self.readings_at(offset)?;
2012 entries.push(UnidicReadingIndexPayloadEntry { surface, readings });
2013 }
2014 Ok(entries)
2015 }
2016
2017 fn readings_at(&self, offset: u64) -> Result<Vec<String>, UnidicArtifactPayloadError> {
2018 read_indexed_readings_at_bytes(&self.mmap, self.readings_start, offset)
2019 }
2020}
2021
2022fn read_indexed_readings_at_bytes(
2023 bytes: &[u8],
2024 readings_start: usize,
2025 offset: u64,
2026) -> Result<Vec<String>, UnidicArtifactPayloadError> {
2027 let offset = usize::try_from(offset)
2028 .map_err(|_| UnidicArtifactPayloadError::InvalidIndexedOffset { offset })?;
2029 let start = readings_start.checked_add(offset).ok_or(
2030 UnidicArtifactPayloadError::InvalidIndexedOffset {
2031 offset: offset as u64,
2032 },
2033 )?;
2034 if start >= bytes.len() {
2035 return Err(UnidicArtifactPayloadError::InvalidIndexedOffset {
2036 offset: offset as u64,
2037 });
2038 }
2039 let mut cursor = start;
2040 let reading_count = read_u32_le_bytes(bytes, cursor, "reading_count")? as usize;
2041 check_limit(
2042 "reading_count",
2043 reading_count,
2044 MAX_ARTIFACT_READINGS_PER_ENTRY,
2045 )?;
2046 cursor += 4;
2047 let mut readings = Vec::with_capacity(reading_count);
2048 for _ in 0..reading_count {
2049 let len = read_u32_le_bytes(bytes, cursor, "reading_len")? as usize;
2050 check_limit("reading_bytes", len, MAX_ARTIFACT_STRING_BYTES)?;
2051 cursor += 4;
2052 let end = cursor
2053 .checked_add(len)
2054 .ok_or(UnidicArtifactPayloadError::TruncatedIndexed { field: "reading" })?;
2055 let reading_bytes = bytes
2056 .get(cursor..end)
2057 .ok_or(UnidicArtifactPayloadError::TruncatedIndexed { field: "reading" })?;
2058 let reading = std::str::from_utf8(reading_bytes)
2059 .map_err(|source| UnidicArtifactPayloadError::InvalidIndexedUtf8 {
2060 field: "reading",
2061 source,
2062 })?
2063 .to_string();
2064 readings.push(reading);
2065 cursor = end;
2066 }
2067 Ok(readings)
2068}
2069
2070pub fn artifact_file_digest_path(path: impl AsRef<Path>) -> Result<String, std::io::Error> {
2072 let file = File::open(path)?;
2073 artifact_file_digest_reader(file)
2074}
2075
2076pub fn artifact_file_digest_reader(mut reader: impl Read) -> Result<String, std::io::Error> {
2078 let mut hasher = Sha256::new();
2079 let mut buffer = [0_u8; 64 * 1024];
2080 loop {
2081 let read = reader.read(&mut buffer)?;
2082 if read == 0 {
2083 break;
2084 }
2085 hasher.update(&buffer[..read]);
2086 }
2087 Ok(sha256_digest_hex(hasher.finalize()))
2088}
2089
2090fn validate_artifact_payload_header(
2091 payload: &UnidicReadingIndexPayload,
2092) -> Result<(), UnidicArtifactPayloadError> {
2093 if payload.schema_version != ARTIFACT_PAYLOAD_SCHEMA_VERSION {
2094 return Err(UnidicArtifactPayloadError::UnsupportedSchemaVersion {
2095 version: payload.schema_version,
2096 });
2097 }
2098 if payload.payload_type != ARTIFACT_PAYLOAD_TYPE {
2099 return Err(UnidicArtifactPayloadError::UnsupportedPayloadType {
2100 payload_type: payload.payload_type.clone(),
2101 });
2102 }
2103 Ok(())
2104}
2105
2106fn canonical_payload_bytes(payload: &UnidicReadingIndexPayload) -> Vec<u8> {
2107 let mut bytes = Vec::new();
2108 bytes.extend_from_slice(b"moine.unidic.reading-index.surface-readings/v1\n");
2109 for entry in &payload.entries {
2110 push_len_prefixed(&mut bytes, b"S", &entry.surface);
2111 bytes.extend_from_slice(format!("R{}\n", entry.readings.len()).as_bytes());
2112 for reading in &entry.readings {
2113 push_len_prefixed(&mut bytes, b"r", reading);
2114 }
2115 }
2116 bytes
2117}
2118
/// Appends `tag`, the decimal byte length of `value`, a newline, the bytes of `value`
/// and a closing newline to `bytes`.
fn push_len_prefixed(bytes: &mut Vec<u8>, tag: &[u8], value: &str) {
    let decimal_len = value.len().to_string();
    let parts: [&[u8]; 5] = [
        tag,
        decimal_len.as_bytes(),
        b"\n",
        value.as_bytes(),
        b"\n",
    ];
    for part in parts {
        bytes.extend_from_slice(part);
    }
}
2126
/// Computes the 64-bit FNV-1a hash of `bytes`.
fn fnv1a64(bytes: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const PRIME: u64 = 0x100000001b3;

    // FNV-1a: xor the byte in first, then multiply by the prime.
    bytes.iter().fold(OFFSET_BASIS, |hash, &byte| {
        (hash ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
2135
2136fn sha256_hex(bytes: &[u8]) -> String {
2137 sha256_digest_hex(Sha256::digest(bytes))
2138}
2139
/// Formats the bytes of `digest` as lowercase hexadecimal, two characters per byte.
fn sha256_digest_hex(digest: impl IntoIterator<Item = u8>) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    // Sized for a 32-byte digest.
    let mut hex = String::with_capacity(64);
    for byte in digest {
        hex.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
        hex.push(char::from(HEX_DIGITS[usize::from(byte & 0x0f)]));
    }
    hex
}
2147
2148#[cfg(test)]
2149mod tests {
2150 use super::*;
2151
2152 #[test]
2153 fn builds_surface_to_readings_index() {
2154 let csv = "\
2155印刷,18331,19434,9138,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢,*,*,*,*,*,*,体,インサツ,インサツ,インサツ,インサツ,0,C2,*,752349454934528,2737
2156刃,18521,20041,11551,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和,ハ濁,基本形,*,*,*,*,体,ハ,ハ,ハ,ハ,1,C3,*,8060803244761600,29325
2157刃,18419,19578,12664,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和,*,*,*,*,*,*,体,ヤイバ,ヤイバ,ヤイバ,ヤイバ,\"1,0\",C1,*,18677687522566656,67949
2158";
2159 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2160
2161 assert_eq!(
2162 index.readings("印刷").as_deref(),
2163 Some(&["インサツ".to_string()][..])
2164 );
2165 assert_eq!(
2166 index.readings("刃").as_deref(),
2167 Some(&["ハ".to_string(), "ヤイバ".to_string()][..])
2168 );
2169 }
2170
2171 #[test]
2172 fn skips_star_readings() {
2173 let csv = "記号,1,2,3,補助記号,一般,*,*,*,*,*,記号,記号,*,記号,*,記号\n";
2174 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2175
2176 assert!(index.is_empty());
2177 }
2178
2179 #[test]
2180 fn excludes_ascii_and_symbol_surfaces_by_default() {
2181 let csv = "\
2182a,1,2,3,記号,文字,*,*,*,*,エー,a,a,エー,a,エー,外
2183!,1,2,3,補助記号,一般,*,*,*,*,!,!,!,!,!,!,記号
2184印刷,1,2,3,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢
2185";
2186 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2187
2188 assert_eq!(index.readings("a"), None);
2189 assert_eq!(index.readings("!"), None);
2190 assert_eq!(
2191 index.readings("印刷").as_deref(),
2192 Some(&["インサツ".to_string()][..])
2193 );
2194 }
2195
2196 #[test]
2197 fn can_keep_ascii_surfaces_when_requested() {
2198 let csv = "a,1,2,3,名詞,普通名詞,一般,*,*,*,エー,a,a,エー,a,エー,外\n";
2199 let index = UnidicReadingIndex::from_lex_csv_reader_with_options(
2200 csv.as_bytes(),
2201 UnidicIndexOptions {
2202 exclude_ascii_surfaces: false,
2203 ..UnidicIndexOptions::default()
2204 },
2205 )
2206 .unwrap();
2207
2208 assert_eq!(
2209 index.readings("a").as_deref(),
2210 Some(&["エー".to_string()][..])
2211 );
2212 }
2213
2214 #[test]
2215 fn limits_readings_per_surface_when_requested() {
2216 let csv = "\
2217刃,1,2,3,名詞,普通名詞,一般,*,*,*,ジン,刃,刃,ジン,刃,ジン,漢
2218刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2219刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和
2220";
2221 let index = UnidicReadingIndex::from_lex_csv_reader_with_options(
2222 csv.as_bytes(),
2223 UnidicIndexOptions {
2224 max_readings_per_surface: Some(2),
2225 ..UnidicIndexOptions::default()
2226 },
2227 )
2228 .unwrap();
2229
2230 assert_eq!(
2231 index.readings("刃").as_deref(),
2232 Some(&["ジン".to_string(), "ハ".to_string()][..])
2233 );
2234 }
2235
2236 #[test]
2237 fn can_limit_readings_per_segment_at_query_time() {
2238 let csv = "\
2239刃,1,2,3,名詞,普通名詞,一般,*,*,*,ジン,刃,刃,ジン,刃,ジン,漢
2240刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2241刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和
2242";
2243 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2244 let readings = index.reading_sequences(
2245 "刃",
2246 DictionaryReadingOptions {
2247 max_readings_per_segment: Some(2),
2248 ..DictionaryReadingOptions::default()
2249 },
2250 );
2251
2252 assert_eq!(readings, vec!["ジン".to_string(), "ハ".to_string()]);
2253 }
2254
2255 #[test]
2256 fn builds_artifact_metadata_from_index_and_options() {
2257 let csv = "\
2258刃,1,2,3,名詞,普通名詞,一般,*,*,*,ジン,刃,刃,ジン,刃,ジン,漢
2259刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2260";
2261 let index_options = UnidicIndexOptions {
2262 reading_field: UnidicReadingField::Pron,
2263 max_readings_per_surface: Some(1),
2264 exclude_ascii_surfaces: true,
2265 exclude_symbol_pos: true,
2266 };
2267 let index =
2268 UnidicReadingIndex::from_lex_csv_reader_with_options(csv.as_bytes(), index_options)
2269 .unwrap();
2270
2271 let metadata = index.artifact_metadata(UnidicArtifactMetadataOptions {
2272 artifact_name: "moine-unidic-cwj-202512".to_string(),
2273 generator: "moine-cli".to_string(),
2274 payload_file_name: "moine-unidic-cwj-202512.readings.yaml".to_string(),
2275 payload_format: "yaml.surface-readings.v1".to_string(),
2276 source_name: "UniDic-CWJ".to_string(),
2277 source_version: "2025.12".to_string(),
2278 source_lex_csv: "unidic-cwj-202512_full/lex.csv".to_string(),
2279 index_options,
2280 query_defaults: DictionaryReadingOptions {
2281 longest_match_only: true,
2282 max_readings_per_segment: Some(16),
2283 ..DictionaryReadingOptions::default()
2284 },
2285 license: UnidicArtifactLicense::default(),
2286 });
2287
2288 assert_eq!(metadata.schema_version, 1);
2289 assert_eq!(metadata.artifact_type, "moine.unidic.reading-index");
2290 assert_eq!(
2291 metadata.payload.path,
2292 "moine-unidic-cwj-202512.readings.yaml"
2293 );
2294 assert_eq!(metadata.payload.format, "yaml.surface-readings.v1");
2295 assert_eq!(
2296 metadata.payload.checksum_algorithm,
2297 ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM
2298 );
2299 assert_eq!(metadata.payload.checksum.len(), 64);
2300 assert_eq!(metadata.source.version, "2025.12");
2301 assert_eq!(metadata.build.reading_field, "pron");
2302 assert_eq!(metadata.build.entries, 1);
2303 assert_eq!(metadata.build.max_readings_per_surface, Some(1));
2304 assert!(metadata.query_defaults.longest_match_only);
2305 assert_eq!(metadata.query_defaults.max_readings_per_segment, Some(16));
2306 assert_eq!(metadata.license.selected_license, "BSD-3-Clause");
2307 }
2308
2309 #[test]
2310 fn builds_deterministic_payload_entries() {
2311 let csv = "\
2312刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2313印刷,1,2,3,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢
2314刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和
2315";
2316 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2317 let payload = index.artifact_payload();
2318
2319 assert_eq!(payload.schema_version, 1);
2320 assert_eq!(
2321 payload.payload_type,
2322 "moine.unidic.reading-index.surface-readings"
2323 );
2324 assert_eq!(
2325 payload.entries,
2326 vec![
2327 UnidicReadingIndexPayloadEntry {
2328 surface: "刃".to_string(),
2329 readings: vec!["ハ".to_string(), "ヤイバ".to_string()],
2330 },
2331 UnidicReadingIndexPayloadEntry {
2332 surface: "印刷".to_string(),
2333 readings: vec!["インサツ".to_string()],
2334 },
2335 ]
2336 );
2337 }
2338
2339 #[test]
2340 fn payload_checksum_changes_with_payload_content() {
2341 let first = UnidicReadingIndex::from_lex_csv_reader(
2342 "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和\n".as_bytes(),
2343 )
2344 .unwrap();
2345 let second = UnidicReadingIndex::from_lex_csv_reader(
2346 "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和\n".as_bytes(),
2347 )
2348 .unwrap();
2349
2350 assert_eq!(first.artifact_payload_checksum().len(), 64);
2351 assert_eq!(
2352 first.artifact_payload_checksum(),
2353 first
2354 .artifact_payload_checksum_for_algorithm(ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM)
2355 .unwrap()
2356 );
2357 assert_eq!(
2358 first
2359 .artifact_payload_checksum_for_algorithm(LEGACY_ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM)
2360 .unwrap()
2361 .len(),
2362 16
2363 );
2364 assert_ne!(
2365 first.artifact_payload_checksum(),
2366 second.artifact_payload_checksum()
2367 );
2368 }
2369
2370 #[test]
2371 fn loads_artifact_payload_back_into_index() {
2372 let payload = UnidicReadingIndexPayload {
2373 schema_version: 1,
2374 payload_type: "moine.unidic.reading-index.surface-readings".to_string(),
2375 entries: vec![UnidicReadingIndexPayloadEntry {
2376 surface: "印刷".to_string(),
2377 readings: vec!["インサツ".to_string()],
2378 }],
2379 };
2380
2381 let index = UnidicReadingIndex::from_artifact_payload(payload).unwrap();
2382
2383 assert_eq!(index.len(), 1);
2384 assert_eq!(
2385 index.readings("印刷").as_deref(),
2386 Some(&["インサツ".to_string()][..])
2387 );
2388 }
2389
    /// A YAML payload document read through `from_artifact_payload_reader` yields an
    /// index that returns the listed readings for the surface, in document order.
    #[test]
    fn loads_artifact_payload_reader() {
        // The literal starts with `\` so the document begins at `schema_version`
        // with no leading newline; indentation inside the literal is significant YAML.
        let yaml = "\
schema_version: 1
payload_type: moine.unidic.reading-index.surface-readings
entries:
- surface: 刃
  readings:
  - ハ
  - ヤイバ
";

        let index = UnidicReadingIndex::from_artifact_payload_reader(yaml.as_bytes()).unwrap();

        assert_eq!(
            index.readings("刃").as_deref(),
            Some(&["ハ".to_string(), "ヤイバ".to_string()][..])
        );
    }
2409
2410 #[test]
2411 fn binary_artifact_payload_round_trips_to_equivalent_index() {
2412 let csv = "\
2413刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2414刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和
2415印刷,1,2,3,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢
2416";
2417 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2418 let mut bytes = Vec::new();
2419
2420 index.write_artifact_binary_payload(&mut bytes).unwrap();
2421 let loaded = UnidicReadingIndex::from_binary_artifact_payload_reader(bytes.as_slice())
2422 .expect("binary payload should load");
2423 let header = UnidicReadingIndex::binary_artifact_payload_header_reader(bytes.as_slice())
2424 .expect("binary payload header should load");
2425
2426 assert_eq!(
2427 header,
2428 UnidicBinaryArtifactPayloadHeader {
2429 version: 1,
2430 entries: 2,
2431 }
2432 );
2433 assert_eq!(loaded.artifact_payload(), index.artifact_payload());
2434 assert_eq!(
2435 loaded.artifact_payload_checksum(),
2436 index.artifact_payload_checksum()
2437 );
2438 }
2439
2440 #[test]
2441 fn indexed_artifact_payload_round_trips_and_supports_lookup() {
2442 let csv = "\
2443刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2444刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和
2445印刷,1,2,3,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢
2446";
2447 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2448 let mut bytes = Vec::new();
2449 index.write_indexed_artifact_payload(&mut bytes).unwrap();
2450
2451 let unique = std::time::SystemTime::now()
2452 .duration_since(std::time::UNIX_EPOCH)
2453 .unwrap()
2454 .as_nanos();
2455 let path = std::env::temp_dir().join(format!(
2456 "moine-indexed-test-{}-{}.moineidx",
2457 std::process::id(),
2458 unique
2459 ));
2460 std::fs::write(&path, &bytes).unwrap();
2461 let loaded = UnidicReadingIndex::from_indexed_artifact_payload_path(&path)
2462 .expect("indexed payload should load");
2463 let _ = std::fs::remove_file(&path);
2464 let loaded_from_bytes = UnidicReadingIndex::from_indexed_artifact_payload_bytes(&bytes)
2465 .expect("indexed payload bytes should load");
2466
2467 assert_eq!(loaded.len(), 2);
2468 assert_eq!(
2469 loaded.readings("刃").as_deref(),
2470 Some(&["ハ".to_string(), "ヤイバ".to_string()][..])
2471 );
2472 assert_eq!(
2473 loaded_from_bytes.artifact_payload(),
2474 index.artifact_payload()
2475 );
2476 assert_eq!(loaded.artifact_payload(), index.artifact_payload());
2477 assert_eq!(
2478 loaded.artifact_payload_checksum(),
2479 index.artifact_payload_checksum()
2480 );
2481 assert_eq!(
2482 loaded.reading_sequences("印刷", DictionaryReadingOptions::default()),
2483 vec!["インサツ".to_string()]
2484 );
2485 }
2486
2487 #[test]
2488 fn binary_artifact_payload_uses_stable_little_endian_layout() {
2489 let csv = "\
2490刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2491刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和
2492印刷,1,2,3,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢
2493";
2494 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2495 let mut bytes = Vec::new();
2496
2497 index.write_artifact_binary_payload(&mut bytes).unwrap();
2498
2499 #[rustfmt::skip]
2500 let expected = vec![
2501 b'M', b'O', b'I', b'N', b'E', b'U', b'0', b'1',
2502 1, 0, 0, 0,
2503 0, 0, 0, 0,
2504 2, 0, 0, 0, 0, 0, 0, 0,
2505 3, 0, 0, 0, 0xe5, 0x88, 0x83,
2506 2, 0, 0, 0,
2507 3, 0, 0, 0, 0xe3, 0x83, 0x8f,
2508 9, 0, 0, 0, 0xe3, 0x83, 0xa4, 0xe3, 0x82, 0xa4, 0xe3, 0x83, 0x90,
2509 6, 0, 0, 0, 0xe5, 0x8d, 0xb0, 0xe5, 0x88, 0xb7,
2510 1, 0, 0, 0,
2511 12, 0, 0, 0, 0xe3, 0x82, 0xa4, 0xe3, 0x83, 0xb3, 0xe3, 0x82, 0xb5, 0xe3, 0x83, 0x84,
2512 ];
2513 assert_eq!(bytes, expected);
2514 }
2515
2516 #[test]
2517 fn rejects_binary_artifact_bad_magic() {
2518 let bytes = *b"NOTMOINE";
2519 let err =
2520 UnidicReadingIndex::from_binary_artifact_payload_reader(bytes.as_slice()).unwrap_err();
2521
2522 assert!(matches!(
2523 err,
2524 UnidicArtifactPayloadError::InvalidBinaryMagic { .. }
2525 ));
2526 }
2527
2528 #[test]
2529 fn rejects_binary_artifact_unsupported_version() {
2530 let mut bytes = Vec::new();
2531 bytes.extend_from_slice(b"MOINEU01");
2532 bytes.extend_from_slice(&2_u32.to_le_bytes());
2533 bytes.extend_from_slice(&0_u32.to_le_bytes());
2534 bytes.extend_from_slice(&0_u64.to_le_bytes());
2535
2536 let err =
2537 UnidicReadingIndex::from_binary_artifact_payload_reader(bytes.as_slice()).unwrap_err();
2538
2539 assert!(matches!(
2540 err,
2541 UnidicArtifactPayloadError::UnsupportedBinaryVersion { version: 2 }
2542 ));
2543 }
2544
2545 #[test]
2546 fn rejects_binary_artifact_truncated_string() {
2547 let mut bytes = Vec::new();
2548 bytes.extend_from_slice(b"MOINEU01");
2549 bytes.extend_from_slice(&1_u32.to_le_bytes());
2550 bytes.extend_from_slice(&0_u32.to_le_bytes());
2551 bytes.extend_from_slice(&1_u64.to_le_bytes());
2552 bytes.extend_from_slice(&4_u32.to_le_bytes());
2553 bytes.extend_from_slice("刃".as_bytes());
2554
2555 let err =
2556 UnidicReadingIndex::from_binary_artifact_payload_reader(bytes.as_slice()).unwrap_err();
2557
2558 assert!(matches!(
2559 err,
2560 UnidicArtifactPayloadError::TruncatedBinary { field: "surface" }
2561 ));
2562 }
2563
2564 #[test]
2565 fn rejects_binary_artifact_invalid_utf8() {
2566 let mut bytes = Vec::new();
2567 bytes.extend_from_slice(b"MOINEU01");
2568 bytes.extend_from_slice(&1_u32.to_le_bytes());
2569 bytes.extend_from_slice(&0_u32.to_le_bytes());
2570 bytes.extend_from_slice(&1_u64.to_le_bytes());
2571 bytes.extend_from_slice(&1_u32.to_le_bytes());
2572 bytes.push(0xff);
2573 bytes.extend_from_slice(&0_u32.to_le_bytes());
2574
2575 let err =
2576 UnidicReadingIndex::from_binary_artifact_payload_reader(bytes.as_slice()).unwrap_err();
2577
2578 assert!(matches!(
2579 err,
2580 UnidicArtifactPayloadError::InvalidBinaryUtf8 {
2581 field: "surface",
2582 ..
2583 }
2584 ));
2585 }
2586
2587 #[test]
2588 fn rejects_binary_artifact_excessive_entry_count() {
2589 let mut bytes = Vec::new();
2590 bytes.extend_from_slice(b"MOINEU01");
2591 bytes.extend_from_slice(&1_u32.to_le_bytes());
2592 bytes.extend_from_slice(&0_u32.to_le_bytes());
2593 bytes.extend_from_slice(&((MAX_ARTIFACT_ENTRIES as u64) + 1).to_le_bytes());
2594
2595 let err =
2596 UnidicReadingIndex::from_binary_artifact_payload_reader(bytes.as_slice()).unwrap_err();
2597
2598 assert!(matches!(
2599 err,
2600 UnidicArtifactPayloadError::ArtifactLimitExceeded {
2601 field: "entry_count",
2602 ..
2603 }
2604 ));
2605 }
2606
2607 #[test]
2608 fn rejects_artifact_payload_duplicate_surfaces() {
2609 let payload = UnidicReadingIndexPayload {
2610 schema_version: 1,
2611 payload_type: "moine.unidic.reading-index.surface-readings".to_string(),
2612 entries: vec![
2613 UnidicReadingIndexPayloadEntry {
2614 surface: "刃".to_string(),
2615 readings: vec!["ハ".to_string()],
2616 },
2617 UnidicReadingIndexPayloadEntry {
2618 surface: "刃".to_string(),
2619 readings: vec!["ヤイバ".to_string()],
2620 },
2621 ],
2622 };
2623
2624 let err = UnidicReadingIndex::from_artifact_payload(payload).unwrap_err();
2625
2626 assert!(matches!(
2627 err,
2628 UnidicArtifactPayloadError::DuplicateSurface { surface } if surface == "刃"
2629 ));
2630 }
2631
2632 #[test]
2633 fn rejects_artifact_payload_duplicate_readings() {
2634 let payload = UnidicReadingIndexPayload {
2635 schema_version: 1,
2636 payload_type: "moine.unidic.reading-index.surface-readings".to_string(),
2637 entries: vec![UnidicReadingIndexPayloadEntry {
2638 surface: "刃".to_string(),
2639 readings: vec!["ハ".to_string(), "ハ".to_string()],
2640 }],
2641 };
2642
2643 let err = UnidicReadingIndex::from_artifact_payload(payload).unwrap_err();
2644
2645 assert!(matches!(
2646 err,
2647 UnidicArtifactPayloadError::DuplicateReading { surface, reading }
2648 if surface == "刃" && reading == "ハ"
2649 ));
2650 }
2651
2652 #[test]
2653 fn rejects_artifact_payload_excessive_reading_count() {
2654 let payload = UnidicReadingIndexPayload {
2655 schema_version: 1,
2656 payload_type: "moine.unidic.reading-index.surface-readings".to_string(),
2657 entries: vec![UnidicReadingIndexPayloadEntry {
2658 surface: "刃".to_string(),
2659 readings: vec!["ハ".to_string(); MAX_ARTIFACT_READINGS_PER_ENTRY + 1],
2660 }],
2661 };
2662
2663 let err = UnidicReadingIndex::from_artifact_payload(payload).unwrap_err();
2664
2665 assert!(matches!(
2666 err,
2667 UnidicArtifactPayloadError::ArtifactLimitExceeded {
2668 field: "reading_count",
2669 ..
2670 }
2671 ));
2672 }
2673
2674 #[test]
2675 fn rejects_artifact_payload_schema_mismatch() {
2676 let payload = UnidicReadingIndexPayload {
2677 schema_version: 2,
2678 payload_type: "moine.unidic.reading-index.surface-readings".to_string(),
2679 entries: Vec::new(),
2680 };
2681
2682 let err = UnidicReadingIndex::from_artifact_payload(payload).unwrap_err();
2683
2684 assert!(matches!(
2685 err,
2686 UnidicArtifactPayloadError::UnsupportedSchemaVersion { version: 2 }
2687 ));
2688 }
2689
2690 #[test]
2691 fn reports_reading_expansion_stats() {
2692 let csv = "\
2693刃,1,2,3,名詞,普通名詞,一般,*,*,*,ジン,刃,刃,ジン,刃,ジン,漢
2694刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2695刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和
2696";
2697 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2698 let expansion = index.reading_paths_with_stats(
2699 "刃",
2700 DictionaryReadingOptions {
2701 max_readings_per_segment: Some(2),
2702 ..DictionaryReadingOptions::default()
2703 },
2704 );
2705
2706 assert_eq!(expansion.paths.len(), 2);
2707 assert_eq!(
2708 expansion.stats,
2709 DictionaryReadingStats {
2710 matched_spans: 1,
2711 direct_fallback_spans: 0,
2712 longest_match_pruned_spans: 0,
2713 raw_segment_readings: 3,
2714 used_segment_readings: 2,
2715 pruned_segment_readings: 1,
2716 candidate_combinations: 2,
2717 unique_paths: 2,
2718 duplicate_joined_readings: 0,
2719 max_paths_hit_count: 0,
2720 }
2721 );
2722 }
2723
2724 #[test]
2725 fn reports_longest_match_and_path_limit_stats() {
2726 let csv = "\
2727茶,1,2,3,名詞,普通名詞,一般,*,*,*,チャ,茶,茶,チャ,茶,チャ,和
2728道,1,2,3,名詞,普通名詞,一般,*,*,*,ミチ,道,道,ミチ,道,ミチ,和
2729道具,1,2,3,名詞,普通名詞,一般,*,*,*,ドウグ,道具,道具,ドーグ,道具,ドーグ,和
2730具,1,2,3,名詞,普通名詞,一般,*,*,*,グ,具,具,グ,具,グ,和
2731";
2732 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2733 let expansion = index.reading_paths_with_stats(
2734 "茶道具",
2735 DictionaryReadingOptions {
2736 longest_match_only: true,
2737 max_paths: 1,
2738 ..DictionaryReadingOptions::default()
2739 },
2740 );
2741
2742 assert_eq!(expansion.paths.len(), 1);
2743 assert!(expansion.stats.longest_match_pruned_spans > 0);
2744 assert!(expansion.stats.max_paths_hit_count > 0);
2745 }
2746
2747 #[test]
2748 fn hybrid_reading_paths_use_direct_fallback_for_kana_ascii_spans() {
2749 let csv = "\
2750印,1,2,3,名詞,普通名詞,一般,*,*,*,イン,印,印,イン,印,イン,漢
2751";
2752 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2753 let expansion =
2754 index.hybrid_reading_paths_with_stats("印さt", DictionaryReadingOptions::default());
2755
2756 assert_eq!(
2757 expansion.paths,
2758 vec![DictionaryReadingPath {
2759 joined_reading: "インさt".to_string(),
2760 segments: vec![
2761 DictionaryReadingSegment {
2762 surface: "印".to_string(),
2763 reading: "イン".to_string(),
2764 },
2765 DictionaryReadingSegment {
2766 surface: "さt".to_string(),
2767 reading: "さt".to_string(),
2768 },
2769 ],
2770 }]
2771 );
2772 assert_eq!(expansion.stats.direct_fallback_spans, 2);
2773 }
2774
2775 #[test]
2776 fn hybrid_reading_paths_keep_shorter_dictionary_spans_for_direct_tail() {
2777 let csv = "\
2778印,1,2,3,名詞,普通名詞,一般,*,*,*,イン,印,印,イン,印,イン,漢
2779印さ,1,2,3,動詞,一般,*,*,*,*,シルス,印す,印す,シルス,印す,シルス,和
2780";
2781 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2782 let expansion = index.hybrid_reading_paths_with_stats(
2783 "印さt",
2784 DictionaryReadingOptions {
2785 longest_match_only: true,
2786 ..DictionaryReadingOptions::default()
2787 },
2788 );
2789
2790 assert!(expansion
2791 .paths
2792 .iter()
2793 .any(|path| path.joined_reading == "インさt"));
2794 assert_eq!(expansion.stats.longest_match_pruned_spans, 0);
2795 }
2796
2797 #[test]
2798 fn hybrid_reading_paths_still_reject_uncovered_kanji() {
2799 let index = UnidicReadingIndex::default();
2800 let expansion =
2801 index.hybrid_reading_paths_with_stats("未知z", DictionaryReadingOptions::default());
2802
2803 assert!(expansion.paths.is_empty());
2804 assert_eq!(expansion.stats.direct_fallback_spans, 1);
2805 }
2806
2807 #[test]
2808 fn can_use_pron_instead_of_lform() {
2809 let csv = "\
2810刃,18521,20041,11551,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和,ハ濁,基本形,*,*,*,*,体,ハ,ハ,ハ,ハ,1,C3,*,8060803244761600,29325
2811刃,18521,20055,14836,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,バ,刃,バ,和,ハ濁,濁音形,*,*,*,*,体,バ,バ,バ,ハ,1,C3,*,8060803244769792,29325
2812";
2813 let index = UnidicReadingIndex::from_lex_csv_reader_with_field(
2814 csv.as_bytes(),
2815 UnidicReadingField::Pron,
2816 )
2817 .unwrap();
2818
2819 assert_eq!(
2820 index.readings("刃").as_deref(),
2821 Some(&["ハ".to_string(), "バ".to_string()][..])
2822 );
2823 }
2824
2825 #[test]
2826 fn builds_reading_sequences_from_dictionary_segments() {
2827 let csv = "\
2828鬼滅,1,2,3,名詞,普通名詞,一般,*,*,*,キメツ,鬼滅,鬼滅,キメツ,鬼滅,キメツ,固
2829の,1,2,3,助詞,格助詞,*,*,*,*,ノ,の,の,ノ,の,ノ,和
2830刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2831刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和
2832";
2833 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2834 let readings = index.reading_sequences("鬼滅の刃", DictionaryReadingOptions::default());
2835
2836 assert_eq!(
2837 readings,
2838 vec!["キメツノハ".to_string(), "キメツノヤイバ".to_string()]
2839 );
2840 }
2841
2842 #[test]
2843 fn reading_paths_keep_segmentation_and_segment_readings() {
2844 let csv = "\
2845茶,1,2,3,名詞,普通名詞,一般,*,*,*,チャ,茶,茶,チャ,茶,チャ,和
2846道具,1,2,3,名詞,普通名詞,一般,*,*,*,ドウグ,道具,道具,ドーグ,道具,ドーグ,漢
2847";
2848 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2849 let paths = index.reading_paths(
2850 "茶道具",
2851 DictionaryReadingOptions {
2852 longest_match_only: true,
2853 ..DictionaryReadingOptions::default()
2854 },
2855 );
2856
2857 assert_eq!(
2858 paths,
2859 vec![DictionaryReadingPath {
2860 joined_reading: "チャドウグ".to_string(),
2861 segments: vec![
2862 DictionaryReadingSegment {
2863 surface: "茶".to_string(),
2864 reading: "チャ".to_string(),
2865 },
2866 DictionaryReadingSegment {
2867 surface: "道具".to_string(),
2868 reading: "ドウグ".to_string(),
2869 },
2870 ],
2871 }]
2872 );
2873 }
2874
2875 #[test]
2876 fn builds_romaji_lattice_from_dictionary_segments() {
2877 let csv = "\
2878茶,1,2,3,名詞,普通名詞,一般,*,*,*,チャ,茶,茶,チャ,茶,チャ,和
2879道具,1,2,3,名詞,普通名詞,一般,*,*,*,ドウグ,道具,道具,ドーグ,道具,ドーグ,和
2880";
2881 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2882 let lattice = index
2883 .romaji_lattice("茶道具", DictionaryReadingOptions::default())
2884 .unwrap()
2885 .unwrap();
2886
2887 assert_eq!(
2888 moine_core::distance(&lattice, &Lattice::from_paths(["chadougu"])),
2889 0
2890 );
2891 }
2892
2893 #[test]
2894 fn builds_romaji_lattice_directly_from_reading_paths() {
2895 let paths = vec![
2896 DictionaryReadingPath {
2897 joined_reading: "チャドウグ".to_string(),
2898 segments: vec![
2899 DictionaryReadingSegment {
2900 surface: "茶".to_string(),
2901 reading: "チャ".to_string(),
2902 },
2903 DictionaryReadingSegment {
2904 surface: "道具".to_string(),
2905 reading: "ドウグ".to_string(),
2906 },
2907 ],
2908 },
2909 DictionaryReadingPath {
2910 joined_reading: "チャドーグ".to_string(),
2911 segments: vec![
2912 DictionaryReadingSegment {
2913 surface: "茶".to_string(),
2914 reading: "チャ".to_string(),
2915 },
2916 DictionaryReadingSegment {
2917 surface: "道具".to_string(),
2918 reading: "ドーグ".to_string(),
2919 },
2920 ],
2921 },
2922 ];
2923 let lattice = romaji_lattice_from_reading_paths(&paths).unwrap();
2924
2925 assert_eq!(
2926 moine_core::distance(&lattice, &Lattice::from_paths(["chadougu"])),
2927 0
2928 );
2929 assert_eq!(
2930 moine_core::distance(&lattice, &Lattice::from_paths(["chadoogu"])),
2931 0
2932 );
2933 }
2934
2935 #[test]
2936 fn structured_reading_paths_keep_cross_segment_context() {
2937 let paths = vec![DictionaryReadingPath {
2938 joined_reading: "マッチャ".to_string(),
2939 segments: vec![
2940 DictionaryReadingSegment {
2941 surface: "抹".to_string(),
2942 reading: "マッ".to_string(),
2943 },
2944 DictionaryReadingSegment {
2945 surface: "茶".to_string(),
2946 reading: "チャ".to_string(),
2947 },
2948 ],
2949 }];
2950 let lattice = romaji_lattice_from_reading_paths(&paths).unwrap();
2951
2952 assert_eq!(
2953 moine_core::distance(&lattice, &Lattice::from_paths(["maccha"])),
2954 0
2955 );
2956 assert_eq!(
2957 moine_core::distance(&lattice, &Lattice::from_paths(["mattya"])),
2958 0
2959 );
2960 }
2961
2962 #[test]
2963 fn can_restrict_reading_sequences_to_longest_matches() {
2964 let csv = "\
2965茶,1,2,3,名詞,普通名詞,一般,*,*,*,チャ,茶,茶,チャ,茶,チャ,和
2966道,1,2,3,名詞,普通名詞,一般,*,*,*,ミチ,道,道,ミチ,道,ミチ,和
2967道具,1,2,3,名詞,普通名詞,一般,*,*,*,ドウグ,道具,道具,ドーグ,道具,ドーグ,和
2968具,1,2,3,名詞,普通名詞,一般,*,*,*,グ,具,具,グ,具,グ,和
2969";
2970 let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2971 let readings = index.reading_sequences(
2972 "茶道具",
2973 DictionaryReadingOptions {
2974 longest_match_only: true,
2975 ..DictionaryReadingOptions::default()
2976 },
2977 );
2978
2979 assert_eq!(readings, vec!["チャドウグ".to_string()]);
2980 }
2981}