Skip to main content

moine_ja/
unidic.rs

1use std::borrow::Cow;
2use std::collections::{btree_map::Entry, BTreeSet, HashMap};
3use std::error::Error;
4use std::fmt;
5use std::fmt::Write as _;
6use std::fs::File;
7use std::io::{Read, Write};
8use std::path::Path;
9use std::string::FromUtf8Error;
10use std::sync::Arc;
11
12use fst::{Map, MapBuilder, Streamer};
13use memmap2::Mmap;
14use moine_core::Lattice;
15use serde::{Deserialize, Serialize};
16use sha2::{Digest, Sha256};
17
18use crate::romaji::{
19    can_build_romaji_paths, romaji_paths_from_reading_segments,
20    romaji_symbol_paths_from_reading_segments, JaLatticeError,
21};
22
/// `lex.csv` column index of the surface form.
const SURFACE_COLUMN: usize = 0;
/// `lex.csv` column index handed to `is_symbol_pos` when symbol entries are excluded.
const POS1_COLUMN: usize = 4;
/// `lex.csv` column index read for [`UnidicReadingField::LForm`].
const LFORM_COLUMN: usize = 10;
/// `lex.csv` column index read for [`UnidicReadingField::Pron`].
const PRON_COLUMN: usize = 13;
/// Payload schema version constant.
// NOTE(review): usage is outside this view; presumably the only version accepted by
// `validate_artifact_payload_header` — confirm.
const ARTIFACT_PAYLOAD_SCHEMA_VERSION: u32 = 1;
/// Payload type identifier constant.
// NOTE(review): presumably compared against `UnidicReadingIndexPayload::payload_type` — confirm.
const ARTIFACT_PAYLOAD_TYPE: &str = "moine.unidic.reading-index.surface-readings";
/// Eight-byte magic value for the binary artifact format.
const BINARY_ARTIFACT_MAGIC: &[u8; 8] = b"MOINEU01";
/// Version number for the binary artifact format.
const BINARY_ARTIFACT_VERSION: u32 = 1;
/// Eight-byte magic value for the indexed artifact format.
const INDEXED_ARTIFACT_MAGIC: &[u8; 8] = b"MOINEI01";
/// Version number for the indexed artifact format.
const INDEXED_ARTIFACT_VERSION: u32 = 1;
/// Indexed artifact header length (presumably in bytes; the layout is not visible here — confirm).
const INDEXED_ARTIFACT_HEADER_LEN: usize = 40;
/// Payload size ceiling: 512 MiB expressed in bytes.
const MAX_ARTIFACT_PAYLOAD_BYTES: u64 = 512 * 1024 * 1024;
/// Limit passed to `check_limit` for the number of payload entries.
const MAX_ARTIFACT_ENTRIES: usize = 2_000_000;
/// Limit passed to `check_limit` for the number of readings in one entry.
const MAX_ARTIFACT_READINGS_PER_ENTRY: usize = 256;
/// Limit passed to `check_limit` for the byte length of one surface or reading (16 KiB).
const MAX_ARTIFACT_STRING_BYTES: usize = 16 * 1024;
/// Current canonical checksum algorithm for normalized UniDic payload content.
pub const ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM: &str = "sha256-canonical-v1";
/// Legacy canonical checksum algorithm accepted for older UniDic artifacts.
pub const LEGACY_ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM: &str = "fnv1a64-canonical-v1";
/// File digest algorithm used to verify payload bytes before loading.
pub const ARTIFACT_PAYLOAD_FILE_DIGEST_ALGORITHM: &str = "sha256-file-v1";
44
/// UniDic-derived surface-to-reading index.
///
/// Equality is defined by comparing the values returned by `artifact_payload()`
/// on both sides, not by comparing the storage representation.
#[derive(Clone, Debug)]
pub struct UnidicReadingIndex {
    /// Backing storage for the index data.
    storage: UnidicReadingStorage,
}
50
/// Backing representation of a [`UnidicReadingIndex`].
#[derive(Clone, Debug)]
enum UnidicReadingStorage {
    /// In-memory hash map; the `Default` index uses this variant with an empty map.
    Eager(HashMap<String, Vec<String>>),
    /// Storage backed by an [`IndexedUnidicPayload`].
    Indexed(IndexedUnidicPayload),
}
56
57impl Default for UnidicReadingIndex {
58    fn default() -> Self {
59        Self {
60            storage: UnidicReadingStorage::Eager(HashMap::new()),
61        }
62    }
63}
64
65impl PartialEq for UnidicReadingIndex {
66    fn eq(&self, other: &Self) -> bool {
67        self.artifact_payload() == other.artifact_payload()
68    }
69}
70
71impl Eq for UnidicReadingIndex {}
72
/// State held by the indexed storage variant: a shared memory map plus an FST map.
// NOTE(review): the loader is outside this view; the field roles below are
// inferred from names and types and should be confirmed against it.
#[derive(Clone, Debug)]
struct IndexedUnidicPayload {
    /// Shared memory map (presumably of the indexed payload file — confirm).
    mmap: Arc<Mmap>,
    /// FST map. `InvalidIndexedOffset` suggests its values are offsets into the
    /// readings section — confirm.
    map: Map<Vec<u8>>,
    /// Presumably the offset at which the readings section begins — confirm.
    readings_start: usize,
    /// Entry count (presumably taken from the payload header — confirm).
    entries: usize,
}
80
/// Header for indexed FST UniDic payloads.
///
/// This is a plain `Copy` value type; it carries the format version, the entry
/// count, and the lengths of the two payload sections.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct UnidicIndexedArtifactPayloadHeader {
    /// Indexed payload format version.
    pub version: u32,
    /// Number of entries in the payload.
    pub entries: usize,
    /// Length of the embedded FST section in bytes.
    pub fst_len: usize,
    /// Length of the reading blob section in bytes.
    pub readings_len: usize,
}
93
/// Header for legacy binary UniDic payloads.
///
/// This is a plain `Copy` value type carrying only the format version and the
/// entry count.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct UnidicBinaryArtifactPayloadHeader {
    /// Binary payload format version.
    pub version: u32,
    /// Number of entries in the payload.
    pub entries: usize,
}
102
/// Controls dictionary reading-path expansion.
///
/// The `Default` value uses `max_span_chars = 8`, `max_paths = 1024`,
/// `longest_match_only = false`, and no per-segment reading cap.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct DictionaryReadingOptions {
    /// Maximum surface span length considered for one dictionary segment.
    pub max_span_chars: usize,
    /// Maximum complete reading paths to keep.
    pub max_paths: usize,
    /// Prefer the longest dictionary span when multiple spans start together.
    pub longest_match_only: bool,
    /// Optional cap on readings used per dictionary segment (`None` means no cap is configured).
    pub max_readings_per_segment: Option<usize>,
}
115
/// One surface segment and its selected UniDic reading.
///
/// The romaji helpers in this module consume only the `reading` of each segment.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DictionaryReadingSegment {
    /// Surface text covered by the segment.
    pub surface: String,
    /// Reading selected for the segment.
    pub reading: String,
}
124
/// One complete segmentation and joined reading for an input string.
///
/// `romaji_lattice_from_reading_paths` and `romaji_paths_from_reading_paths`
/// read only `segments[..].reading`; they do not use `joined_reading`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DictionaryReadingPath {
    /// Ordered dictionary/direct segments in the path.
    pub segments: Vec<DictionaryReadingSegment>,
    /// Segment readings concatenated into one reading string.
    pub joined_reading: String,
}
133
/// Reading-path expansion result plus pruning statistics.
///
/// The derived `Default` yields no paths and default (zeroed) statistics.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct DictionaryReadingExpansion {
    /// Expanded reading paths.
    pub paths: Vec<DictionaryReadingPath>,
    /// Statistics gathered during expansion.
    pub stats: DictionaryReadingStats,
}
142
/// Counters describing dictionary reading-path expansion.
///
/// Every field is a `usize` counter; the derived `Default` sets all of them to zero.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct DictionaryReadingStats {
    /// Dictionary spans matched during expansion.
    pub matched_spans: usize,
    /// Direct fallback spans used when no dictionary span matched.
    pub direct_fallback_spans: usize,
    /// Candidate spans pruned by longest-match mode.
    pub longest_match_pruned_spans: usize,
    /// Raw readings seen before per-segment pruning.
    pub raw_segment_readings: usize,
    /// Readings retained after per-segment pruning.
    pub used_segment_readings: usize,
    /// Readings removed by per-segment pruning.
    pub pruned_segment_readings: usize,
    /// Candidate path combinations considered.
    pub candidate_combinations: usize,
    /// Unique complete reading paths retained.
    pub unique_paths: usize,
    /// Duplicate joined readings removed.
    pub duplicate_joined_readings: usize,
    /// Number of times the `max_paths` cap was hit.
    pub max_paths_hit_count: usize,
}
167
168/// Builds a compact romaji lattice from dictionary reading paths.
169pub fn romaji_lattice_from_reading_paths(
170    paths: &[DictionaryReadingPath],
171) -> Result<Lattice, JaLatticeError> {
172    if paths.is_empty() {
173        return Err(JaLatticeError::EmptyReadings);
174    }
175
176    let paths = romaji_symbol_paths_from_reading_segments(
177        paths
178            .iter()
179            .map(|path| path.segments.iter().map(|segment| segment.reading.as_str())),
180    )?;
181    Ok(Lattice::from_symbol_paths_compact(paths))
182}
183
184/// Expands dictionary reading paths into explicit romaji strings.
185pub fn romaji_paths_from_reading_paths(
186    paths: &[DictionaryReadingPath],
187) -> Result<Vec<String>, JaLatticeError> {
188    if paths.is_empty() {
189        return Err(JaLatticeError::EmptyReadings);
190    }
191
192    romaji_paths_from_reading_segments(
193        paths
194            .iter()
195            .map(|path| path.segments.iter().map(|segment| segment.reading.as_str())),
196    )
197}
198
/// UniDic CSV field used as the source reading.
///
/// `LForm` is read from column 10 and `Pron` from column 13 of a `lex.csv`
/// record; their stable artifact strings are `"lform"` and `"pron"`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnidicReadingField {
    /// Lemma-form reading column.
    LForm,
    /// Pronunciation column.
    Pron,
}
207
/// Metadata stored in a UniDic dictionary bundle.
///
/// Serialized and deserialized through serde with no field renames, so the
/// Rust field names are the serialized keys.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct UnidicArtifactMetadata {
    /// Metadata schema version.
    pub schema_version: u32,
    /// Artifact type identifier.
    pub artifact_type: String,
    /// Human-readable artifact name.
    pub artifact_name: String,
    /// Tool or command that generated the artifact.
    pub generator: String,
    /// Payload file metadata.
    pub payload: UnidicArtifactPayload,
    /// Source dictionary metadata.
    pub source: UnidicArtifactSource,
    /// Build-time options and counts.
    pub build: UnidicArtifactBuild,
    /// Default query options for this artifact.
    pub query_defaults: UnidicArtifactQueryDefaults,
    /// License metadata and references.
    pub license: UnidicArtifactLicense,
}
230
/// Payload file metadata for a UniDic dictionary bundle.
///
/// The two file-digest fields are optional: when absent from the input they
/// deserialize to `None`, and `None` values are omitted when serializing.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct UnidicArtifactPayload {
    /// Bundle-relative payload file path.
    pub path: String,
    /// Payload serialization format.
    pub format: String,
    /// Optional digest algorithm for the raw payload file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub file_digest_algorithm: Option<String>,
    /// Optional digest of the raw payload file.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub file_digest: Option<String>,
    /// Canonical payload checksum algorithm.
    pub checksum_algorithm: String,
    /// Canonical payload checksum.
    pub checksum: String,
}
249
/// Source dictionary metadata for a UniDic artifact.
///
/// All three fields are free-form strings; no validation is attached to this type.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct UnidicArtifactSource {
    /// Source dictionary name.
    pub name: String,
    /// Source dictionary version.
    pub version: String,
    /// Source `lex.csv` path used to build the artifact.
    pub lex_csv: String,
}
260
/// Build settings and counts recorded in UniDic artifact metadata.
///
/// The option fields parallel [`UnidicIndexOptions`], with the reading field
/// stored as a string.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct UnidicArtifactBuild {
    /// UniDic reading field used for entries.
    // NOTE(review): presumably one of the `UnidicReadingField::as_str` values — confirm at the writer.
    pub reading_field: String,
    /// Optional cap applied to readings stored per surface.
    pub max_readings_per_surface: Option<usize>,
    /// Whether ASCII-only surfaces were excluded.
    pub exclude_ascii_surfaces: bool,
    /// Whether symbol part-of-speech entries were excluded.
    pub exclude_symbol_pos: bool,
    /// Number of entries in the generated payload.
    pub entries: usize,
}
275
/// Default reading-path query settings stored in an artifact.
///
/// Field names and types match [`DictionaryReadingOptions`]; this type adds
/// serde support so the settings can be stored in artifact metadata.
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct UnidicArtifactQueryDefaults {
    /// Maximum surface span length considered for one segment.
    pub max_span_chars: usize,
    /// Maximum complete reading paths to keep.
    pub max_paths: usize,
    /// Whether longest-match-only expansion should be used by default.
    pub longest_match_only: bool,
    /// Optional cap on readings used per segment.
    pub max_readings_per_segment: Option<usize>,
}
288
/// License metadata for a UniDic-derived artifact.
///
/// The `Default` value selects `"BSD-3-Clause"` and references `license/BSD`
/// and `license/COPYING`.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct UnidicArtifactLicense {
    /// Selected license label for the artifact.
    pub selected_license: String,
    /// Bundle-relative license or notice files.
    pub references: Vec<UnidicArtifactLicenseReference>,
}
297
/// One license or notice file referenced by artifact metadata.
///
/// The default license metadata uses the label/path pairs `BSD` → `license/BSD`
/// and `COPYING` → `license/COPYING`.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct UnidicArtifactLicenseReference {
    /// Human-readable reference label.
    pub label: String,
    /// Bundle-relative file path.
    pub path: String,
}
306
/// Portable YAML representation of a UniDic reading index.
///
/// `UnidicReadingIndex::from_artifact_payload` passes the payload to
/// `validate_artifact_payload_header` and checks the entry count against a
/// limit of 2,000,000 before validating individual entries.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct UnidicReadingIndexPayload {
    /// Payload schema version.
    pub schema_version: u32,
    /// Payload type identifier.
    pub payload_type: String,
    /// Surface entries and readings.
    pub entries: Vec<UnidicReadingIndexPayloadEntry>,
}
317
/// One surface entry in a UniDic reading-index payload.
///
/// `UnidicReadingIndex::from_artifact_payload` rejects entries with an empty
/// surface, no readings, an empty reading, or a repeated reading, and rejects a
/// surface that appears in more than one entry.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct UnidicReadingIndexPayloadEntry {
    /// Surface form.
    pub surface: String,
    /// Readings associated with the surface form.
    pub readings: Vec<String>,
}
326
/// Inputs used to generate artifact metadata for an index.
///
/// Unlike the metadata structs, this type is not serde-enabled; it carries the
/// typed [`UnidicIndexOptions`] and [`DictionaryReadingOptions`] values.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct UnidicArtifactMetadataOptions {
    /// Human-readable artifact name.
    pub artifact_name: String,
    /// Tool or command that generated the artifact.
    pub generator: String,
    /// Bundle-relative payload file name.
    pub payload_file_name: String,
    /// Payload serialization format.
    pub payload_format: String,
    /// Source dictionary name.
    pub source_name: String,
    /// Source dictionary version.
    pub source_version: String,
    /// Source `lex.csv` path.
    pub source_lex_csv: String,
    /// Index build settings.
    pub index_options: UnidicIndexOptions,
    /// Default query settings.
    pub query_defaults: DictionaryReadingOptions,
    /// License metadata and references.
    pub license: UnidicArtifactLicense,
}
351
/// Options used while building a UniDic reading index.
///
/// The `Default` value reads the `LForm` field, applies no per-surface cap, and
/// excludes both ASCII-only surfaces and symbol part-of-speech entries.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct UnidicIndexOptions {
    /// UniDic CSV field used as the source reading.
    pub reading_field: UnidicReadingField,
    /// Optional cap on readings stored for each surface form.
    pub max_readings_per_surface: Option<usize>,
    /// Exclude ASCII-only dictionary surfaces.
    pub exclude_ascii_surfaces: bool,
    /// Exclude entries whose coarse part of speech is a symbol.
    pub exclude_symbol_pos: bool,
}
364
365impl UnidicReadingField {
366    fn column(self) -> usize {
367        match self {
368            Self::LForm => LFORM_COLUMN,
369            Self::Pron => PRON_COLUMN,
370        }
371    }
372
373    /// Returns the stable artifact string for this reading field.
374    pub fn as_str(self) -> &'static str {
375        match self {
376            Self::LForm => "lform",
377            Self::Pron => "pron",
378        }
379    }
380}
381
382impl Default for UnidicIndexOptions {
383    fn default() -> Self {
384        Self {
385            reading_field: UnidicReadingField::LForm,
386            max_readings_per_surface: None,
387            exclude_ascii_surfaces: true,
388            exclude_symbol_pos: true,
389        }
390    }
391}
392
393impl Default for DictionaryReadingOptions {
394    fn default() -> Self {
395        Self {
396            max_span_chars: 8,
397            max_paths: 1024,
398            longest_match_only: false,
399            max_readings_per_segment: None,
400        }
401    }
402}
403
404impl Default for UnidicArtifactLicense {
405    fn default() -> Self {
406        Self {
407            selected_license: "BSD-3-Clause".to_string(),
408            references: vec![
409                UnidicArtifactLicenseReference {
410                    label: "BSD".to_string(),
411                    path: "license/BSD".to_string(),
412                },
413                UnidicArtifactLicenseReference {
414                    label: "COPYING".to_string(),
415                    path: "license/COPYING".to_string(),
416                },
417            ],
418        }
419    }
420}
421
/// Errors returned while reading UniDic CSV resources.
///
/// `From` conversions exist for `csv::Error` and `std::io::Error`, so both can
/// be propagated with `?` inside functions returning this error type.
#[derive(Debug)]
pub enum UnidicCsvError {
    /// CSV parser error.
    Csv(csv::Error),
    /// Filesystem or reader error.
    Io(std::io::Error),
    /// A required CSV column was missing.
    MissingColumn {
        /// Zero-based record index.
        record_index: u64,
        /// Required column index.
        column: usize,
        /// Number of columns in the record.
        len: usize,
    },
}
439
/// Errors returned while reading or validating UniDic artifact payloads.
///
/// `From` conversions exist for `std::io::Error` and `serde_yaml::Error`. The
/// `Error::source` implementation exposes the wrapped error for `Io`, `Yaml`,
/// `InvalidBinaryUtf8`, and `InvalidIndexedUtf8`; every other variant reports
/// no source.
#[derive(Debug)]
pub enum UnidicArtifactPayloadError {
    /// Filesystem or reader error.
    Io(std::io::Error),
    /// YAML parser error.
    Yaml(serde_yaml::Error),
    /// Binary payload magic did not match the expected value.
    InvalidBinaryMagic {
        /// Magic bytes read from the payload.
        magic: [u8; 8],
    },
    /// Binary payload version is not supported.
    UnsupportedBinaryVersion {
        /// Version read from the payload.
        version: u32,
    },
    /// Reserved binary header field was non-zero.
    NonZeroBinaryReserved {
        /// Reserved value read from the payload.
        value: u32,
    },
    /// Binary payload ended before a field could be read.
    TruncatedBinary {
        /// Field being read.
        field: &'static str,
    },
    /// Binary payload contained invalid UTF-8.
    InvalidBinaryUtf8 {
        /// Field being decoded.
        field: &'static str,
        /// UTF-8 conversion error.
        source: FromUtf8Error,
    },
    /// Binary field length exceeded supported bounds.
    BinaryValueTooLarge {
        /// Field being read.
        field: &'static str,
        /// Field length.
        len: usize,
    },
    /// Binary payload entry count exceeded supported bounds.
    BinaryEntryCountTooLarge {
        /// Entry count read from the payload.
        entries: u64,
    },
    /// Artifact payload exceeded a configured safety limit.
    ArtifactLimitExceeded {
        /// Field whose length or count exceeded the limit.
        field: &'static str,
        /// Observed length or count.
        len: u64,
        /// Maximum allowed length or count.
        max: u64,
    },
    /// Indexed payload magic did not match the expected value.
    InvalidIndexedMagic {
        /// Magic bytes read from the payload.
        magic: [u8; 8],
    },
    /// Indexed payload version is not supported.
    UnsupportedIndexedVersion {
        /// Version read from the payload.
        version: u32,
    },
    /// Reserved indexed header field was non-zero.
    NonZeroIndexedReserved {
        /// Reserved value read from the payload.
        value: u32,
    },
    /// Indexed payload ended before a section could be read.
    TruncatedIndexed {
        /// Field or section being read.
        field: &'static str,
    },
    /// Indexed payload contained an invalid FST section.
    InvalidIndexedFst {
        /// FST error message.
        message: String,
    },
    /// Indexed payload section length exceeded supported bounds.
    IndexedSectionTooLarge {
        /// Section name.
        field: &'static str,
        /// Section length.
        len: u64,
    },
    /// Indexed payload referenced an invalid readings offset.
    InvalidIndexedOffset {
        /// Offset read from the FST value.
        offset: u64,
    },
    /// Indexed payload contained invalid UTF-8.
    InvalidIndexedUtf8 {
        /// Field being decoded.
        field: &'static str,
        /// UTF-8 conversion error.
        source: std::str::Utf8Error,
    },
    /// Indexed header entry count disagreed with the FST entry count.
    IndexedEntryCountMismatch {
        /// Entry count recorded in the header.
        header_entries: usize,
        /// Entry count decoded from the FST.
        fst_entries: usize,
    },
    /// YAML payload schema version is not supported.
    UnsupportedSchemaVersion {
        /// Version read from the payload.
        version: u32,
    },
    /// YAML payload type is not a UniDic reading index.
    UnsupportedPayloadType {
        /// Payload type read from the payload.
        payload_type: String,
    },
    /// Payload entry had an empty surface form.
    EmptySurface {
        /// Zero-based entry index.
        entry_index: usize,
    },
    /// Surface form appeared more than once.
    DuplicateSurface {
        /// Duplicated surface form.
        surface: String,
    },
    /// Payload entry had no readings.
    EmptyReadings {
        /// Surface form for the invalid entry.
        surface: String,
    },
    /// Payload entry contained an empty reading.
    EmptyReading {
        /// Surface form for the invalid entry.
        surface: String,
        /// Zero-based reading index.
        reading_index: usize,
    },
    /// Payload entry contained the same reading more than once.
    DuplicateReading {
        /// Surface form for the invalid entry.
        surface: String,
        /// Duplicated reading.
        reading: String,
    },
}
586
587impl fmt::Display for UnidicCsvError {
588    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
589        match self {
590            Self::Csv(err) => write!(f, "invalid UniDic CSV: {err}"),
591            Self::Io(err) => write!(f, "failed to read UniDic CSV: {err}"),
592            Self::MissingColumn {
593                record_index,
594                column,
595                len,
596            } => write!(
597                f,
598                "UniDic CSV record {record_index} has no column {column}; record has {len} columns"
599            ),
600        }
601    }
602}
603
604impl Error for UnidicCsvError {}
605
impl fmt::Display for UnidicArtifactPayloadError {
    /// Writes a one-line, human-readable description of the error.
    ///
    /// Wrapped errors are interpolated with their own `Display` text, while
    /// magic bytes and string payloads (surfaces, readings, payload types) are
    /// rendered with `{:?}` so they appear in debug-quoted form.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            // Wrapped I/O and YAML errors.
            Self::Io(err) => write!(f, "failed to read UniDic artifact payload: {err}"),
            Self::Yaml(err) => write!(f, "invalid UniDic artifact payload YAML: {err}"),
            // Binary payload format errors.
            Self::InvalidBinaryMagic { magic } => {
                write!(f, "invalid UniDic binary artifact magic {magic:?}")
            }
            Self::UnsupportedBinaryVersion { version } => {
                write!(f, "unsupported UniDic binary artifact version {version}")
            }
            Self::NonZeroBinaryReserved { value } => {
                write!(f, "UniDic binary artifact reserved header field is {value}")
            }
            Self::TruncatedBinary { field } => {
                write!(f, "truncated UniDic binary artifact while reading {field}")
            }
            Self::InvalidBinaryUtf8 { field, source } => {
                write!(f, "invalid UTF-8 in UniDic binary artifact {field}: {source}")
            }
            Self::BinaryValueTooLarge { field, len } => write!(
                f,
                "UniDic binary artifact {field} length {len} exceeds u32::MAX"
            ),
            Self::BinaryEntryCountTooLarge { entries } => write!(
                f,
                "UniDic binary artifact entry count {entries} exceeds usize::MAX"
            ),
            // Safety-limit violations shared by the payload formats.
            Self::ArtifactLimitExceeded { field, len, max } => write!(
                f,
                "UniDic artifact {field} length/count {len} exceeds limit {max}"
            ),
            // Indexed payload format errors.
            Self::InvalidIndexedMagic { magic } => {
                write!(f, "invalid UniDic indexed artifact magic {magic:?}")
            }
            Self::UnsupportedIndexedVersion { version } => {
                write!(f, "unsupported UniDic indexed artifact version {version}")
            }
            Self::NonZeroIndexedReserved { value } => {
                write!(f, "UniDic indexed artifact reserved header field is {value}")
            }
            Self::TruncatedIndexed { field } => {
                write!(f, "truncated UniDic indexed artifact while reading {field}")
            }
            Self::InvalidIndexedFst { message } => {
                write!(f, "invalid UniDic indexed artifact FST: {message}")
            }
            Self::IndexedSectionTooLarge { field, len } => write!(
                f,
                "UniDic indexed artifact {field} length {len} exceeds usize::MAX"
            ),
            Self::InvalidIndexedOffset { offset } => {
                write!(f, "invalid UniDic indexed artifact readings offset {offset}")
            }
            Self::InvalidIndexedUtf8 { field, source } => {
                write!(f, "invalid UTF-8 in UniDic indexed artifact {field}: {source}")
            }
            Self::IndexedEntryCountMismatch {
                header_entries,
                fst_entries,
            } => write!(
                f,
                "UniDic indexed artifact header entry count {header_entries} does not match FST entry count {fst_entries}"
            ),
            // Payload header and entry validation errors.
            Self::UnsupportedSchemaVersion { version } => write!(
                f,
                "unsupported UniDic artifact payload schema version {version}"
            ),
            Self::UnsupportedPayloadType { payload_type } => {
                write!(f, "unsupported UniDic artifact payload type {payload_type:?}")
            }
            Self::EmptySurface { entry_index } => write!(
                f,
                "UniDic artifact payload entry {entry_index} has an empty surface"
            ),
            Self::DuplicateSurface { surface } => {
                write!(f, "UniDic artifact payload has duplicate surface {surface:?}")
            }
            Self::EmptyReadings { surface } => write!(
                f,
                "UniDic artifact payload surface {surface:?} has no readings"
            ),
            Self::EmptyReading {
                surface,
                reading_index,
            } => write!(
                f,
                "UniDic artifact payload surface {surface:?} has an empty reading at index {reading_index}"
            ),
            Self::DuplicateReading { surface, reading } => write!(
                f,
                "UniDic artifact payload surface {surface:?} has duplicate reading {reading:?}"
            ),
        }
    }
}
702
703impl Error for UnidicArtifactPayloadError {
704    fn source(&self) -> Option<&(dyn Error + 'static)> {
705        match self {
706            Self::Io(err) => Some(err),
707            Self::Yaml(err) => Some(err),
708            Self::InvalidBinaryUtf8 { source, .. } => Some(source),
709            Self::InvalidIndexedUtf8 { source, .. } => Some(source),
710            _ => None,
711        }
712    }
713}
714
715impl From<csv::Error> for UnidicCsvError {
716    fn from(err: csv::Error) -> Self {
717        Self::Csv(err)
718    }
719}
720
721impl From<std::io::Error> for UnidicCsvError {
722    fn from(err: std::io::Error) -> Self {
723        Self::Io(err)
724    }
725}
726
727impl From<std::io::Error> for UnidicArtifactPayloadError {
728    fn from(err: std::io::Error) -> Self {
729        Self::Io(err)
730    }
731}
732
733impl From<serde_yaml::Error> for UnidicArtifactPayloadError {
734    fn from(err: serde_yaml::Error) -> Self {
735        Self::Yaml(err)
736    }
737}
738
739impl UnidicReadingIndex {
740    /// Builds an index from a UniDic `lex.csv` file.
741    pub fn from_lex_csv_path(path: impl AsRef<Path>) -> Result<Self, UnidicCsvError> {
742        Self::from_lex_csv_path_with_options(path, UnidicIndexOptions::default())
743    }
744
745    /// Builds an index from a UniDic `lex.csv` file using a specific reading field.
746    pub fn from_lex_csv_path_with_field(
747        path: impl AsRef<Path>,
748        field: UnidicReadingField,
749    ) -> Result<Self, UnidicCsvError> {
750        Self::from_lex_csv_path_with_options(
751            path,
752            UnidicIndexOptions {
753                reading_field: field,
754                ..UnidicIndexOptions::default()
755            },
756        )
757    }
758
759    /// Builds an index from a UniDic `lex.csv` file with custom options.
760    pub fn from_lex_csv_path_with_options(
761        path: impl AsRef<Path>,
762        options: UnidicIndexOptions,
763    ) -> Result<Self, UnidicCsvError> {
764        let file = File::open(path)?;
765        Self::from_lex_csv_reader_with_options(file, options)
766    }
767
768    /// Builds an index from a reader containing UniDic `lex.csv` data.
769    pub fn from_lex_csv_reader(reader: impl Read) -> Result<Self, UnidicCsvError> {
770        Self::from_lex_csv_reader_with_options(reader, UnidicIndexOptions::default())
771    }
772
773    /// Builds an index from a UniDic `lex.csv` reader using a specific reading field.
774    pub fn from_lex_csv_reader_with_field(
775        reader: impl Read,
776        reading_field: UnidicReadingField,
777    ) -> Result<Self, UnidicCsvError> {
778        Self::from_lex_csv_reader_with_options(
779            reader,
780            UnidicIndexOptions {
781                reading_field,
782                ..UnidicIndexOptions::default()
783            },
784        )
785    }
786
787    /// Builds an index from a UniDic `lex.csv` reader with custom options.
788    pub fn from_lex_csv_reader_with_options(
789        reader: impl Read,
790        options: UnidicIndexOptions,
791    ) -> Result<Self, UnidicCsvError> {
792        let mut by_surface = HashMap::<String, BTreeSet<String>>::new();
793        for record in lex_csv_reader(reader).records() {
794            let record = record?;
795            let surface = field(&record, SURFACE_COLUMN)?;
796            let reading = field(&record, options.reading_field.column())?;
797
798            if surface == "*" || reading == "*" {
799                continue;
800            }
801            if options.exclude_ascii_surfaces && surface.is_ascii() {
802                continue;
803            }
804            if options.exclude_symbol_pos && is_symbol_pos(field(&record, POS1_COLUMN)?) {
805                continue;
806            }
807
808            by_surface
809                .entry(surface.to_string())
810                .or_default()
811                .insert(reading.to_string());
812        }
813
814        let readings_by_surface = by_surface
815            .into_iter()
816            .map(|(surface, readings)| {
817                let mut readings = readings.into_iter().collect::<Vec<_>>();
818                if let Some(max_readings) = options.max_readings_per_surface {
819                    readings.truncate(max_readings);
820                }
821                (surface, readings)
822            })
823            .filter(|(_, readings)| !readings.is_empty())
824            .collect();
825
826        Ok(Self::from_readings_by_surface(readings_by_surface))
827    }
828
829    /// Loads a YAML artifact payload from a file path.
830    pub fn from_artifact_payload_path(
831        path: impl AsRef<Path>,
832    ) -> Result<Self, UnidicArtifactPayloadError> {
833        let path = path.as_ref();
834        check_payload_file_size(path)?;
835        let file = File::open(path)?;
836        Self::from_artifact_payload_reader(file)
837    }
838
839    /// Loads a YAML artifact payload from a reader.
840    pub fn from_artifact_payload_reader(
841        reader: impl Read,
842    ) -> Result<Self, UnidicArtifactPayloadError> {
843        let payload = serde_yaml::from_reader(reader)?;
844        Self::from_artifact_payload(payload)
845    }
846
847    /// Builds an index from a deserialized artifact payload.
848    pub fn from_artifact_payload(
849        payload: UnidicReadingIndexPayload,
850    ) -> Result<Self, UnidicArtifactPayloadError> {
851        validate_artifact_payload_header(&payload)?;
852        check_limit("entry_count", payload.entries.len(), MAX_ARTIFACT_ENTRIES)?;
853
854        let mut readings_by_surface = HashMap::new();
855        for (entry_index, entry) in payload.entries.into_iter().enumerate() {
856            check_limit(
857                "surface_bytes",
858                entry.surface.len(),
859                MAX_ARTIFACT_STRING_BYTES,
860            )?;
861            check_limit(
862                "reading_count",
863                entry.readings.len(),
864                MAX_ARTIFACT_READINGS_PER_ENTRY,
865            )?;
866            if entry.surface.is_empty() {
867                return Err(UnidicArtifactPayloadError::EmptySurface { entry_index });
868            }
869            if entry.readings.is_empty() {
870                return Err(UnidicArtifactPayloadError::EmptyReadings {
871                    surface: entry.surface,
872                });
873            }
874
875            let mut seen_readings = BTreeSet::new();
876            for (reading_index, reading) in entry.readings.iter().enumerate() {
877                check_limit("reading_bytes", reading.len(), MAX_ARTIFACT_STRING_BYTES)?;
878                if reading.is_empty() {
879                    return Err(UnidicArtifactPayloadError::EmptyReading {
880                        surface: entry.surface,
881                        reading_index,
882                    });
883                }
884                if !seen_readings.insert(reading) {
885                    return Err(UnidicArtifactPayloadError::DuplicateReading {
886                        surface: entry.surface,
887                        reading: reading.clone(),
888                    });
889                }
890            }
891
892            if readings_by_surface
893                .insert(entry.surface.clone(), entry.readings)
894                .is_some()
895            {
896                return Err(UnidicArtifactPayloadError::DuplicateSurface {
897                    surface: entry.surface,
898                });
899            }
900        }
901
902        Ok(Self::from_readings_by_surface(readings_by_surface))
903    }
904
905    /// Loads a binary artifact payload from a file path.
906    pub fn from_binary_artifact_payload_path(
907        path: impl AsRef<Path>,
908    ) -> Result<Self, UnidicArtifactPayloadError> {
909        let path = path.as_ref();
910        check_payload_file_size(path)?;
911        let file = File::open(path)?;
912        Self::from_binary_artifact_payload_reader(file)
913    }
914
915    /// Loads a binary artifact payload from a reader.
916    pub fn from_binary_artifact_payload_reader(
917        mut reader: impl Read,
918    ) -> Result<Self, UnidicArtifactPayloadError> {
919        let header = read_binary_artifact_payload_header(&mut reader)?;
920        check_limit("entry_count", header.entries, MAX_ARTIFACT_ENTRIES)?;
921        let mut entries = Vec::with_capacity(header.entries);
922        for _ in 0..header.entries {
923            let surface = read_binary_string(&mut reader, "surface")?;
924            let reading_count = read_u32_le(&mut reader, "reading_count")?;
925            let reading_count = usize::try_from(reading_count).expect("u32 fits usize");
926            check_limit(
927                "reading_count",
928                reading_count,
929                MAX_ARTIFACT_READINGS_PER_ENTRY,
930            )?;
931            let mut readings = Vec::with_capacity(reading_count);
932            for _ in 0..reading_count {
933                readings.push(read_binary_string(&mut reader, "reading")?);
934            }
935            entries.push(UnidicReadingIndexPayloadEntry { surface, readings });
936        }
937
938        Self::from_artifact_payload(UnidicReadingIndexPayload {
939            schema_version: ARTIFACT_PAYLOAD_SCHEMA_VERSION,
940            payload_type: ARTIFACT_PAYLOAD_TYPE.to_string(),
941            entries,
942        })
943    }
944
945    /// Loads an indexed FST artifact payload from a file path.
946    pub fn from_indexed_artifact_payload_path(
947        path: impl AsRef<Path>,
948    ) -> Result<Self, UnidicArtifactPayloadError> {
949        let path = path.as_ref();
950        check_payload_file_size(path)?;
951        let file = File::open(path)?;
952        // SAFETY: the mmap is kept alive by IndexedUnidicPayload for as long as
953        // any offsets or slices derived from it can be used.
954        let mmap = unsafe { Mmap::map(&file)? };
955        Self::from_indexed_mmap(mmap)
956    }
957
958    /// Loads an indexed artifact payload from bytes.
959    ///
960    /// This eagerly materializes the indexed payload and is intended for
961    /// environments such as WebAssembly where mmap-backed loading is not
962    /// available.
963    ///
964    /// # Errors
965    ///
966    /// Returns an error when the payload is too large, malformed, truncated,
967    /// has an invalid FST section, or fails canonical artifact validation.
968    pub fn from_indexed_artifact_payload_bytes(
969        bytes: &[u8],
970    ) -> Result<Self, UnidicArtifactPayloadError> {
971        if bytes.len() as u64 > MAX_ARTIFACT_PAYLOAD_BYTES {
972            return Err(UnidicArtifactPayloadError::ArtifactLimitExceeded {
973                field: "payload_bytes",
974                len: bytes.len() as u64,
975                max: MAX_ARTIFACT_PAYLOAD_BYTES,
976            });
977        }
978        let header = read_indexed_artifact_payload_header_bytes(bytes)?;
979        let fst_start = INDEXED_ARTIFACT_HEADER_LEN;
980        let fst_end = fst_start.checked_add(header.fst_len).ok_or(
981            UnidicArtifactPayloadError::TruncatedIndexed {
982                field: "fst_section",
983            },
984        )?;
985        let readings_end = fst_end.checked_add(header.readings_len).ok_or(
986            UnidicArtifactPayloadError::TruncatedIndexed {
987                field: "readings_section",
988            },
989        )?;
990        if bytes.len() < readings_end {
991            return Err(UnidicArtifactPayloadError::TruncatedIndexed {
992                field: "indexed_payload",
993            });
994        }
995
996        let map = Map::new(bytes[fst_start..fst_end].to_vec()).map_err(|err| {
997            UnidicArtifactPayloadError::InvalidIndexedFst {
998                message: err.to_string(),
999            }
1000        })?;
1001        let fst_entries = map.len();
1002        if fst_entries != header.entries {
1003            return Err(UnidicArtifactPayloadError::IndexedEntryCountMismatch {
1004                header_entries: header.entries,
1005                fst_entries,
1006            });
1007        }
1008
1009        let mut entries = Vec::with_capacity(header.entries);
1010        let mut stream = map.stream();
1011        while let Some((surface, offset)) = stream.next() {
1012            let surface = std::str::from_utf8(surface)
1013                .map_err(|source| UnidicArtifactPayloadError::InvalidIndexedUtf8 {
1014                    field: "surface",
1015                    source,
1016                })?
1017                .to_string();
1018            let readings = read_indexed_readings_at_bytes(bytes, fst_end, offset)?;
1019            entries.push(UnidicReadingIndexPayloadEntry { surface, readings });
1020        }
1021
1022        Self::from_artifact_payload(UnidicReadingIndexPayload {
1023            schema_version: ARTIFACT_PAYLOAD_SCHEMA_VERSION,
1024            payload_type: ARTIFACT_PAYLOAD_TYPE.to_string(),
1025            entries,
1026        })
1027    }
1028
1029    fn from_indexed_mmap(mmap: Mmap) -> Result<Self, UnidicArtifactPayloadError> {
1030        if mmap.len() as u64 > MAX_ARTIFACT_PAYLOAD_BYTES {
1031            return Err(UnidicArtifactPayloadError::ArtifactLimitExceeded {
1032                field: "payload_bytes",
1033                len: mmap.len() as u64,
1034                max: MAX_ARTIFACT_PAYLOAD_BYTES,
1035            });
1036        }
1037        let header = read_indexed_artifact_payload_header_bytes(&mmap)?;
1038        let fst_start = INDEXED_ARTIFACT_HEADER_LEN;
1039        let fst_end = fst_start.checked_add(header.fst_len).ok_or(
1040            UnidicArtifactPayloadError::TruncatedIndexed {
1041                field: "fst_section",
1042            },
1043        )?;
1044        let readings_end = fst_end.checked_add(header.readings_len).ok_or(
1045            UnidicArtifactPayloadError::TruncatedIndexed {
1046                field: "readings_section",
1047            },
1048        )?;
1049        if mmap.len() < readings_end {
1050            return Err(UnidicArtifactPayloadError::TruncatedIndexed {
1051                field: "indexed_payload",
1052            });
1053        }
1054
1055        let map = Map::new(mmap[fst_start..fst_end].to_vec()).map_err(|err| {
1056            UnidicArtifactPayloadError::InvalidIndexedFst {
1057                message: err.to_string(),
1058            }
1059        })?;
1060        let fst_entries = map.len();
1061        if fst_entries != header.entries {
1062            return Err(UnidicArtifactPayloadError::IndexedEntryCountMismatch {
1063                header_entries: header.entries,
1064                fst_entries,
1065            });
1066        }
1067
1068        let indexed = IndexedUnidicPayload {
1069            mmap: Arc::new(mmap),
1070            map,
1071            readings_start: fst_end,
1072            entries: header.entries,
1073        };
1074        indexed.validate()?;
1075        Ok(Self {
1076            storage: UnidicReadingStorage::Indexed(indexed),
1077        })
1078    }
1079
1080    /// Reads only the header from a binary artifact payload file.
1081    pub fn binary_artifact_payload_header_path(
1082        path: impl AsRef<Path>,
1083    ) -> Result<UnidicBinaryArtifactPayloadHeader, UnidicArtifactPayloadError> {
1084        let file = File::open(path)?;
1085        Self::binary_artifact_payload_header_reader(file)
1086    }
1087
1088    /// Reads only the header from a binary artifact payload reader.
1089    pub fn binary_artifact_payload_header_reader(
1090        mut reader: impl Read,
1091    ) -> Result<UnidicBinaryArtifactPayloadHeader, UnidicArtifactPayloadError> {
1092        read_binary_artifact_payload_header(&mut reader)
1093    }
1094
1095    fn from_readings_by_surface(readings_by_surface: HashMap<String, Vec<String>>) -> Self {
1096        Self {
1097            storage: UnidicReadingStorage::Eager(readings_by_surface),
1098        }
1099    }
1100
1101    /// Returns readings for `surface`, if present.
1102    ///
1103    /// For indexed artifacts, decode errors are treated the same as a missing
1104    /// surface for backward compatibility. Use [`Self::try_readings`] at trust
1105    /// boundaries when artifact corruption must be reported distinctly.
1106    pub fn readings(&self, surface: &str) -> Option<Cow<'_, [String]>> {
1107        self.try_readings(surface).ok().flatten()
1108    }
1109
1110    /// Returns readings for `surface` and preserves indexed artifact decode
1111    /// errors.
1112    pub fn try_readings(
1113        &self,
1114        surface: &str,
1115    ) -> Result<Option<Cow<'_, [String]>>, UnidicArtifactPayloadError> {
1116        match &self.storage {
1117            UnidicReadingStorage::Eager(readings_by_surface) => Ok(readings_by_surface
1118                .get(surface)
1119                .map(|readings| Cow::Borrowed(readings.as_slice()))),
1120            UnidicReadingStorage::Indexed(indexed) => indexed
1121                .readings(surface)
1122                .map(|readings| readings.map(Cow::Owned)),
1123        }
1124    }
1125
1126    /// Returns the number of indexed surface forms.
1127    pub fn len(&self) -> usize {
1128        match &self.storage {
1129            UnidicReadingStorage::Eager(readings_by_surface) => readings_by_surface.len(),
1130            UnidicReadingStorage::Indexed(indexed) => indexed.entries,
1131        }
1132    }
1133
1134    /// Returns `true` when the index contains no surface forms.
1135    pub fn is_empty(&self) -> bool {
1136        self.len() == 0
1137    }
1138
1139    /// Builds bundle metadata for the current index and caller-provided
1140    /// provenance.
1141    ///
1142    /// The returned metadata includes a canonical payload checksum computed
1143    /// from the normalized payload view.
1144    pub fn artifact_metadata(
1145        &self,
1146        options: UnidicArtifactMetadataOptions,
1147    ) -> UnidicArtifactMetadata {
1148        UnidicArtifactMetadata {
1149            schema_version: 1,
1150            artifact_type: "moine.unidic.reading-index".to_string(),
1151            artifact_name: options.artifact_name,
1152            generator: options.generator,
1153            payload: UnidicArtifactPayload {
1154                path: options.payload_file_name,
1155                format: options.payload_format,
1156                file_digest_algorithm: None,
1157                file_digest: None,
1158                checksum_algorithm: ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM.to_string(),
1159                checksum: self.artifact_payload_checksum(),
1160            },
1161            source: UnidicArtifactSource {
1162                name: options.source_name,
1163                version: options.source_version,
1164                lex_csv: options.source_lex_csv,
1165            },
1166            build: UnidicArtifactBuild {
1167                reading_field: options.index_options.reading_field.as_str().to_string(),
1168                max_readings_per_surface: options.index_options.max_readings_per_surface,
1169                exclude_ascii_surfaces: options.index_options.exclude_ascii_surfaces,
1170                exclude_symbol_pos: options.index_options.exclude_symbol_pos,
1171                entries: self.len(),
1172            },
1173            query_defaults: UnidicArtifactQueryDefaults {
1174                max_span_chars: options.query_defaults.max_span_chars,
1175                max_paths: options.query_defaults.max_paths,
1176                longest_match_only: options.query_defaults.longest_match_only,
1177                max_readings_per_segment: options.query_defaults.max_readings_per_segment,
1178            },
1179            license: options.license,
1180        }
1181    }
1182
1183    /// Returns the normalized YAML-compatible payload view for this index.
1184    ///
1185    /// Entries are sorted by surface form so serialization and checksums are
1186    /// deterministic regardless of the index storage backend.
1187    pub fn artifact_payload(&self) -> UnidicReadingIndexPayload {
1188        let entries = match &self.storage {
1189            UnidicReadingStorage::Eager(readings_by_surface) => {
1190                let mut entries = readings_by_surface
1191                    .iter()
1192                    .map(|(surface, readings)| UnidicReadingIndexPayloadEntry {
1193                        surface: surface.clone(),
1194                        readings: readings.clone(),
1195                    })
1196                    .collect::<Vec<_>>();
1197                entries.sort_by(|left, right| left.surface.cmp(&right.surface));
1198                entries
1199            }
1200            UnidicReadingStorage::Indexed(indexed) => indexed
1201                .entries()
1202                .expect("validated indexed artifact should decode"),
1203        };
1204
1205        UnidicReadingIndexPayload {
1206            schema_version: ARTIFACT_PAYLOAD_SCHEMA_VERSION,
1207            payload_type: ARTIFACT_PAYLOAD_TYPE.to_string(),
1208            entries,
1209        }
1210    }
1211
1212    /// Returns the canonical checksum for the normalized payload.
1213    pub fn artifact_payload_checksum(&self) -> String {
1214        self.artifact_payload_checksum_for_algorithm(ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM)
1215            .expect("default artifact checksum algorithm should be supported")
1216    }
1217
1218    /// Returns a canonical payload checksum for `algorithm`.
1219    ///
1220    /// Supported values are [`ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM`] and
1221    /// [`LEGACY_ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM`]. Unknown algorithms return
1222    /// `None`.
1223    pub fn artifact_payload_checksum_for_algorithm(&self, algorithm: &str) -> Option<String> {
1224        let payload = self.artifact_payload();
1225        let bytes = canonical_payload_bytes(&payload);
1226        match algorithm {
1227            ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM => Some(sha256_hex(&bytes)),
1228            LEGACY_ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM => Some(format!("{:016x}", fnv1a64(&bytes))),
1229            _ => None,
1230        }
1231    }
1232
1233    /// Writes the legacy binary artifact payload format.
1234    ///
1235    /// Prefer [`Self::write_indexed_artifact_payload`] for newly generated
1236    /// bundles; this format is kept for compatibility with older artifacts.
1237    pub fn write_artifact_binary_payload(
1238        &self,
1239        mut writer: impl Write,
1240    ) -> Result<(), UnidicArtifactPayloadError> {
1241        let payload = self.artifact_payload();
1242        writer.write_all(BINARY_ARTIFACT_MAGIC)?;
1243        writer.write_all(&BINARY_ARTIFACT_VERSION.to_le_bytes())?;
1244        writer.write_all(&0_u32.to_le_bytes())?;
1245        writer.write_all(&(payload.entries.len() as u64).to_le_bytes())?;
1246
1247        for entry in &payload.entries {
1248            write_binary_string(&mut writer, "surface", &entry.surface)?;
1249            write_u32_len(&mut writer, "reading_count", entry.readings.len())?;
1250            for reading in &entry.readings {
1251                write_binary_string(&mut writer, "reading", reading)?;
1252            }
1253        }
1254
1255        Ok(())
1256    }
1257
1258    /// Writes the indexed FST-backed artifact payload format.
1259    ///
1260    /// The payload stores a finite-state transducer from surface form to an
1261    /// offset in a compact reading blob and can be loaded with
1262    /// [`Self::from_indexed_artifact_payload_path`].
1263    pub fn write_indexed_artifact_payload(
1264        &self,
1265        mut writer: impl Write,
1266    ) -> Result<(), UnidicArtifactPayloadError> {
1267        let payload = self.artifact_payload();
1268        let mut fst_bytes = Vec::new();
1269        let mut readings_bytes = Vec::new();
1270        {
1271            let mut builder = MapBuilder::new(&mut fst_bytes).map_err(|err| {
1272                UnidicArtifactPayloadError::InvalidIndexedFst {
1273                    message: err.to_string(),
1274                }
1275            })?;
1276            for entry in &payload.entries {
1277                let offset = readings_bytes.len() as u64;
1278                builder.insert(&entry.surface, offset).map_err(|err| {
1279                    UnidicArtifactPayloadError::InvalidIndexedFst {
1280                        message: err.to_string(),
1281                    }
1282                })?;
1283                write_indexed_reading_block(&mut readings_bytes, &entry.readings)?;
1284            }
1285            builder
1286                .finish()
1287                .map_err(|err| UnidicArtifactPayloadError::InvalidIndexedFst {
1288                    message: err.to_string(),
1289                })?;
1290        }
1291
1292        writer.write_all(INDEXED_ARTIFACT_MAGIC)?;
1293        writer.write_all(&INDEXED_ARTIFACT_VERSION.to_le_bytes())?;
1294        writer.write_all(&0_u32.to_le_bytes())?;
1295        writer.write_all(&(payload.entries.len() as u64).to_le_bytes())?;
1296        writer.write_all(&(fst_bytes.len() as u64).to_le_bytes())?;
1297        writer.write_all(&(readings_bytes.len() as u64).to_le_bytes())?;
1298        writer.write_all(&fst_bytes)?;
1299        writer.write_all(&readings_bytes)?;
1300        Ok(())
1301    }
1302
1303    /// Expands `text` into joined kana reading strings.
1304    ///
1305    /// This is a compatibility helper over [`Self::reading_paths`]. It drops
1306    /// segment boundaries and treats indexed artifact decode errors as an empty
1307    /// expansion.
1308    pub fn reading_sequences(&self, text: &str, options: DictionaryReadingOptions) -> Vec<String> {
1309        self.reading_sequences_with_stats_inner(text, options, false)
1310            .unwrap_or_default()
1311            .paths
1312    }
1313
1314    /// Expands `text` into dictionary-only reading paths.
1315    ///
1316    /// Every returned path contains surface/reading segment boundaries plus the
1317    /// joined kana reading. Use [`Self::try_reading_paths_with_stats`] when
1318    /// indexed artifact corruption must be reported.
1319    pub fn reading_paths(
1320        &self,
1321        text: &str,
1322        options: DictionaryReadingOptions,
1323    ) -> Vec<DictionaryReadingPath> {
1324        self.reading_paths_with_stats(text, options).paths
1325    }
1326
1327    /// Expands dictionary reading paths and treats artifact decode errors as an
1328    /// empty expansion for backward compatibility.
1329    ///
1330    /// Use [`Self::try_reading_paths_with_stats`] when loading indexed
1331    /// artifacts from outside the process trust boundary.
1332    pub fn reading_paths_with_stats(
1333        &self,
1334        text: &str,
1335        options: DictionaryReadingOptions,
1336    ) -> DictionaryReadingExpansion {
1337        self.try_reading_paths_with_stats(text, options)
1338            .unwrap_or_default()
1339    }
1340
1341    /// Expands dictionary reading paths and preserves indexed artifact decode
1342    /// errors.
1343    pub fn try_reading_paths_with_stats(
1344        &self,
1345        text: &str,
1346        options: DictionaryReadingOptions,
1347    ) -> Result<DictionaryReadingExpansion, UnidicArtifactPayloadError> {
1348        self.reading_paths_with_stats_inner(text, options, false)
1349    }
1350
1351    /// Expands `text` into reading paths with direct fallback segments.
1352    ///
1353    /// Dictionary matches are preferred, but kana and ASCII spans can pass
1354    /// through directly so mixed dictionary/direct input can still form a full
1355    /// path.
1356    pub fn hybrid_reading_paths(
1357        &self,
1358        text: &str,
1359        options: DictionaryReadingOptions,
1360    ) -> Vec<DictionaryReadingPath> {
1361        self.hybrid_reading_paths_with_stats(text, options).paths
1362    }
1363
1364    /// Expands hybrid dictionary/direct reading paths and treats artifact
1365    /// decode errors as an empty expansion for backward compatibility.
1366    ///
1367    /// Use [`Self::try_hybrid_reading_paths_with_stats`] when loading indexed
1368    /// artifacts from outside the process trust boundary.
1369    pub fn hybrid_reading_paths_with_stats(
1370        &self,
1371        text: &str,
1372        options: DictionaryReadingOptions,
1373    ) -> DictionaryReadingExpansion {
1374        self.try_hybrid_reading_paths_with_stats(text, options)
1375            .unwrap_or_default()
1376    }
1377
1378    /// Expands hybrid dictionary/direct reading paths and preserves indexed
1379    /// artifact decode errors.
1380    pub fn try_hybrid_reading_paths_with_stats(
1381        &self,
1382        text: &str,
1383        options: DictionaryReadingOptions,
1384    ) -> Result<DictionaryReadingExpansion, UnidicArtifactPayloadError> {
1385        self.reading_paths_with_stats_inner(text, options, true)
1386    }
1387
1388    fn reading_paths_with_stats_inner(
1389        &self,
1390        text: &str,
1391        options: DictionaryReadingOptions,
1392        allow_direct_fallback: bool,
1393    ) -> Result<DictionaryReadingExpansion, UnidicArtifactPayloadError> {
1394        if text.is_empty() || options.max_span_chars == 0 || options.max_paths == 0 {
1395            return Ok(DictionaryReadingExpansion::default());
1396        }
1397
1398        let mut stats = DictionaryReadingStats::default();
1399        let boundaries = char_boundaries(text);
1400        let char_len = boundaries.len() - 1;
1401        let mut suffix_paths = vec![Vec::<DictionaryReadingPath>::new(); char_len + 1];
1402        suffix_paths[char_len].push(DictionaryReadingPath {
1403            segments: Vec::new(),
1404            joined_reading: String::new(),
1405        });
1406
1407        for start in (0..char_len).rev() {
1408            let mut paths_by_reading = std::collections::BTreeMap::new();
1409            let end_limit = char_len.min(start + options.max_span_chars);
1410            let mut matching_ends = Vec::new();
1411
1412            for end in start + 1..=end_limit {
1413                let surface = &text[boundaries[start]..boundaries[end]];
1414                if self.try_readings(surface)?.is_some() && !suffix_paths[end].is_empty() {
1415                    matching_ends.push(end);
1416                }
1417            }
1418            stats.matched_spans += matching_ends.len();
1419
1420            if options.longest_match_only && !allow_direct_fallback {
1421                if let Some(end) = matching_ends.last().copied() {
1422                    stats.longest_match_pruned_spans += matching_ends.len().saturating_sub(1);
1423                    matching_ends.clear();
1424                    matching_ends.push(end);
1425                }
1426            }
1427
1428            for end in matching_ends {
1429                let surface = &text[boundaries[start]..boundaries[end]];
1430                let Some(surface_readings) = self.try_readings(surface)? else {
1431                    continue;
1432                };
1433
1434                stats.raw_segment_readings += surface_readings.len();
1435                let raw_surface_reading_count = surface_readings.len();
1436                let surface_readings = limited_surface_readings(surface_readings.as_ref(), options);
1437                stats.used_segment_readings += surface_readings.len();
1438                stats.pruned_segment_readings += raw_surface_reading_count - surface_readings.len();
1439                for surface_reading in surface_readings {
1440                    for suffix in &suffix_paths[end] {
1441                        stats.candidate_combinations += 1;
1442                        let mut reading = String::with_capacity(
1443                            surface_reading.len() + suffix.joined_reading.len(),
1444                        );
1445                        reading.push_str(surface_reading);
1446                        reading.push_str(&suffix.joined_reading);
1447
1448                        let mut segments = Vec::with_capacity(suffix.segments.len() + 1);
1449                        segments.push(DictionaryReadingSegment {
1450                            surface: surface.to_string(),
1451                            reading: surface_reading.to_string(),
1452                        });
1453                        segments.extend(suffix.segments.iter().cloned());
1454
1455                        match paths_by_reading.entry(reading.clone()) {
1456                            Entry::Vacant(entry) => {
1457                                entry.insert(DictionaryReadingPath {
1458                                    segments,
1459                                    joined_reading: reading,
1460                                });
1461                                stats.unique_paths += 1;
1462                            }
1463                            Entry::Occupied(_) => {
1464                                stats.duplicate_joined_readings += 1;
1465                            }
1466                        }
1467
1468                        if paths_by_reading.len() >= options.max_paths {
1469                            stats.max_paths_hit_count += 1;
1470                            break;
1471                        }
1472                    }
1473
1474                    if paths_by_reading.len() >= options.max_paths {
1475                        break;
1476                    }
1477                }
1478
1479                if paths_by_reading.len() >= options.max_paths {
1480                    break;
1481                }
1482            }
1483
1484            if allow_direct_fallback && paths_by_reading.len() < options.max_paths {
1485                if let Some(end) = direct_fallback_end(text, &boundaries, start, char_len) {
1486                    if !suffix_paths[end].is_empty() {
1487                        stats.direct_fallback_spans += 1;
1488                        let surface = &text[boundaries[start]..boundaries[end]];
1489                        for suffix in &suffix_paths[end] {
1490                            stats.candidate_combinations += 1;
1491                            let mut reading =
1492                                String::with_capacity(surface.len() + suffix.joined_reading.len());
1493                            reading.push_str(surface);
1494                            reading.push_str(&suffix.joined_reading);
1495
1496                            let mut segments = Vec::with_capacity(suffix.segments.len() + 1);
1497                            segments.push(DictionaryReadingSegment {
1498                                surface: surface.to_string(),
1499                                reading: surface.to_string(),
1500                            });
1501                            segments.extend(suffix.segments.iter().cloned());
1502
1503                            match paths_by_reading.entry(reading.clone()) {
1504                                Entry::Vacant(entry) => {
1505                                    entry.insert(DictionaryReadingPath {
1506                                        segments,
1507                                        joined_reading: reading,
1508                                    });
1509                                    stats.unique_paths += 1;
1510                                }
1511                                Entry::Occupied(_) => {
1512                                    stats.duplicate_joined_readings += 1;
1513                                }
1514                            }
1515
1516                            if paths_by_reading.len() >= options.max_paths {
1517                                stats.max_paths_hit_count += 1;
1518                                break;
1519                            }
1520                        }
1521                    }
1522                }
1523            }
1524
1525            suffix_paths[start] = paths_by_reading.into_values().collect();
1526        }
1527
1528        Ok(DictionaryReadingExpansion {
1529            paths: suffix_paths.remove(0),
1530            stats,
1531        })
1532    }
1533
1534    fn reading_sequences_with_stats_inner(
1535        &self,
1536        text: &str,
1537        options: DictionaryReadingOptions,
1538        allow_direct_fallback: bool,
1539    ) -> Result<DictionaryReadingSequenceExpansion, UnidicArtifactPayloadError> {
1540        if text.is_empty() || options.max_span_chars == 0 || options.max_paths == 0 {
1541            return Ok(DictionaryReadingSequenceExpansion::default());
1542        }
1543
1544        let mut stats = DictionaryReadingStats::default();
1545        let boundaries = char_boundaries(text);
1546        let char_len = boundaries.len() - 1;
1547        let mut suffix_paths = vec![Vec::<String>::new(); char_len + 1];
1548        suffix_paths[char_len].push(String::new());
1549
1550        for start in (0..char_len).rev() {
1551            let mut paths_by_reading = BTreeSet::new();
1552            let end_limit = char_len.min(start + options.max_span_chars);
1553            let mut matching_ends = Vec::new();
1554
1555            for end in start + 1..=end_limit {
1556                let surface = &text[boundaries[start]..boundaries[end]];
1557                if self.try_readings(surface)?.is_some() && !suffix_paths[end].is_empty() {
1558                    matching_ends.push(end);
1559                }
1560            }
1561            stats.matched_spans += matching_ends.len();
1562
1563            if options.longest_match_only && !allow_direct_fallback {
1564                if let Some(end) = matching_ends.last().copied() {
1565                    stats.longest_match_pruned_spans += matching_ends.len().saturating_sub(1);
1566                    matching_ends.clear();
1567                    matching_ends.push(end);
1568                }
1569            }
1570
1571            for end in matching_ends {
1572                let surface = &text[boundaries[start]..boundaries[end]];
1573                let Some(surface_readings) = self.try_readings(surface)? else {
1574                    continue;
1575                };
1576
1577                stats.raw_segment_readings += surface_readings.len();
1578                let raw_surface_reading_count = surface_readings.len();
1579                let surface_readings = limited_surface_readings(surface_readings.as_ref(), options);
1580                stats.used_segment_readings += surface_readings.len();
1581                stats.pruned_segment_readings += raw_surface_reading_count - surface_readings.len();
1582                for surface_reading in surface_readings {
1583                    for suffix in &suffix_paths[end] {
1584                        stats.candidate_combinations += 1;
1585                        let mut reading =
1586                            String::with_capacity(surface_reading.len() + suffix.len());
1587                        reading.push_str(surface_reading);
1588                        reading.push_str(suffix);
1589
1590                        if paths_by_reading.insert(reading) {
1591                            stats.unique_paths += 1;
1592                        } else {
1593                            stats.duplicate_joined_readings += 1;
1594                        }
1595
1596                        if paths_by_reading.len() >= options.max_paths {
1597                            stats.max_paths_hit_count += 1;
1598                            break;
1599                        }
1600                    }
1601
1602                    if paths_by_reading.len() >= options.max_paths {
1603                        break;
1604                    }
1605                }
1606
1607                if paths_by_reading.len() >= options.max_paths {
1608                    break;
1609                }
1610            }
1611
1612            if allow_direct_fallback && paths_by_reading.len() < options.max_paths {
1613                if let Some(end) = direct_fallback_end(text, &boundaries, start, char_len) {
1614                    if !suffix_paths[end].is_empty() {
1615                        stats.direct_fallback_spans += 1;
1616                        let surface = &text[boundaries[start]..boundaries[end]];
1617                        for suffix in &suffix_paths[end] {
1618                            stats.candidate_combinations += 1;
1619                            let mut reading = String::with_capacity(surface.len() + suffix.len());
1620                            reading.push_str(surface);
1621                            reading.push_str(suffix);
1622
1623                            if paths_by_reading.insert(reading) {
1624                                stats.unique_paths += 1;
1625                            } else {
1626                                stats.duplicate_joined_readings += 1;
1627                            }
1628
1629                            if paths_by_reading.len() >= options.max_paths {
1630                                stats.max_paths_hit_count += 1;
1631                                break;
1632                            }
1633                        }
1634                    }
1635                }
1636            }
1637
1638            suffix_paths[start] = paths_by_reading.into_iter().collect();
1639        }
1640
1641        Ok(DictionaryReadingSequenceExpansion {
1642            paths: suffix_paths.remove(0),
1643            stats,
1644        })
1645    }
1646
1647    /// Builds a romaji lattice from dictionary-only readings of `text`.
1648    ///
1649    /// Returns `Ok(None)` when the dictionary cannot cover the entire input.
1650    /// Indexed artifact decode errors are reported as
1651    /// [`JaLatticeError::ArtifactPayload`].
1652    pub fn romaji_lattice(
1653        &self,
1654        text: &str,
1655        options: DictionaryReadingOptions,
1656    ) -> Result<Option<Lattice>, JaLatticeError> {
1657        let readings = self
1658            .reading_sequences_with_stats_inner(text, options, false)
1659            .map_err(|err| JaLatticeError::ArtifactPayload(err.to_string()))?;
1660        if readings.paths.is_empty() {
1661            return Ok(None);
1662        }
1663
1664        crate::romaji::romaji_lattice_from_readings(readings.paths).map(Some)
1665    }
1666
1667    /// Builds a romaji lattice with dictionary readings and direct fallback.
1668    ///
1669    /// This is the preferred lattice builder for mixed Japanese text where
1670    /// kana or ASCII spans may appear beside UniDic-backed surfaces.
1671    pub fn hybrid_romaji_lattice(
1672        &self,
1673        text: &str,
1674        options: DictionaryReadingOptions,
1675    ) -> Result<Option<Lattice>, JaLatticeError> {
1676        let readings = self
1677            .reading_sequences_with_stats_inner(text, options, true)
1678            .map_err(|err| JaLatticeError::ArtifactPayload(err.to_string()))?;
1679        if readings.paths.is_empty() {
1680            return Ok(None);
1681        }
1682
1683        crate::romaji::romaji_lattice_from_readings(readings.paths).map(Some)
1684    }
1685}
1686
1687#[derive(Clone, Debug, Default, Eq, PartialEq)]
1688struct DictionaryReadingSequenceExpansion {
1689    paths: Vec<String>,
1690    stats: DictionaryReadingStats,
1691}
1692
/// Returns the byte offset at which each character of `text` starts, followed
/// by `text.len()` as a final sentinel, so `result[i]..result[j]` slices the
/// characters `i..j`.
fn char_boundaries(text: &str) -> Vec<usize> {
    let mut offsets = Vec::new();
    for (offset, _) in text.char_indices() {
        offsets.push(offset);
    }
    offsets.push(text.len());
    offsets
}
1699
1700fn lex_csv_reader(reader: impl Read) -> csv::Reader<impl Read> {
1701    csv::ReaderBuilder::new()
1702        .has_headers(false)
1703        .flexible(true)
1704        .from_reader(reader)
1705}
1706
1707fn field(record: &csv::StringRecord, column: usize) -> Result<&str, UnidicCsvError> {
1708    record
1709        .get(column)
1710        .ok_or_else(|| UnidicCsvError::MissingColumn {
1711            record_index: record
1712                .position()
1713                .map(|position| position.record())
1714                .unwrap_or(0),
1715            column,
1716            len: record.len(),
1717        })
1718}
1719
/// Reports whether the part-of-speech label `pos1` contains the marker `記号`.
fn is_symbol_pos(pos1: &str) -> bool {
    pos1.find("記号").is_some()
}
1723
1724fn limited_surface_readings(readings: &[String], options: DictionaryReadingOptions) -> &[String] {
1725    if let Some(max_readings) = options.max_readings_per_segment {
1726        &readings[..readings.len().min(max_readings)]
1727    } else {
1728        readings
1729    }
1730}
1731
1732fn direct_fallback_end(
1733    text: &str,
1734    boundaries: &[usize],
1735    start: usize,
1736    char_len: usize,
1737) -> Option<usize> {
1738    let mut end = start;
1739    while end < char_len {
1740        let surface = &text[boundaries[start]..boundaries[end + 1]];
1741        if !can_build_romaji_paths(surface) {
1742            break;
1743        }
1744        end += 1;
1745    }
1746
1747    (end > start).then_some(end)
1748}
1749
1750fn write_binary_string(
1751    writer: &mut impl Write,
1752    field: &'static str,
1753    value: &str,
1754) -> Result<(), UnidicArtifactPayloadError> {
1755    write_u32_len(writer, field, value.len())?;
1756    writer.write_all(value.as_bytes())?;
1757    Ok(())
1758}
1759
1760fn write_u32_len(
1761    writer: &mut impl Write,
1762    field: &'static str,
1763    len: usize,
1764) -> Result<(), UnidicArtifactPayloadError> {
1765    let len = u32::try_from(len)
1766        .map_err(|_| UnidicArtifactPayloadError::BinaryValueTooLarge { field, len })?;
1767    writer.write_all(&len.to_le_bytes())?;
1768    Ok(())
1769}
1770
1771fn read_binary_string(
1772    reader: &mut impl Read,
1773    field: &'static str,
1774) -> Result<String, UnidicArtifactPayloadError> {
1775    let len = read_u32_le(reader, field)? as usize;
1776    check_limit(field, len, MAX_ARTIFACT_STRING_BYTES)?;
1777    let mut bytes = vec![0_u8; len];
1778    read_exact_binary(reader, &mut bytes, field)?;
1779    String::from_utf8(bytes)
1780        .map_err(|source| UnidicArtifactPayloadError::InvalidBinaryUtf8 { field, source })
1781}
1782
1783fn read_u32_le(
1784    reader: &mut impl Read,
1785    field: &'static str,
1786) -> Result<u32, UnidicArtifactPayloadError> {
1787    let mut bytes = [0_u8; 4];
1788    read_exact_binary(reader, &mut bytes, field)?;
1789    Ok(u32::from_le_bytes(bytes))
1790}
1791
1792fn read_u64_le(
1793    reader: &mut impl Read,
1794    field: &'static str,
1795) -> Result<u64, UnidicArtifactPayloadError> {
1796    let mut bytes = [0_u8; 8];
1797    read_exact_binary(reader, &mut bytes, field)?;
1798    Ok(u64::from_le_bytes(bytes))
1799}
1800
1801fn read_exact_binary(
1802    reader: &mut impl Read,
1803    bytes: &mut [u8],
1804    field: &'static str,
1805) -> Result<(), UnidicArtifactPayloadError> {
1806    match reader.read_exact(bytes) {
1807        Ok(()) => Ok(()),
1808        Err(err) if err.kind() == std::io::ErrorKind::UnexpectedEof => {
1809            Err(UnidicArtifactPayloadError::TruncatedBinary { field })
1810        }
1811        Err(err) => Err(UnidicArtifactPayloadError::Io(err)),
1812    }
1813}
1814
1815fn read_binary_artifact_payload_header(
1816    reader: &mut impl Read,
1817) -> Result<UnidicBinaryArtifactPayloadHeader, UnidicArtifactPayloadError> {
1818    let mut magic = [0_u8; 8];
1819    read_exact_binary(reader, &mut magic, "magic")?;
1820    if &magic != BINARY_ARTIFACT_MAGIC {
1821        return Err(UnidicArtifactPayloadError::InvalidBinaryMagic { magic });
1822    }
1823
1824    let version = read_u32_le(reader, "version")?;
1825    if version != BINARY_ARTIFACT_VERSION {
1826        return Err(UnidicArtifactPayloadError::UnsupportedBinaryVersion { version });
1827    }
1828
1829    let reserved = read_u32_le(reader, "reserved")?;
1830    if reserved != 0 {
1831        return Err(UnidicArtifactPayloadError::NonZeroBinaryReserved { value: reserved });
1832    }
1833
1834    let entry_count = read_u64_le(reader, "entry_count")?;
1835    let entries = usize::try_from(entry_count).map_err(|_| {
1836        UnidicArtifactPayloadError::BinaryEntryCountTooLarge {
1837            entries: entry_count,
1838        }
1839    })?;
1840    check_limit("entry_count", entries, MAX_ARTIFACT_ENTRIES)?;
1841
1842    Ok(UnidicBinaryArtifactPayloadHeader { version, entries })
1843}
1844
1845fn read_indexed_artifact_payload_header_bytes(
1846    bytes: &[u8],
1847) -> Result<UnidicIndexedArtifactPayloadHeader, UnidicArtifactPayloadError> {
1848    if bytes.len() < INDEXED_ARTIFACT_HEADER_LEN {
1849        return Err(UnidicArtifactPayloadError::TruncatedIndexed { field: "header" });
1850    }
1851    let mut magic = [0_u8; 8];
1852    magic.copy_from_slice(&bytes[..8]);
1853    if &magic != INDEXED_ARTIFACT_MAGIC {
1854        return Err(UnidicArtifactPayloadError::InvalidIndexedMagic { magic });
1855    }
1856
1857    let version = read_u32_le_bytes(bytes, 8, "version")?;
1858    if version != INDEXED_ARTIFACT_VERSION {
1859        return Err(UnidicArtifactPayloadError::UnsupportedIndexedVersion { version });
1860    }
1861    let reserved = read_u32_le_bytes(bytes, 12, "reserved")?;
1862    if reserved != 0 {
1863        return Err(UnidicArtifactPayloadError::NonZeroIndexedReserved { value: reserved });
1864    }
1865    let entry_count = read_u64_le_bytes(bytes, 16, "entry_count")?;
1866    let fst_len = read_u64_le_bytes(bytes, 24, "fst_len")?;
1867    let readings_len = read_u64_le_bytes(bytes, 32, "readings_len")?;
1868    let entries = checked_indexed_usize("entry_count", entry_count)?;
1869    check_limit("entry_count", entries, MAX_ARTIFACT_ENTRIES)?;
1870    Ok(UnidicIndexedArtifactPayloadHeader {
1871        version,
1872        entries,
1873        fst_len: checked_indexed_usize("fst_len", fst_len)?,
1874        readings_len: checked_indexed_usize("readings_len", readings_len)?,
1875    })
1876}
1877
1878fn read_u32_le_bytes(
1879    bytes: &[u8],
1880    offset: usize,
1881    field: &'static str,
1882) -> Result<u32, UnidicArtifactPayloadError> {
1883    let end = offset
1884        .checked_add(4)
1885        .ok_or(UnidicArtifactPayloadError::TruncatedIndexed { field })?;
1886    let chunk = bytes
1887        .get(offset..end)
1888        .ok_or(UnidicArtifactPayloadError::TruncatedIndexed { field })?;
1889    Ok(u32::from_le_bytes(
1890        chunk.try_into().expect("slice length is 4"),
1891    ))
1892}
1893
1894fn read_u64_le_bytes(
1895    bytes: &[u8],
1896    offset: usize,
1897    field: &'static str,
1898) -> Result<u64, UnidicArtifactPayloadError> {
1899    let end = offset
1900        .checked_add(8)
1901        .ok_or(UnidicArtifactPayloadError::TruncatedIndexed { field })?;
1902    let chunk = bytes
1903        .get(offset..end)
1904        .ok_or(UnidicArtifactPayloadError::TruncatedIndexed { field })?;
1905    Ok(u64::from_le_bytes(
1906        chunk.try_into().expect("slice length is 8"),
1907    ))
1908}
1909
1910fn checked_indexed_usize(
1911    field: &'static str,
1912    len: u64,
1913) -> Result<usize, UnidicArtifactPayloadError> {
1914    usize::try_from(len)
1915        .map_err(|_| UnidicArtifactPayloadError::IndexedSectionTooLarge { field, len })
1916}
1917
1918fn check_payload_file_size(path: &Path) -> Result<(), UnidicArtifactPayloadError> {
1919    let len = std::fs::metadata(path)?.len();
1920    if len > MAX_ARTIFACT_PAYLOAD_BYTES {
1921        return Err(UnidicArtifactPayloadError::ArtifactLimitExceeded {
1922            field: "payload_bytes",
1923            len,
1924            max: MAX_ARTIFACT_PAYLOAD_BYTES,
1925        });
1926    }
1927    Ok(())
1928}
1929
1930fn check_limit(
1931    field: &'static str,
1932    len: usize,
1933    max: usize,
1934) -> Result<(), UnidicArtifactPayloadError> {
1935    if len > max {
1936        return Err(UnidicArtifactPayloadError::ArtifactLimitExceeded {
1937            field,
1938            len: len as u64,
1939            max: max as u64,
1940        });
1941    }
1942    Ok(())
1943}
1944
1945fn write_indexed_reading_block(
1946    writer: &mut Vec<u8>,
1947    readings: &[String],
1948) -> Result<(), UnidicArtifactPayloadError> {
1949    write_u32_len(writer, "reading_count", readings.len())?;
1950    for reading in readings {
1951        write_binary_string(writer, "reading", reading)?;
1952    }
1953    Ok(())
1954}
1955
1956impl IndexedUnidicPayload {
1957    fn validate(&self) -> Result<(), UnidicArtifactPayloadError> {
1958        let mut stream = self.map.stream();
1959        while let Some((surface, offset)) = stream.next() {
1960            let surface = std::str::from_utf8(surface).map_err(|source| {
1961                UnidicArtifactPayloadError::InvalidIndexedUtf8 {
1962                    field: "surface",
1963                    source,
1964                }
1965            })?;
1966            if surface.is_empty() {
1967                return Err(UnidicArtifactPayloadError::EmptySurface { entry_index: 0 });
1968            }
1969            let readings = self.readings_at(offset)?;
1970            if readings.is_empty() {
1971                return Err(UnidicArtifactPayloadError::EmptyReadings {
1972                    surface: surface.to_string(),
1973                });
1974            }
1975            let mut seen = BTreeSet::new();
1976            for (reading_index, reading) in readings.iter().enumerate() {
1977                if reading.is_empty() {
1978                    return Err(UnidicArtifactPayloadError::EmptyReading {
1979                        surface: surface.to_string(),
1980                        reading_index,
1981                    });
1982                }
1983                if !seen.insert(reading) {
1984                    return Err(UnidicArtifactPayloadError::DuplicateReading {
1985                        surface: surface.to_string(),
1986                        reading: reading.clone(),
1987                    });
1988                }
1989            }
1990        }
1991        Ok(())
1992    }
1993
1994    fn readings(&self, surface: &str) -> Result<Option<Vec<String>>, UnidicArtifactPayloadError> {
1995        self.map
1996            .get(surface)
1997            .map(|offset| self.readings_at(offset))
1998            .transpose()
1999    }
2000
2001    fn entries(&self) -> Result<Vec<UnidicReadingIndexPayloadEntry>, UnidicArtifactPayloadError> {
2002        let mut entries = Vec::with_capacity(self.entries);
2003        let mut stream = self.map.stream();
2004        while let Some((surface, offset)) = stream.next() {
2005            let surface = std::str::from_utf8(surface)
2006                .map_err(|source| UnidicArtifactPayloadError::InvalidIndexedUtf8 {
2007                    field: "surface",
2008                    source,
2009                })?
2010                .to_string();
2011            let readings = self.readings_at(offset)?;
2012            entries.push(UnidicReadingIndexPayloadEntry { surface, readings });
2013        }
2014        Ok(entries)
2015    }
2016
2017    fn readings_at(&self, offset: u64) -> Result<Vec<String>, UnidicArtifactPayloadError> {
2018        read_indexed_readings_at_bytes(&self.mmap, self.readings_start, offset)
2019    }
2020}
2021
2022fn read_indexed_readings_at_bytes(
2023    bytes: &[u8],
2024    readings_start: usize,
2025    offset: u64,
2026) -> Result<Vec<String>, UnidicArtifactPayloadError> {
2027    let offset = usize::try_from(offset)
2028        .map_err(|_| UnidicArtifactPayloadError::InvalidIndexedOffset { offset })?;
2029    let start = readings_start.checked_add(offset).ok_or(
2030        UnidicArtifactPayloadError::InvalidIndexedOffset {
2031            offset: offset as u64,
2032        },
2033    )?;
2034    if start >= bytes.len() {
2035        return Err(UnidicArtifactPayloadError::InvalidIndexedOffset {
2036            offset: offset as u64,
2037        });
2038    }
2039    let mut cursor = start;
2040    let reading_count = read_u32_le_bytes(bytes, cursor, "reading_count")? as usize;
2041    check_limit(
2042        "reading_count",
2043        reading_count,
2044        MAX_ARTIFACT_READINGS_PER_ENTRY,
2045    )?;
2046    cursor += 4;
2047    let mut readings = Vec::with_capacity(reading_count);
2048    for _ in 0..reading_count {
2049        let len = read_u32_le_bytes(bytes, cursor, "reading_len")? as usize;
2050        check_limit("reading_bytes", len, MAX_ARTIFACT_STRING_BYTES)?;
2051        cursor += 4;
2052        let end = cursor
2053            .checked_add(len)
2054            .ok_or(UnidicArtifactPayloadError::TruncatedIndexed { field: "reading" })?;
2055        let reading_bytes = bytes
2056            .get(cursor..end)
2057            .ok_or(UnidicArtifactPayloadError::TruncatedIndexed { field: "reading" })?;
2058        let reading = std::str::from_utf8(reading_bytes)
2059            .map_err(|source| UnidicArtifactPayloadError::InvalidIndexedUtf8 {
2060                field: "reading",
2061                source,
2062            })?
2063            .to_string();
2064        readings.push(reading);
2065        cursor = end;
2066    }
2067    Ok(readings)
2068}
2069
2070/// Computes the SHA-256 file digest string for a UniDic artifact payload file.
2071pub fn artifact_file_digest_path(path: impl AsRef<Path>) -> Result<String, std::io::Error> {
2072    let file = File::open(path)?;
2073    artifact_file_digest_reader(file)
2074}
2075
2076/// Computes the SHA-256 file digest string from a reader.
2077pub fn artifact_file_digest_reader(mut reader: impl Read) -> Result<String, std::io::Error> {
2078    let mut hasher = Sha256::new();
2079    let mut buffer = [0_u8; 64 * 1024];
2080    loop {
2081        let read = reader.read(&mut buffer)?;
2082        if read == 0 {
2083            break;
2084        }
2085        hasher.update(&buffer[..read]);
2086    }
2087    Ok(sha256_digest_hex(hasher.finalize()))
2088}
2089
2090fn validate_artifact_payload_header(
2091    payload: &UnidicReadingIndexPayload,
2092) -> Result<(), UnidicArtifactPayloadError> {
2093    if payload.schema_version != ARTIFACT_PAYLOAD_SCHEMA_VERSION {
2094        return Err(UnidicArtifactPayloadError::UnsupportedSchemaVersion {
2095            version: payload.schema_version,
2096        });
2097    }
2098    if payload.payload_type != ARTIFACT_PAYLOAD_TYPE {
2099        return Err(UnidicArtifactPayloadError::UnsupportedPayloadType {
2100            payload_type: payload.payload_type.clone(),
2101        });
2102    }
2103    Ok(())
2104}
2105
2106fn canonical_payload_bytes(payload: &UnidicReadingIndexPayload) -> Vec<u8> {
2107    let mut bytes = Vec::new();
2108    bytes.extend_from_slice(b"moine.unidic.reading-index.surface-readings/v1\n");
2109    for entry in &payload.entries {
2110        push_len_prefixed(&mut bytes, b"S", &entry.surface);
2111        bytes.extend_from_slice(format!("R{}\n", entry.readings.len()).as_bytes());
2112        for reading in &entry.readings {
2113            push_len_prefixed(&mut bytes, b"r", reading);
2114        }
2115    }
2116    bytes
2117}
2118
/// Appends one canonical record to `bytes`: `tag`, the decimal byte length of
/// `value`, a newline, the bytes of `value`, and a closing newline.
fn push_len_prefixed(bytes: &mut Vec<u8>, tag: &[u8], value: &str) {
    let length_text = value.len().to_string();

    bytes.extend_from_slice(tag);
    bytes.extend_from_slice(length_text.as_bytes());
    bytes.push(b'\n');
    bytes.extend(value.bytes());
    bytes.push(b'\n');
}
2126
/// Computes the 64-bit FNV-1a hash of `bytes`: each byte is XORed into the
/// running hash, which is then multiplied (wrapping) by the FNV prime.
fn fnv1a64(bytes: &[u8]) -> u64 {
    const OFFSET_BASIS: u64 = 0xcbf29ce484222325;
    const PRIME: u64 = 0x100000001b3;

    bytes.iter().fold(OFFSET_BASIS, |hash, &byte| {
        (hash ^ u64::from(byte)).wrapping_mul(PRIME)
    })
}
2135
2136fn sha256_hex(bytes: &[u8]) -> String {
2137    sha256_digest_hex(Sha256::digest(bytes))
2138}
2139
/// Formats the bytes of `digest` as lowercase hexadecimal, two characters per
/// byte. The buffer is pre-sized for a 32-byte digest but any length works.
fn sha256_digest_hex(digest: impl IntoIterator<Item = u8>) -> String {
    digest
        .into_iter()
        .fold(String::with_capacity(64), |mut hex, byte| {
            write!(&mut hex, "{byte:02x}").expect("writing to String should not fail");
            hex
        })
}
2147
2148#[cfg(test)]
2149mod tests {
2150    use super::*;
2151
2152    #[test]
2153    fn builds_surface_to_readings_index() {
2154        let csv = "\
2155印刷,18331,19434,9138,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢,*,*,*,*,*,*,体,インサツ,インサツ,インサツ,インサツ,0,C2,*,752349454934528,2737
2156刃,18521,20041,11551,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和,ハ濁,基本形,*,*,*,*,体,ハ,ハ,ハ,ハ,1,C3,*,8060803244761600,29325
2157刃,18419,19578,12664,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和,*,*,*,*,*,*,体,ヤイバ,ヤイバ,ヤイバ,ヤイバ,\"1,0\",C1,*,18677687522566656,67949
2158";
2159        let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2160
2161        assert_eq!(
2162            index.readings("印刷").as_deref(),
2163            Some(&["インサツ".to_string()][..])
2164        );
2165        assert_eq!(
2166            index.readings("刃").as_deref(),
2167            Some(&["ハ".to_string(), "ヤイバ".to_string()][..])
2168        );
2169    }
2170
2171    #[test]
2172    fn skips_star_readings() {
2173        let csv = "記号,1,2,3,補助記号,一般,*,*,*,*,*,記号,記号,*,記号,*,記号\n";
2174        let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2175
2176        assert!(index.is_empty());
2177    }
2178
2179    #[test]
2180    fn excludes_ascii_and_symbol_surfaces_by_default() {
2181        let csv = "\
2182a,1,2,3,記号,文字,*,*,*,*,エー,a,a,エー,a,エー,外
2183!,1,2,3,補助記号,一般,*,*,*,*,!,!,!,!,!,!,記号
2184印刷,1,2,3,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢
2185";
2186        let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2187
2188        assert_eq!(index.readings("a"), None);
2189        assert_eq!(index.readings("!"), None);
2190        assert_eq!(
2191            index.readings("印刷").as_deref(),
2192            Some(&["インサツ".to_string()][..])
2193        );
2194    }
2195
2196    #[test]
2197    fn can_keep_ascii_surfaces_when_requested() {
2198        let csv = "a,1,2,3,名詞,普通名詞,一般,*,*,*,エー,a,a,エー,a,エー,外\n";
2199        let index = UnidicReadingIndex::from_lex_csv_reader_with_options(
2200            csv.as_bytes(),
2201            UnidicIndexOptions {
2202                exclude_ascii_surfaces: false,
2203                ..UnidicIndexOptions::default()
2204            },
2205        )
2206        .unwrap();
2207
2208        assert_eq!(
2209            index.readings("a").as_deref(),
2210            Some(&["エー".to_string()][..])
2211        );
2212    }
2213
2214    #[test]
2215    fn limits_readings_per_surface_when_requested() {
2216        let csv = "\
2217刃,1,2,3,名詞,普通名詞,一般,*,*,*,ジン,刃,刃,ジン,刃,ジン,漢
2218刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2219刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和
2220";
2221        let index = UnidicReadingIndex::from_lex_csv_reader_with_options(
2222            csv.as_bytes(),
2223            UnidicIndexOptions {
2224                max_readings_per_surface: Some(2),
2225                ..UnidicIndexOptions::default()
2226            },
2227        )
2228        .unwrap();
2229
2230        assert_eq!(
2231            index.readings("刃").as_deref(),
2232            Some(&["ジン".to_string(), "ハ".to_string()][..])
2233        );
2234    }
2235
2236    #[test]
2237    fn can_limit_readings_per_segment_at_query_time() {
2238        let csv = "\
2239刃,1,2,3,名詞,普通名詞,一般,*,*,*,ジン,刃,刃,ジン,刃,ジン,漢
2240刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2241刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和
2242";
2243        let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2244        let readings = index.reading_sequences(
2245            "刃",
2246            DictionaryReadingOptions {
2247                max_readings_per_segment: Some(2),
2248                ..DictionaryReadingOptions::default()
2249            },
2250        );
2251
2252        assert_eq!(readings, vec!["ジン".to_string(), "ハ".to_string()]);
2253    }
2254
2255    #[test]
2256    fn builds_artifact_metadata_from_index_and_options() {
2257        let csv = "\
2258刃,1,2,3,名詞,普通名詞,一般,*,*,*,ジン,刃,刃,ジン,刃,ジン,漢
2259刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2260";
2261        let index_options = UnidicIndexOptions {
2262            reading_field: UnidicReadingField::Pron,
2263            max_readings_per_surface: Some(1),
2264            exclude_ascii_surfaces: true,
2265            exclude_symbol_pos: true,
2266        };
2267        let index =
2268            UnidicReadingIndex::from_lex_csv_reader_with_options(csv.as_bytes(), index_options)
2269                .unwrap();
2270
2271        let metadata = index.artifact_metadata(UnidicArtifactMetadataOptions {
2272            artifact_name: "moine-unidic-cwj-202512".to_string(),
2273            generator: "moine-cli".to_string(),
2274            payload_file_name: "moine-unidic-cwj-202512.readings.yaml".to_string(),
2275            payload_format: "yaml.surface-readings.v1".to_string(),
2276            source_name: "UniDic-CWJ".to_string(),
2277            source_version: "2025.12".to_string(),
2278            source_lex_csv: "unidic-cwj-202512_full/lex.csv".to_string(),
2279            index_options,
2280            query_defaults: DictionaryReadingOptions {
2281                longest_match_only: true,
2282                max_readings_per_segment: Some(16),
2283                ..DictionaryReadingOptions::default()
2284            },
2285            license: UnidicArtifactLicense::default(),
2286        });
2287
2288        assert_eq!(metadata.schema_version, 1);
2289        assert_eq!(metadata.artifact_type, "moine.unidic.reading-index");
2290        assert_eq!(
2291            metadata.payload.path,
2292            "moine-unidic-cwj-202512.readings.yaml"
2293        );
2294        assert_eq!(metadata.payload.format, "yaml.surface-readings.v1");
2295        assert_eq!(
2296            metadata.payload.checksum_algorithm,
2297            ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM
2298        );
2299        assert_eq!(metadata.payload.checksum.len(), 64);
2300        assert_eq!(metadata.source.version, "2025.12");
2301        assert_eq!(metadata.build.reading_field, "pron");
2302        assert_eq!(metadata.build.entries, 1);
2303        assert_eq!(metadata.build.max_readings_per_surface, Some(1));
2304        assert!(metadata.query_defaults.longest_match_only);
2305        assert_eq!(metadata.query_defaults.max_readings_per_segment, Some(16));
2306        assert_eq!(metadata.license.selected_license, "BSD-3-Clause");
2307    }
2308
2309    #[test]
2310    fn builds_deterministic_payload_entries() {
2311        let csv = "\
2312刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和
2313印刷,1,2,3,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢
2314刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和
2315";
2316        let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
2317        let payload = index.artifact_payload();
2318
2319        assert_eq!(payload.schema_version, 1);
2320        assert_eq!(
2321            payload.payload_type,
2322            "moine.unidic.reading-index.surface-readings"
2323        );
2324        assert_eq!(
2325            payload.entries,
2326            vec![
2327                UnidicReadingIndexPayloadEntry {
2328                    surface: "刃".to_string(),
2329                    readings: vec!["ハ".to_string(), "ヤイバ".to_string()],
2330                },
2331                UnidicReadingIndexPayloadEntry {
2332                    surface: "印刷".to_string(),
2333                    readings: vec!["インサツ".to_string()],
2334                },
2335            ]
2336        );
2337    }
2338
2339    #[test]
2340    fn payload_checksum_changes_with_payload_content() {
2341        let first = UnidicReadingIndex::from_lex_csv_reader(
2342            "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和\n".as_bytes(),
2343        )
2344        .unwrap();
2345        let second = UnidicReadingIndex::from_lex_csv_reader(
2346            "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和\n".as_bytes(),
2347        )
2348        .unwrap();
2349
2350        assert_eq!(first.artifact_payload_checksum().len(), 64);
2351        assert_eq!(
2352            first.artifact_payload_checksum(),
2353            first
2354                .artifact_payload_checksum_for_algorithm(ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM)
2355                .unwrap()
2356        );
2357        assert_eq!(
2358            first
2359                .artifact_payload_checksum_for_algorithm(LEGACY_ARTIFACT_PAYLOAD_CHECKSUM_ALGORITHM)
2360                .unwrap()
2361                .len(),
2362            16
2363        );
2364        assert_ne!(
2365            first.artifact_payload_checksum(),
2366            second.artifact_payload_checksum()
2367        );
2368    }
2369
2370    #[test]
2371    fn loads_artifact_payload_back_into_index() {
2372        let payload = UnidicReadingIndexPayload {
2373            schema_version: 1,
2374            payload_type: "moine.unidic.reading-index.surface-readings".to_string(),
2375            entries: vec![UnidicReadingIndexPayloadEntry {
2376                surface: "印刷".to_string(),
2377                readings: vec!["インサツ".to_string()],
2378            }],
2379        };
2380
2381        let index = UnidicReadingIndex::from_artifact_payload(payload).unwrap();
2382
2383        assert_eq!(index.len(), 1);
2384        assert_eq!(
2385            index.readings("印刷").as_deref(),
2386            Some(&["インサツ".to_string()][..])
2387        );
2388    }
2389
/// A YAML payload document supplied through a byte reader produces an index
/// that exposes both readings listed for the surface.
#[test]
fn loads_artifact_payload_reader() {
    // Spelled line by line so the exact document text (including the trailing
    // newline) is visible regardless of source indentation.
    let document = concat!(
        "schema_version: 1\n",
        "payload_type: moine.unidic.reading-index.surface-readings\n",
        "entries:\n",
        "- surface: 刃\n",
        "  readings:\n",
        "  - ハ\n",
        "  - ヤイバ\n",
    );

    let parsed = UnidicReadingIndex::from_artifact_payload_reader(document.as_bytes()).unwrap();

    let expected = ["ハ".to_string(), "ヤイバ".to_string()];
    assert_eq!(parsed.readings("刃").as_deref(), Some(&expected[..]));
}
2409
/// Writing an index as a binary payload and reading it back yields the same
/// payload and checksum; the header reports version 1 and two entries (the
/// three CSV rows cover two distinct surfaces).
#[test]
fn binary_artifact_payload_round_trips_to_equivalent_index() {
    let csv = concat!(
        "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和\n",
        "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和\n",
        "印刷,1,2,3,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢\n",
    );
    let source = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();

    let mut encoded = Vec::new();
    source.write_artifact_binary_payload(&mut encoded).unwrap();

    // Decode the full payload and, independently, just its header.
    let decoded = UnidicReadingIndex::from_binary_artifact_payload_reader(encoded.as_slice())
        .expect("binary payload should load");
    let header = UnidicReadingIndex::binary_artifact_payload_header_reader(encoded.as_slice())
        .expect("binary payload header should load");

    let expected_header = UnidicBinaryArtifactPayloadHeader {
        version: 1,
        entries: 2,
    };
    assert_eq!(header, expected_header);
    assert_eq!(decoded.artifact_payload(), source.artifact_payload());
    assert_eq!(
        decoded.artifact_payload_checksum(),
        source.artifact_payload_checksum()
    );
}
2439
/// Round-trips an index through the indexed artifact format, loading it both
/// from a file on disk and from an in-memory byte slice, then checks lookups,
/// payload equality and checksum equality against the source index.
///
/// The temporary file is owned by a drop guard, so it is removed even when an
/// `expect` or assertion panics. The guard is declared before `loaded`, which
/// means `loaded` is dropped first and the file is deleted only afterwards.
/// NOTE(review): the path loader presumably memory-maps the file (the module
/// imports `memmap2::Mmap`); deleting a still-mapped file can fail on some
/// platforms, which is why the deletion is deferred — confirm against the
/// loader.
#[test]
fn indexed_artifact_payload_round_trips_and_supports_lookup() {
    /// Removes the wrapped path when dropped; failures are ignored because
    /// cleanup is best effort and must not mask the real test outcome.
    struct TempFileGuard(std::path::PathBuf);

    impl Drop for TempFileGuard {
        fn drop(&mut self) {
            let _ = std::fs::remove_file(&self.0);
        }
    }

    let csv = concat!(
        "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和\n",
        "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和\n",
        "印刷,1,2,3,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢\n",
    );
    let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
    let mut bytes = Vec::new();
    index.write_indexed_artifact_payload(&mut bytes).unwrap();

    // Process id plus a nanosecond timestamp keeps concurrent test runs from
    // colliding on the same temp file name.
    let unique = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_nanos();
    let temp_file = TempFileGuard(std::env::temp_dir().join(format!(
        "moine-indexed-test-{}-{}.moineidx",
        std::process::id(),
        unique
    )));
    std::fs::write(&temp_file.0, &bytes).unwrap();

    let loaded = UnidicReadingIndex::from_indexed_artifact_payload_path(&temp_file.0)
        .expect("indexed payload should load");
    let loaded_from_bytes = UnidicReadingIndex::from_indexed_artifact_payload_bytes(&bytes)
        .expect("indexed payload bytes should load");

    assert_eq!(loaded.len(), 2);
    assert_eq!(
        loaded.readings("刃").as_deref(),
        Some(&["ハ".to_string(), "ヤイバ".to_string()][..])
    );
    assert_eq!(
        loaded_from_bytes.artifact_payload(),
        index.artifact_payload()
    );
    assert_eq!(loaded.artifact_payload(), index.artifact_payload());
    assert_eq!(
        loaded.artifact_payload_checksum(),
        index.artifact_payload_checksum()
    );
    assert_eq!(
        loaded.reading_sequences("印刷", DictionaryReadingOptions::default()),
        vec!["インサツ".to_string()]
    );
}
2486
/// Pins the exact bytes produced by the binary payload writer for a small
/// index, so any change to the on-disk layout is caught.
///
/// The expected bytes are kept as literals on purpose: rebuilding them with
/// `to_le_bytes` would no longer show the byte order being asserted.
#[test]
fn binary_artifact_payload_uses_stable_little_endian_layout() {
    let csv = concat!(
        "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和\n",
        "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和\n",
        "印刷,1,2,3,名詞,普通名詞,サ変可能,*,*,*,インサツ,印刷,印刷,インサツ,印刷,インサツ,漢\n",
    );
    let source = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();

    let mut encoded = Vec::new();
    source.write_artifact_binary_payload(&mut encoded).unwrap();

    #[rustfmt::skip]
    let expected: &[u8] = &[
        // Magic "MOINEU01".
        b'M', b'O', b'I', b'N', b'E', b'U', b'0', b'1',
        // Version 1 as a little-endian u32.
        1, 0, 0, 0,
        // Second u32 header field, zero here (presumably reserved/flags — confirm).
        0, 0, 0, 0,
        // Entry count 2 as a little-endian u64.
        2, 0, 0, 0, 0, 0, 0, 0,
        // Surface "刃": u32 byte length, then UTF-8 bytes.
        3, 0, 0, 0, 0xe5, 0x88, 0x83,
        // Two readings follow.
        2, 0, 0, 0,
        // Reading "ハ".
        3, 0, 0, 0, 0xe3, 0x83, 0x8f,
        // Reading "ヤイバ".
        9, 0, 0, 0, 0xe3, 0x83, 0xa4, 0xe3, 0x82, 0xa4, 0xe3, 0x83, 0x90,
        // Surface "印刷".
        6, 0, 0, 0, 0xe5, 0x8d, 0xb0, 0xe5, 0x88, 0xb7,
        // One reading follows.
        1, 0, 0, 0,
        // Reading "インサツ".
        12, 0, 0, 0, 0xe3, 0x82, 0xa4, 0xe3, 0x83, 0xb3, 0xe3, 0x82, 0xb5, 0xe3, 0x83, 0x84,
    ];
    assert_eq!(encoded.as_slice(), expected);
}
2515
/// Eight bytes that are not the expected magic are rejected with
/// `InvalidBinaryMagic`.
#[test]
fn rejects_binary_artifact_bad_magic() {
    let bogus: [u8; 8] = *b"NOTMOINE";

    let outcome = UnidicReadingIndex::from_binary_artifact_payload_reader(&bogus[..]);
    let err = outcome.unwrap_err();

    assert!(matches!(
        err,
        UnidicArtifactPayloadError::InvalidBinaryMagic { .. }
    ));
}
2527
/// A header carrying version 2 is rejected with `UnsupportedBinaryVersion`
/// reporting that version.
#[test]
fn rejects_binary_artifact_unsupported_version() {
    // Header only: magic, version (u32), a zero u32 field, entry count (u64).
    let payload = [
        &b"MOINEU01"[..],
        &2_u32.to_le_bytes()[..],
        &0_u32.to_le_bytes()[..],
        &0_u64.to_le_bytes()[..],
    ]
    .concat();

    let outcome = UnidicReadingIndex::from_binary_artifact_payload_reader(payload.as_slice());
    let err = outcome.unwrap_err();

    assert!(matches!(
        err,
        UnidicArtifactPayloadError::UnsupportedBinaryVersion { version: 2 }
    ));
}
2544
/// A surface whose declared length exceeds the bytes actually present is
/// rejected with `TruncatedBinary` for the `surface` field.
#[test]
fn rejects_binary_artifact_truncated_string() {
    let payload = [
        &b"MOINEU01"[..],
        &1_u32.to_le_bytes()[..],
        &0_u32.to_le_bytes()[..],
        // One entry announced.
        &1_u64.to_le_bytes()[..],
        // Declared surface length is 4 bytes, but "刃" supplies only 3.
        &4_u32.to_le_bytes()[..],
        "刃".as_bytes(),
    ]
    .concat();

    let outcome = UnidicReadingIndex::from_binary_artifact_payload_reader(payload.as_slice());
    let err = outcome.unwrap_err();

    assert!(matches!(
        err,
        UnidicArtifactPayloadError::TruncatedBinary { field: "surface" }
    ));
}
2563
/// A surface made of a byte that is not valid UTF-8 is rejected with
/// `InvalidBinaryUtf8` for the `surface` field.
#[test]
fn rejects_binary_artifact_invalid_utf8() {
    let payload = [
        &b"MOINEU01"[..],
        &1_u32.to_le_bytes()[..],
        &0_u32.to_le_bytes()[..],
        // One entry announced.
        &1_u64.to_le_bytes()[..],
        // One-byte surface consisting of 0xff, which can never occur in UTF-8.
        &1_u32.to_le_bytes()[..],
        &[0xff_u8][..],
        // Trailing u32 zero after the surface (the reading count slot).
        &0_u32.to_le_bytes()[..],
    ]
    .concat();

    let outcome = UnidicReadingIndex::from_binary_artifact_payload_reader(payload.as_slice());
    let err = outcome.unwrap_err();

    assert!(matches!(
        err,
        UnidicArtifactPayloadError::InvalidBinaryUtf8 {
            field: "surface",
            ..
        }
    ));
}
2586
/// A header announcing one entry more than `MAX_ARTIFACT_ENTRIES` is rejected
/// with `ArtifactLimitExceeded` for the `entry_count` field.
#[test]
fn rejects_binary_artifact_excessive_entry_count() {
    let over_limit = (MAX_ARTIFACT_ENTRIES as u64) + 1;
    let payload = [
        &b"MOINEU01"[..],
        &1_u32.to_le_bytes()[..],
        &0_u32.to_le_bytes()[..],
        &over_limit.to_le_bytes()[..],
    ]
    .concat();

    let outcome = UnidicReadingIndex::from_binary_artifact_payload_reader(payload.as_slice());
    let err = outcome.unwrap_err();

    assert!(matches!(
        err,
        UnidicArtifactPayloadError::ArtifactLimitExceeded {
            field: "entry_count",
            ..
        }
    ));
}
2606
/// Two payload entries sharing the same surface are rejected with
/// `DuplicateSurface` naming that surface.
#[test]
fn rejects_artifact_payload_duplicate_surfaces() {
    // Same surface twice, each time with a different single reading.
    let entries = [("刃", "ハ"), ("刃", "ヤイバ")]
        .iter()
        .map(|&(surface, reading)| UnidicReadingIndexPayloadEntry {
            surface: surface.to_string(),
            readings: vec![reading.to_string()],
        })
        .collect();
    let payload = UnidicReadingIndexPayload {
        schema_version: 1,
        payload_type: "moine.unidic.reading-index.surface-readings".to_string(),
        entries,
    };

    let err = UnidicReadingIndex::from_artifact_payload(payload).unwrap_err();

    assert!(matches!(
        err,
        UnidicArtifactPayloadError::DuplicateSurface { surface } if surface == "刃"
    ));
}
2631
/// An entry listing the same reading twice is rejected with
/// `DuplicateReading` naming both the surface and the repeated reading.
#[test]
fn rejects_artifact_payload_duplicate_readings() {
    let repeated = UnidicReadingIndexPayloadEntry {
        surface: "刃".to_string(),
        readings: vec!["ハ".to_string(); 2],
    };
    let payload = UnidicReadingIndexPayload {
        schema_version: 1,
        payload_type: "moine.unidic.reading-index.surface-readings".to_string(),
        entries: vec![repeated],
    };

    let err = UnidicReadingIndex::from_artifact_payload(payload).unwrap_err();

    assert!(matches!(
        err,
        UnidicArtifactPayloadError::DuplicateReading { surface, reading }
            if surface == "刃" && reading == "ハ"
    ));
}
2651
/// An entry carrying one reading more than `MAX_ARTIFACT_READINGS_PER_ENTRY`
/// is rejected with `ArtifactLimitExceeded` for the `reading_count` field.
#[test]
fn rejects_artifact_payload_excessive_reading_count() {
    let too_many_readings = std::iter::repeat("ハ".to_string())
        .take(MAX_ARTIFACT_READINGS_PER_ENTRY + 1)
        .collect();
    let payload = UnidicReadingIndexPayload {
        schema_version: 1,
        payload_type: "moine.unidic.reading-index.surface-readings".to_string(),
        entries: vec![UnidicReadingIndexPayloadEntry {
            surface: "刃".to_string(),
            readings: too_many_readings,
        }],
    };

    let err = UnidicReadingIndex::from_artifact_payload(payload).unwrap_err();

    assert!(matches!(
        err,
        UnidicArtifactPayloadError::ArtifactLimitExceeded {
            field: "reading_count",
            ..
        }
    ));
}
2673
/// A payload declaring schema version 2 is rejected with
/// `UnsupportedSchemaVersion` reporting that version.
#[test]
fn rejects_artifact_payload_schema_mismatch() {
    let outcome = UnidicReadingIndex::from_artifact_payload(UnidicReadingIndexPayload {
        schema_version: 2,
        payload_type: "moine.unidic.reading-index.surface-readings".to_string(),
        entries: Vec::new(),
    });

    let err = outcome.unwrap_err();

    assert!(matches!(
        err,
        UnidicArtifactPayloadError::UnsupportedSchemaVersion { version: 2 }
    ));
}
2689
/// With three dictionary readings for one surface and a per-segment cap of
/// two, the expansion returns two paths and reports one pruned reading.
#[test]
fn reports_reading_expansion_stats() {
    let csv = concat!(
        "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ジン,刃,刃,ジン,刃,ジン,漢\n",
        "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和\n",
        "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和\n",
    );
    let capped = DictionaryReadingOptions {
        max_readings_per_segment: Some(2),
        ..DictionaryReadingOptions::default()
    };
    let expected_stats = DictionaryReadingStats {
        matched_spans: 1,
        direct_fallback_spans: 0,
        longest_match_pruned_spans: 0,
        raw_segment_readings: 3,
        used_segment_readings: 2,
        pruned_segment_readings: 1,
        candidate_combinations: 2,
        unique_paths: 2,
        duplicate_joined_readings: 0,
        max_paths_hit_count: 0,
    };

    let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
    let expansion = index.reading_paths_with_stats("刃", capped);

    assert_eq!(expansion.paths.len(), 2);
    assert_eq!(expansion.stats, expected_stats);
}
2723
/// With longest-match-only and a path limit of one, a single path is returned
/// and the stats record both longest-match pruning and hitting the path limit.
#[test]
fn reports_longest_match_and_path_limit_stats() {
    let csv = concat!(
        "茶,1,2,3,名詞,普通名詞,一般,*,*,*,チャ,茶,茶,チャ,茶,チャ,和\n",
        "道,1,2,3,名詞,普通名詞,一般,*,*,*,ミチ,道,道,ミチ,道,ミチ,和\n",
        "道具,1,2,3,名詞,普通名詞,一般,*,*,*,ドウグ,道具,道具,ドーグ,道具,ドーグ,和\n",
        "具,1,2,3,名詞,普通名詞,一般,*,*,*,グ,具,具,グ,具,グ,和\n",
    );
    let restricted = DictionaryReadingOptions {
        longest_match_only: true,
        max_paths: 1,
        ..DictionaryReadingOptions::default()
    };

    let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
    let expansion = index.reading_paths_with_stats("茶道具", restricted);
    let stats = &expansion.stats;

    assert_eq!(expansion.paths.len(), 1);
    assert!(stats.longest_match_pruned_spans > 0);
    assert!(stats.max_paths_hit_count > 0);
}
2746
/// In hybrid mode the kana/ASCII tail "さt" is passed through unchanged as its
/// own segment after the dictionary reading for "印", and two direct-fallback
/// spans are counted.
#[test]
fn hybrid_reading_paths_use_direct_fallback_for_kana_ascii_spans() {
    let segment = |surface: &str, reading: &str| DictionaryReadingSegment {
        surface: surface.to_string(),
        reading: reading.to_string(),
    };
    let csv = concat!("印,1,2,3,名詞,普通名詞,一般,*,*,*,イン,印,印,イン,印,イン,漢\n");

    let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
    let expansion =
        index.hybrid_reading_paths_with_stats("印さt", DictionaryReadingOptions::default());

    let expected_paths = vec![DictionaryReadingPath {
        joined_reading: "インさt".to_string(),
        segments: vec![segment("印", "イン"), segment("さt", "さt")],
    }];
    assert_eq!(expansion.paths, expected_paths);
    assert_eq!(expansion.stats.direct_fallback_spans, 2);
}
2774
/// Even with longest-match-only, hybrid mode keeps the shorter "印" match so
/// that the path "イン" + direct "さt" is still produced, and no span is
/// reported as pruned by longest-match.
#[test]
fn hybrid_reading_paths_keep_shorter_dictionary_spans_for_direct_tail() {
    let csv = concat!(
        "印,1,2,3,名詞,普通名詞,一般,*,*,*,イン,印,印,イン,印,イン,漢\n",
        "印さ,1,2,3,動詞,一般,*,*,*,*,シルス,印す,印す,シルス,印す,シルス,和\n",
    );
    let longest_only = DictionaryReadingOptions {
        longest_match_only: true,
        ..DictionaryReadingOptions::default()
    };

    let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
    let expansion = index.hybrid_reading_paths_with_stats("印さt", longest_only);

    let has_direct_tail_path = expansion
        .paths
        .iter()
        .any(|path| path.joined_reading == "インさt");
    assert!(has_direct_tail_path);
    assert_eq!(expansion.stats.longest_match_pruned_spans, 0);
}
2796
/// With an empty index, hybrid mode produces no path for input containing
/// kanji, while still counting one direct-fallback span.
#[test]
fn hybrid_reading_paths_still_reject_uncovered_kanji() {
    let empty_index = UnidicReadingIndex::default();
    let options = DictionaryReadingOptions::default();

    let expansion = empty_index.hybrid_reading_paths_with_stats("未知z", options);

    assert!(expansion.paths.is_empty());
    assert_eq!(expansion.stats.direct_fallback_spans, 1);
}
2806
/// Selecting `UnidicReadingField::Pron` indexes the pronunciation column, so
/// the second row contributes "バ" even though its lForm column is "ハ".
#[test]
fn can_use_pron_instead_of_lform() {
    // Full-width UniDic rows; the two rows differ in the pron column (ハ vs バ).
    let csv = concat!(
        "刃,18521,20041,11551,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和,ハ濁,基本形,*,*,*,*,体,ハ,ハ,ハ,ハ,1,C3,*,8060803244761600,29325\n",
        "刃,18521,20055,14836,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,バ,刃,バ,和,ハ濁,濁音形,*,*,*,*,体,バ,バ,バ,ハ,1,C3,*,8060803244769792,29325\n",
    );

    let by_pron = UnidicReadingIndex::from_lex_csv_reader_with_field(
        csv.as_bytes(),
        UnidicReadingField::Pron,
    )
    .unwrap();

    let expected = ["ハ".to_string(), "バ".to_string()];
    assert_eq!(by_pron.readings("刃").as_deref(), Some(&expected[..]));
}
2824
/// Joined readings for a multi-segment surface cover every reading of the
/// ambiguous final segment.
#[test]
fn builds_reading_sequences_from_dictionary_segments() {
    let csv = concat!(
        "鬼滅,1,2,3,名詞,普通名詞,一般,*,*,*,キメツ,鬼滅,鬼滅,キメツ,鬼滅,キメツ,固\n",
        "の,1,2,3,助詞,格助詞,*,*,*,*,ノ,の,の,ノ,の,ノ,和\n",
        "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ハ,刃,刃,ハ,刃,ハ,和\n",
        "刃,1,2,3,名詞,普通名詞,一般,*,*,*,ヤイバ,刃,刃,ヤイバ,刃,ヤイバ,和\n",
    );

    let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
    let sequences = index.reading_sequences("鬼滅の刃", DictionaryReadingOptions::default());

    let expected = vec!["キメツノハ".to_string(), "キメツノヤイバ".to_string()];
    assert_eq!(sequences, expected);
}
2841
/// A reading path records the joined reading together with each segment's
/// surface and the reading chosen for it.
#[test]
fn reading_paths_keep_segmentation_and_segment_readings() {
    let segment = |surface: &str, reading: &str| DictionaryReadingSegment {
        surface: surface.to_string(),
        reading: reading.to_string(),
    };
    let csv = concat!(
        "茶,1,2,3,名詞,普通名詞,一般,*,*,*,チャ,茶,茶,チャ,茶,チャ,和\n",
        "道具,1,2,3,名詞,普通名詞,一般,*,*,*,ドウグ,道具,道具,ドーグ,道具,ドーグ,漢\n",
    );
    let longest_only = DictionaryReadingOptions {
        longest_match_only: true,
        ..DictionaryReadingOptions::default()
    };

    let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
    let paths = index.reading_paths("茶道具", longest_only);

    let expected = vec![DictionaryReadingPath {
        joined_reading: "チャドウグ".to_string(),
        segments: vec![segment("茶", "チャ"), segment("道具", "ドウグ")],
    }];
    assert_eq!(paths, expected);
}
2874
/// The romaji lattice built from the index for "茶道具" matches the spelling
/// "chadougu" at distance zero.
#[test]
fn builds_romaji_lattice_from_dictionary_segments() {
    let csv = concat!(
        "茶,1,2,3,名詞,普通名詞,一般,*,*,*,チャ,茶,茶,チャ,茶,チャ,和\n",
        "道具,1,2,3,名詞,普通名詞,一般,*,*,*,ドウグ,道具,道具,ドーグ,道具,ドーグ,和\n",
    );
    let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();

    // Two layers to unwrap: the call must succeed and must yield a lattice.
    let lattice = index
        .romaji_lattice("茶道具", DictionaryReadingOptions::default())
        .unwrap()
        .unwrap();

    let typed = Lattice::from_paths(["chadougu"]);
    assert_eq!(moine_core::distance(&lattice, &typed), 0);
}
2892
/// A lattice built from two hand-written reading paths (long vowel written as
/// "ドウグ" and as "ドーグ") matches both "chadougu" and "chadoogu" at distance
/// zero.
#[test]
fn builds_romaji_lattice_directly_from_reading_paths() {
    let segment = |surface: &str, reading: &str| DictionaryReadingSegment {
        surface: surface.to_string(),
        reading: reading.to_string(),
    };
    let paths = vec![
        DictionaryReadingPath {
            joined_reading: "チャドウグ".to_string(),
            segments: vec![segment("茶", "チャ"), segment("道具", "ドウグ")],
        },
        DictionaryReadingPath {
            joined_reading: "チャドーグ".to_string(),
            segments: vec![segment("茶", "チャ"), segment("道具", "ドーグ")],
        },
    ];

    let lattice = romaji_lattice_from_reading_paths(&paths).unwrap();

    for spelling in ["chadougu", "chadoogu"] {
        assert_eq!(
            moine_core::distance(&lattice, &Lattice::from_paths([spelling])),
            0
        );
    }
}
2934
/// A small "ッ" ending one segment still combines with the next segment: the
/// lattice for "マッ" + "チャ" matches both "maccha" and "mattya" at distance
/// zero.
#[test]
fn structured_reading_paths_keep_cross_segment_context() {
    let segment = |surface: &str, reading: &str| DictionaryReadingSegment {
        surface: surface.to_string(),
        reading: reading.to_string(),
    };
    let paths = vec![DictionaryReadingPath {
        joined_reading: "マッチャ".to_string(),
        segments: vec![segment("抹", "マッ"), segment("茶", "チャ")],
    }];

    let lattice = romaji_lattice_from_reading_paths(&paths).unwrap();

    for spelling in ["maccha", "mattya"] {
        assert_eq!(
            moine_core::distance(&lattice, &Lattice::from_paths([spelling])),
            0
        );
    }
}
2961
/// With longest-match-only, "茶道具" yields the single joined reading built
/// from "茶" + "道具", even though "道" and "具" are also in the dictionary.
#[test]
fn can_restrict_reading_sequences_to_longest_matches() {
    let csv = concat!(
        "茶,1,2,3,名詞,普通名詞,一般,*,*,*,チャ,茶,茶,チャ,茶,チャ,和\n",
        "道,1,2,3,名詞,普通名詞,一般,*,*,*,ミチ,道,道,ミチ,道,ミチ,和\n",
        "道具,1,2,3,名詞,普通名詞,一般,*,*,*,ドウグ,道具,道具,ドーグ,道具,ドーグ,和\n",
        "具,1,2,3,名詞,普通名詞,一般,*,*,*,グ,具,具,グ,具,グ,和\n",
    );
    let longest_only = DictionaryReadingOptions {
        longest_match_only: true,
        ..DictionaryReadingOptions::default()
    };

    let index = UnidicReadingIndex::from_lex_csv_reader(csv.as_bytes()).unwrap();
    let sequences = index.reading_sequences("茶道具", longest_only);

    let expected = vec!["チャドウグ".to_string()];
    assert_eq!(sequences, expected);
}
2981}