aozora_encoding/
lib.rs

1//! Encoding utilities for Aozora Bunko source material.
2//!
3//! The `aozora` parser itself is strictly UTF-8. Anything that decodes `Shift_JIS` or
4//! resolves gaiji (外字) mappings lives here, so the parser stays free of encoding
5//! concerns and the same logic is available to CLI, editor integrations, or
6//! downstream tools.
7
8#![forbid(unsafe_code)]
9
10use std::borrow::Cow;
11use std::str::from_utf8;
12
13use encoding_rs::{DecoderResult, SHIFT_JIS};
14use miette::Diagnostic;
15use thiserror::Error;
16
/// Errors surfaced by the decode pipeline.
///
/// The `Display` text is Japanese (it is shown to end users), and each
/// variant carries a `miette` diagnostic code so tooling can identify the
/// failure without parsing the message. The enum is `#[non_exhaustive]`,
/// so matches outside this crate need a wildcard arm.
#[derive(Debug, Error, Diagnostic)]
#[non_exhaustive]
pub enum DecodeError {
    /// The input bytes could not be decoded as `Shift_JIS`.
    #[error("Shift_JIS からの変換に失敗しました (不正なバイト列)")]
    #[diagnostic(code(aozora::encoding::sjis_invalid))]
    ShiftJisInvalid,
}
25
26/// Decode a `Shift_JIS` byte slice into UTF-8 (NFC normalisation is applied by the
27/// caller after decoding).
28///
29/// # Errors
30///
31/// Returns [`DecodeError::ShiftJisInvalid`] if `encoding_rs` reports a malformed byte
32/// sequence. Lossy replacement is deliberately not offered — callers need to know
33/// when they're looking at corrupted source material rather than silently absorbing
34/// the damage.
35///
36/// Allocates a fresh `String` per call. For workloads that decode many
37/// documents in succession, prefer [`decode_sjis_into`] with a reusable
38/// buffer to avoid the per-call allocation.
39pub fn decode_sjis(input: &[u8]) -> Result<String, DecodeError> {
40    let mut out = String::new();
41    decode_sjis_into(input, &mut out)?;
42    Ok(out)
43}
44
45/// Decode a `Shift_JIS` byte slice into the caller-owned `dst` buffer.
46///
47/// Pre-sizes `dst` exactly via
48/// `encoding_rs::Decoder::max_utf8_buffer_length_without_replacement`
49/// so the decode inner loop does no growth-realloc. The buffer is
50/// **not** cleared first — callers that want a fresh decode should
51/// `dst.clear()` before calling. This is intentional so the same
52/// buffer can be reused across many decodes in a thread-local /
53/// per-worker pool without paying the allocator per iteration.
54///
55/// Strict — same error contract as [`decode_sjis`]. Bypasses
56/// `encoding_rs`'s public `decode` shape, which always allocates a
57/// worst-case-sized `String` internally and `Cow::into_owned`s the
58/// result; this entry point goes straight through the
59/// `Decoder::decode_to_string_without_replacement` API the bench
60/// pipeline needs.
61///
62/// # Errors
63///
64/// Returns [`DecodeError::ShiftJisInvalid`] on malformed input or if
65/// the encoder reports overflow (which `max_utf8_buffer_length_…`
66/// should make unreachable, but is still surfaced rather than
67/// silently truncated).
68pub fn decode_sjis_into(input: &[u8], dst: &mut String) -> Result<(), DecodeError> {
69    let mut decoder = SHIFT_JIS.new_decoder_without_bom_handling();
70    let needed = decoder
71        .max_utf8_buffer_length_without_replacement(input.len())
72        .ok_or(DecodeError::ShiftJisInvalid)?;
73    dst.reserve(needed);
74    let (result, _read) = decoder.decode_to_string_without_replacement(input, dst, true);
75    match result {
76        DecoderResult::InputEmpty => Ok(()),
77        DecoderResult::Malformed(_, _) | DecoderResult::OutputFull => {
78            Err(DecodeError::ShiftJisInvalid)
79        }
80    }
81}
82
83/// Decode Aozora source bytes to UTF-8, detecting the encoding.
84///
85/// Aozora material reaches this crate in two shapes: the canonical
86/// `Shift_JIS` archive files, and already-decoded UTF-8 mirrors (e.g. a
87/// corpus that has been pre-converted). Forcing every caller to commit
88/// to one encoding up front is the wrong default — it is why corpus
89/// tooling historically hard-coded [`decode_sjis`] and silently broke on
90/// UTF-8 input. This entry point removes that concern: hand it bytes,
91/// get back UTF-8.
92///
93/// - Valid UTF-8 is returned **borrowed**, zero-copy.
94/// - Otherwise the bytes are decoded as `Shift_JIS` (owned).
95///
96/// UTF-8 is tried first on purpose. Valid UTF-8 is a near-unambiguous
97/// signal — `Shift_JIS` Japanese text essentially never forms a wholly
98/// valid UTF-8 sequence — whereas the converse does not hold: a UTF-8
99/// document can contain byte runs that decode as *some* `Shift_JIS`
100/// without erroring, so sniffing `Shift_JIS` first risks mojibake on
101/// UTF-8 input.
102///
103/// BOM stripping, CRLF folding and NFC normalisation are the parser's
104/// Phase-0 responsibility and are deliberately not applied here.
105///
106/// # Errors
107///
108/// Returns [`DecodeError::ShiftJisInvalid`] when the bytes are neither
109/// valid UTF-8 nor valid `Shift_JIS`.
110pub fn decode_auto(input: &[u8]) -> Result<Cow<'_, str>, DecodeError> {
111    if let Ok(text) = from_utf8(input) {
112        return Ok(Cow::Borrowed(text));
113    }
114    decode_sjis(input).map(Cow::Owned)
115}
116
117/// Encoding-agnostic counterpart to [`decode_sjis_into`]: append the
118/// decoded UTF-8 to `dst`, detecting the source encoding.
119///
120/// Same sniffing rule as [`decode_auto`] (valid UTF-8 wins, else
121/// `Shift_JIS`), but writes into a caller-owned buffer so corpus
122/// loaders can reuse one allocation across many documents. The buffer
123/// is **not** cleared first — see [`decode_sjis_into`].
124///
125/// # Errors
126///
127/// Returns [`DecodeError::ShiftJisInvalid`] when the bytes are neither
128/// valid UTF-8 nor valid `Shift_JIS`.
129pub fn decode_auto_into(input: &[u8], dst: &mut String) -> Result<(), DecodeError> {
130    if let Ok(text) = from_utf8(input) {
131        dst.push_str(text);
132        return Ok(());
133    }
134    decode_sjis_into(input, dst)
135}
136
/// Report whether `input` begins with the UTF-8 byte-order mark
/// (`EF BB BF`).
///
/// The CLI uses this to strip the BOM before passing input to the parser.
/// A BOM is itself valid UTF-8, so [`decode_auto`] passes it through
/// unchanged; detecting it is left to the caller, and the parser's Phase-0
/// sanitiser is what ultimately removes it.
#[must_use]
pub const fn has_utf8_bom(input: &[u8]) -> bool {
    const BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];
    match input {
        // Only the first three bytes matter; any tail (including none) is fine.
        [first, second, third, ..] => {
            *first == BOM[0] && *second == BOM[1] && *third == BOM[2]
        }
        // Fewer than three bytes cannot hold a BOM.
        _ => false,
    }
}
147
148pub mod gaiji;
/// PHF tables (single, combo, description) emitted by `build.rs`
/// at compile time via `phf_codegen`. Lives in `OUT_DIR` so it's
/// regenerated automatically when any input TSV changes; the
/// committed source tree carries only the data, not the perfect-
/// hash output. See `build.rs` for the generator.
///
/// The module is private to this crate; its contents are whatever the
/// generated `jisx0213_table.rs` file defines.
#[allow(
    clippy::unreadable_literal,
    reason = "phf_codegen emits 64-bit perfect-hash keys without separators; \
              we cannot reformat them without forking the codegen crate"
)]
mod jisx0213_table {
    // Textually splice in the generated file from the build output directory.
    include!(concat!(env!("OUT_DIR"), "/jisx0213_table.rs"));
}
162
163#[cfg(test)]
164mod tests {
165    use super::*;
166
167    // ------------------------------------------------------------------
168    // SJIS happy-path decoding
169    // ------------------------------------------------------------------
170
171    #[test]
172    fn decodes_plain_ascii_sjis() {
173        assert_eq!(decode_sjis(b"hello").unwrap(), "hello");
174    }
175
176    #[test]
177    fn decodes_japanese_sjis() {
178        // 「青空文庫」 in Shift_JIS.
179        let bytes = &[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9];
180        assert_eq!(decode_sjis(bytes).unwrap(), "青空文庫");
181    }
182
183    #[test]
184    fn decodes_empty_input_to_empty_string() {
185        assert_eq!(decode_sjis(b"").unwrap(), "");
186    }
187
188    #[test]
189    fn decodes_ascii_control_characters_verbatim() {
190        // LF / CR / tab are 1:1 identity in SJIS since the lead byte
191        // range avoids ASCII. Exercising these locks in the pipeline
192        // doesn't mangle them before the sanitize pass.
193        assert_eq!(decode_sjis(b"a\nb\rc\td").unwrap(), "a\nb\rc\td");
194    }
195
196    #[test]
197    fn decodes_halfwidth_katakana() {
198        // Halfwidth katakana (0xA1..=0xDF) is a single byte each in SJIS.
199        // `ｱｲｳｴｵ` → bytes 0xB1..0xB5.
200        let bytes = &[0xB1, 0xB2, 0xB3, 0xB4, 0xB5];
201        assert_eq!(decode_sjis(bytes).unwrap(), "ｱｲｳｴｵ");
202    }
203
204    #[test]
205    fn decodes_mixed_ascii_and_kanji() {
206        // Common shape in Aozora corpora: explanatory text in ASCII
207        // mixed with Japanese quotations.
208        let mut bytes = Vec::from(*b"about ");
209        bytes.extend_from_slice(&[0x93, 0xFA, 0x96, 0x7B]); // 日本
210        bytes.extend_from_slice(b" !");
211        assert_eq!(decode_sjis(&bytes).unwrap(), "about 日本 !");
212    }
213
214    #[test]
215    fn decodes_hiragana_sjis() {
216        // 「こんにちは」 — lead bytes in the 0x82 range.
217        let bytes = &[
218            0x82, 0xB1, // こ
219            0x82, 0xF1, // ん
220            0x82, 0xC9, // に
221            0x82, 0xBF, // ち
222            0x82, 0xCD, // は
223        ];
224        assert_eq!(decode_sjis(bytes).unwrap(), "こんにちは");
225    }
226
    #[test]
    fn decodes_fullwidth_digits() {
        // ０１２ (0x82 0x4F, 0x82 0x50, 0x82 0x51) — fullwidth digits are
        // common in Aozora ruby delimiters.
        let bytes = &[0x82, 0x4F, 0x82, 0x50, 0x82, 0x51];
        assert_eq!(decode_sjis(bytes).unwrap(), "０１２");
    }
233
234    // ------------------------------------------------------------------
235    // decode_auto — encoding-agnostic entry point
236    // ------------------------------------------------------------------
237
238    #[test]
239    fn decode_auto_passes_utf8_through_borrowed() {
240        let bytes = "青空文庫".as_bytes();
241        let out = decode_auto(bytes).unwrap();
242        assert!(matches!(out, Cow::Borrowed(_)), "UTF-8 must be zero-copy");
243        assert_eq!(out, "青空文庫");
244    }
245
246    #[test]
247    fn decode_auto_falls_back_to_sjis_owned() {
248        // 「青空文庫」 in Shift_JIS — not valid UTF-8, so it decodes.
249        let bytes = &[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9];
250        let out = decode_auto(bytes).unwrap();
251        assert!(
252            matches!(out, Cow::Owned(_)),
253            "SJIS must be decoded to owned"
254        );
255        assert_eq!(out, "青空文庫");
256    }
257
258    #[test]
259    fn decode_auto_borrows_ascii() {
260        // ASCII is valid in both encodings; UTF-8-first means borrowed.
261        let out = decode_auto(b"hello").unwrap();
262        assert!(matches!(out, Cow::Borrowed(_)));
263        assert_eq!(out, "hello");
264    }
265
    #[test]
    fn decode_auto_prefers_utf8_over_ambiguous_sjis() {
        // UTF-8 「日本語」 = E6 97 A5 ... whose leading bytes are *also*
        // valid Shift_JIS lead bytes. Sniffing SJIS first would mojibake
        // this; UTF-8-first returns the correct text, borrowed.
        let bytes = "日本語".as_bytes();
        let out = decode_auto(bytes).unwrap();
        assert!(matches!(out, Cow::Borrowed(_)));
        assert_eq!(out, "日本語");
    }
276
277    #[test]
278    fn decode_auto_errors_when_neither_encoding_fits() {
279        // 0xFF is neither a valid UTF-8 byte nor an assigned Shift_JIS byte.
280        assert!(matches!(
281            decode_auto(&[0xFF, 0xFF]),
282            Err(DecodeError::ShiftJisInvalid)
283        ));
284    }
285
286    #[test]
287    fn decode_auto_empty_is_borrowed_empty() {
288        let out = decode_auto(b"").unwrap();
289        assert!(matches!(out, Cow::Borrowed(_)));
290        assert_eq!(out, "");
291    }
292
293    #[test]
294    fn decode_auto_into_appends_both_encodings() {
295        let mut buf = String::new();
296        decode_auto_into("青空".as_bytes(), &mut buf).unwrap(); // UTF-8
297        decode_auto_into(&[0x95, 0xB6, 0x8C, 0xC9], &mut buf).unwrap(); // 文庫 in SJIS
298        assert_eq!(buf, "青空文庫");
299    }
300
301    // ------------------------------------------------------------------
302    // decode_sjis_into — buffer-reuse path equivalence
303    // ------------------------------------------------------------------
304    //
305    // Every test below the section header verifies the contract that
306    // `decode_sjis(b) == decode_sjis_into(b, &mut buf)` byte-for-byte
307    // (and for the strict-error case, returns the same `Err`).
308    // `decode_sjis_into` is the buffer-reuse entry point used by the
309    // bench `parallel_size_bands` thread-local pool; the production
310    // `decode_sjis` is a thin wrapper that calls `decode_sjis_into`
311    // with a fresh `String`.
312
313    fn check_equivalent(input: &[u8]) {
314        let owned = decode_sjis(input);
315        let mut buf = String::new();
316        let into_result = decode_sjis_into(input, &mut buf);
317        match (owned, into_result) {
318            (Ok(s), Ok(())) => assert_eq!(s, buf, "decode_sjis output != decode_sjis_into output"),
319            (Err(_), Err(_)) => {} // both fail — identical strict error contract
320            (Ok(s), Err(e)) => panic!("owned succeeded ({s:?}) but _into failed ({e:?})"),
321            (Err(e), Ok(())) => panic!("owned failed ({e:?}) but _into succeeded ({buf:?})"),
322        }
323    }
324
325    #[test]
326    fn into_equivalent_on_ascii() {
327        check_equivalent(b"hello world");
328    }
329
330    #[test]
331    fn into_equivalent_on_japanese() {
332        check_equivalent(&[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9]);
333    }
334
335    #[test]
336    fn into_equivalent_on_empty() {
337        check_equivalent(b"");
338    }
339
340    #[test]
341    fn into_equivalent_on_halfwidth_katakana() {
342        check_equivalent(&[0xB1, 0xB2, 0xB3, 0xB4, 0xB5]);
343    }
344
345    #[test]
346    fn into_equivalent_on_invalid_lead_byte() {
347        check_equivalent(&[0xFF, 0xFF]);
348    }
349
350    #[test]
351    fn into_equivalent_on_lone_lead_byte() {
352        check_equivalent(&[b'o', b'k', 0x82]);
353    }
354
355    #[test]
356    fn into_reuses_buffer_capacity_across_calls() {
357        // The buffer-reuse contract: a `dst` String that already has
358        // enough capacity should not allocate again on the second
359        // decode. We verify this by asserting capacity is preserved
360        // across `clear() + decode_sjis_into` cycles. (Pinning the
361        // exact byte count would couple the test to bumpalo /
362        // encoding_rs internals; the load-bearing invariant is "no
363        // shrink".)
364        let mut buf = String::with_capacity(4096);
365        let cap_before = buf.capacity();
366        decode_sjis_into(b"hello", &mut buf).unwrap();
367        let cap_after_first = buf.capacity();
368        assert!(
369            cap_after_first >= cap_before,
370            "capacity must not shrink on small decode"
371        );
372        buf.clear();
373        decode_sjis_into(b"world", &mut buf).unwrap();
374        assert!(
375            buf.capacity() >= cap_after_first,
376            "capacity must not shrink on a buffer-reuse cycle"
377        );
378    }
379
380    #[test]
381    fn into_appends_when_dst_not_cleared() {
382        // Documented contract: callers must `clear()` before each
383        // decode if they want a fresh result. This test pins that
384        // shape so future "convenience clear inside the function"
385        // changes break loudly.
386        let mut buf = String::from("PRE:");
387        decode_sjis_into(b"hi", &mut buf).unwrap();
388        assert_eq!(buf, "PRE:hi");
389    }
390
391    // ------------------------------------------------------------------
392    // SJIS error surfaces
393    // ------------------------------------------------------------------
394
395    #[test]
396    fn rejects_invalid_lead_byte() {
397        let bytes = &[0xFF, 0xFF];
398        assert!(matches!(
399            decode_sjis(bytes),
400            Err(DecodeError::ShiftJisInvalid)
401        ));
402    }
403
404    #[test]
405    fn rejects_lone_lead_byte_at_end_of_input() {
406        // 0x82 alone is a truncated two-byte sequence (expects trail).
407        let bytes = &[b'o', b'k', 0x82];
408        assert!(matches!(
409            decode_sjis(bytes),
410            Err(DecodeError::ShiftJisInvalid)
411        ));
412    }
413
414    #[test]
415    fn rejects_invalid_trail_byte() {
416        // Lead 0x82 with an invalid trail 0x00 (trails must be 0x40..=0xFC, != 0x7F).
417        let bytes = &[0x82, 0x00];
418        assert!(matches!(
419            decode_sjis(bytes),
420            Err(DecodeError::ShiftJisInvalid)
421        ));
422    }
423
424    #[test]
425    fn error_message_is_japanese_and_carries_miette_code() {
426        // The project-wide rule is that user-facing errors are in
427        // Japanese. Pin that and the miette diagnostic code both.
428        let err = decode_sjis(&[0xFF, 0xFF]).unwrap_err();
429        let message = format!("{err}");
430        assert!(
431            message.contains("Shift_JIS"),
432            "error message must contain Shift_JIS for locatability, got {message:?}",
433        );
434    }
435
436    // ------------------------------------------------------------------
437    // UTF-8 BOM detection
438    // ------------------------------------------------------------------
439
440    #[test]
441    fn detects_utf8_bom() {
442        assert!(has_utf8_bom(b"\xEF\xBB\xBFtext"));
443    }
444
445    #[test]
446    fn no_utf8_bom_on_plain_input() {
447        assert!(!has_utf8_bom(b"text"));
448    }
449
450    #[test]
451    fn no_utf8_bom_on_shorter_than_bom() {
452        assert!(!has_utf8_bom(b"\xEF\xBB"));
453    }
454
455    #[test]
456    fn no_utf8_bom_on_empty_input() {
457        assert!(!has_utf8_bom(b""));
458    }
459
460    #[test]
461    fn detects_utf8_bom_on_exactly_three_bytes() {
462        // Boundary: the slice is exactly `EF BB BF` with no trailing
463        // content. `matches!` pattern with `..` rest binding accepts
464        // empty tails.
465        assert!(has_utf8_bom(&[0xEF, 0xBB, 0xBF]));
466    }
467
468    #[test]
469    fn bom_detection_rejects_near_misses() {
470        // Off-by-one patterns that are NOT the UTF-8 BOM.
471        assert!(!has_utf8_bom(&[0xEF, 0xBB, 0xBE])); // last byte wrong
472        assert!(!has_utf8_bom(&[0xEE, 0xBB, 0xBF])); // first byte wrong
473        assert!(!has_utf8_bom(&[0xEF, 0xBC, 0xBF])); // middle byte wrong
474        assert!(!has_utf8_bom(&[0xFE, 0xFF])); // UTF-16 BE BOM — not ours
475        assert!(!has_utf8_bom(&[0xFF, 0xFE])); // UTF-16 LE BOM — not ours
476    }
477
478    // ------------------------------------------------------------------
479    // Gaiji resolution (via primitive `gaiji::lookup`)
480    // ------------------------------------------------------------------
481
482    #[test]
483    fn gaiji_lookup_echoes_existing_ucs_when_set() {
484        assert_eq!(
485            gaiji::lookup(Some('吶'), Some("第3水準1-85-54"), "木＋吶のつくり"),
486            Some(gaiji::Resolved::Char('吶'))
487        );
488    }
489
490    #[test]
491    fn gaiji_lookup_returns_none_when_unresolvable() {
492        assert_eq!(gaiji::lookup(None, None, "第3水準1-85-54"), None);
493    }
494}
aozora_encoding/lib.rs

aozora_encoding/
lib.rs