aozora-encoding 0.4.1

Aozora Bunko notation: Shift_JIS decoding and gaiji (外字) resolution. Internal crate; depend on the aozora umbrella crate instead.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
//! Encoding utilities for Aozora Bunko source material.
//!
//! The `aozora` parser itself is strictly UTF-8. Anything that decodes `Shift_JIS` or
//! resolves gaiji (外字) mappings lives here, so the parser stays free of encoding
//! concerns and the same logic is available to CLI, editor integrations, or
//! downstream tools.

#![forbid(unsafe_code)]

use std::borrow::Cow;
use std::str::from_utf8;

use encoding_rs::{DecoderResult, SHIFT_JIS};
use miette::Diagnostic;
use thiserror::Error;

/// Errors surfaced by the decode pipeline.
///
/// The enum is `#[non_exhaustive]`, so downstream `match`es need a wildcard
/// arm. Every variant carries a `miette` diagnostic code, and the
/// user-facing message text is Japanese.
#[derive(Debug, Error, Diagnostic)]
#[non_exhaustive]
pub enum DecodeError {
    /// The input could not be decoded as `Shift_JIS`.
    ///
    /// The message reads "conversion from Shift_JIS failed (invalid byte
    /// sequence)". This is the only failure the decode functions in this
    /// module report, including the auto-detecting ones when the input is
    /// neither valid UTF-8 nor valid `Shift_JIS`.
    #[error("Shift_JIS からの変換に失敗しました (不正なバイト列)")]
    #[diagnostic(code(aozora::encoding::sjis_invalid))]
    ShiftJisInvalid,
}

/// Strictly decode `Shift_JIS` bytes into a freshly allocated UTF-8 `String`.
///
/// No normalisation is performed here; NFC normalisation is left to the
/// caller once decoding has succeeded.
///
/// This is a convenience wrapper over [`decode_sjis_into`] that supplies a
/// new, empty buffer on every call. When decoding many documents in a row,
/// call [`decode_sjis_into`] directly with a reusable buffer to skip the
/// per-call allocation.
///
/// # Errors
///
/// Returns [`DecodeError::ShiftJisInvalid`] when `encoding_rs` reports a
/// malformed byte sequence. There is intentionally no lossy mode: corrupted
/// source material must be reported to the caller, not papered over with
/// replacement characters.
pub fn decode_sjis(input: &[u8]) -> Result<String, DecodeError> {
    let mut decoded = String::new();
    decode_sjis_into(input, &mut decoded).map(|()| decoded)
}

/// Decode a `Shift_JIS` byte slice, appending the UTF-8 result to the
/// caller-owned `dst` buffer.
///
/// Before decoding, `dst` is grown by the worst-case output size reported by
/// `encoding_rs::Decoder::max_utf8_buffer_length_without_replacement`, so
/// the decode loop itself never has to reallocate.
///
/// The buffer is **not** cleared first — callers that want a fresh decode
/// should `dst.clear()` before calling. This is intentional so the same
/// buffer can be reused across many decodes (e.g. in a thread-local or
/// per-worker pool) without paying the allocator on each iteration.
///
/// Decoding goes through
/// `Decoder::decode_to_string_without_replacement`, which writes directly
/// into `dst` and reports malformed input instead of substituting U+FFFD.
///
/// # Errors
///
/// Returns [`DecodeError::ShiftJisInvalid`] on malformed input, if the
/// worst-case output length cannot be represented, or if the decoder
/// reports that the output buffer is full (the up-front reservation should
/// make that unreachable, but it is surfaced rather than silently
/// truncated).
///
/// On error `dst` is restored to the contents it had on entry: the prefix
/// that was decoded before the failure is discarded, so a reused buffer
/// never ends up holding half of a corrupted document.
pub fn decode_sjis_into(input: &[u8], dst: &mut String) -> Result<(), DecodeError> {
    let mut decoder = SHIFT_JIS.new_decoder_without_bom_handling();
    let needed = decoder
        .max_utf8_buffer_length_without_replacement(input.len())
        .ok_or(DecodeError::ShiftJisInvalid)?;
    dst.reserve(needed);

    // Remember where the caller's existing content ends so a failed decode
    // can be rolled back. `len()` of a `String` is always a char boundary.
    let rollback_len = dst.len();

    // `last = true`: `input` is the complete document, so a dangling lead
    // byte at the end is reported as malformed instead of being buffered.
    let (result, _read) = decoder.decode_to_string_without_replacement(input, dst, true);
    match result {
        DecoderResult::InputEmpty => Ok(()),
        DecoderResult::Malformed(_, _) | DecoderResult::OutputFull => {
            dst.truncate(rollback_len);
            Err(DecodeError::ShiftJisInvalid)
        }
    }
}

/// Turn Aozora source bytes into UTF-8 text, sniffing the encoding.
///
/// Input arrives in one of two shapes: the canonical `Shift_JIS` archive
/// files, or UTF-8 mirrors that were converted ahead of time. Callers do
/// not have to know which one they hold — tooling that hard-coded
/// [`decode_sjis`] used to break on UTF-8 input — they pass bytes and get
/// UTF-8 back:
///
/// - input that is already valid UTF-8 is returned **borrowed** (zero-copy);
/// - anything else is decoded as `Shift_JIS` into an owned `String`.
///
/// The UTF-8 check runs first deliberately. `Shift_JIS` Japanese text
/// essentially never happens to be wholly valid UTF-8, so a successful
/// UTF-8 validation is a near-unambiguous signal. The reverse is not true:
/// UTF-8 bytes can decode as *some* `Shift_JIS` without an error, so trying
/// `Shift_JIS` first would risk mojibake on UTF-8 input.
///
/// BOM stripping, CRLF folding and NFC normalisation are not applied here;
/// they belong to the parser's Phase-0 pass.
///
/// # Errors
///
/// Returns [`DecodeError::ShiftJisInvalid`] when the bytes are neither
/// valid UTF-8 nor valid `Shift_JIS`.
pub fn decode_auto(input: &[u8]) -> Result<Cow<'_, str>, DecodeError> {
    match from_utf8(input) {
        Ok(utf8) => Ok(Cow::Borrowed(utf8)),
        Err(_) => decode_sjis(input).map(Cow::Owned),
    }
}

/// Buffer-reusing variant of [`decode_auto`]: detect the source encoding
/// and append the resulting UTF-8 to `dst`.
///
/// The detection rule is the same as in [`decode_auto`] — valid UTF-8 is
/// copied through as-is, everything else is decoded as `Shift_JIS` via
/// [`decode_sjis_into`]. Output goes into a caller-owned buffer so corpus
/// loaders can keep a single allocation alive across many documents. `dst`
/// is appended to, **not** cleared; see [`decode_sjis_into`].
///
/// # Errors
///
/// Returns [`DecodeError::ShiftJisInvalid`] when the bytes are neither
/// valid UTF-8 nor valid `Shift_JIS`.
pub fn decode_auto_into(input: &[u8], dst: &mut String) -> Result<(), DecodeError> {
    match from_utf8(input) {
        Ok(utf8) => {
            dst.push_str(utf8);
            Ok(())
        }
        Err(_) => decode_sjis_into(input, dst),
    }
}

/// Report whether `input` begins with the UTF-8 byte-order mark
/// (`EF BB BF`).
///
/// Inputs shorter than three bytes never match. The CLI uses this to strip
/// the BOM before handing input to the parser. [`decode_auto`] does not
/// remove the BOM itself: the mark is valid UTF-8, so it passes through
/// `decode_auto` unchanged and is dropped later by the parser's Phase-0
/// sanitiser.
#[must_use]
pub const fn has_utf8_bom(input: &[u8]) -> bool {
    match input {
        [first, second, third, ..] => *first == 0xEF && *second == 0xBB && *third == 0xBF,
        _ => false,
    }
}

pub mod gaiji;
/// PHF tables (single, combo, description) emitted by `build.rs`
/// at compile time via `phf_codegen`. Lives in `OUT_DIR` so it's
/// regenerated automatically when any input TSV changes; the
/// committed source tree carries only the data, not the perfect-
/// hash output. See `build.rs` for the generator.
///
/// The module is private and its body is nothing but the generated
/// file, pulled in textually with `include!`.
#[allow(
    clippy::unreadable_literal,
    reason = "phf_codegen emits 64-bit perfect-hash keys without separators; \
              we cannot reformat them without forking the codegen crate"
)]
mod jisx0213_table {
    // Textual inclusion of the generated table source from the build
    // script's output directory.
    include!(concat!(env!("OUT_DIR"), "/jisx0213_table.rs"));
}

/// Unit tests for the decode entry points, BOM detection and the primitive
/// gaiji lookup.
///
/// Expected strings that contain half-width or full-width compatibility
/// characters are spelled with `\u{…}` escapes so that editors or tooling
/// that apply Unicode compatibility normalisation cannot silently change
/// what the test asserts.
#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------
    // SJIS happy-path decoding
    // ------------------------------------------------------------------

    #[test]
    fn decodes_plain_ascii_sjis() {
        assert_eq!(decode_sjis(b"hello").unwrap(), "hello");
    }

    #[test]
    fn decodes_japanese_sjis() {
        // 「青空文庫」 in Shift_JIS.
        let bytes = &[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9];
        assert_eq!(decode_sjis(bytes).unwrap(), "青空文庫");
    }

    #[test]
    fn decodes_empty_input_to_empty_string() {
        assert_eq!(decode_sjis(b"").unwrap(), "");
    }

    #[test]
    fn decodes_ascii_control_characters_verbatim() {
        // LF / CR / tab are 1:1 identity in SJIS since the lead byte
        // range avoids ASCII. Exercising these locks in that the pipeline
        // doesn't mangle them before the sanitize pass.
        assert_eq!(decode_sjis(b"a\nb\rc\td").unwrap(), "a\nb\rc\td");
    }

    #[test]
    fn decodes_halfwidth_katakana() {
        // Half-width katakana (0xA1..=0xDF) is a single byte each in SJIS
        // and decodes to the half-width block U+FF61..=U+FF9F — *not* to
        // the full-width katakana. Bytes 0xB1..=0xB5 are the half-width
        // forms of a, i, u, e, o (U+FF71..=U+FF75).
        let bytes = &[0xB1, 0xB2, 0xB3, 0xB4, 0xB5];
        assert_eq!(
            decode_sjis(bytes).unwrap(),
            "\u{FF71}\u{FF72}\u{FF73}\u{FF74}\u{FF75}"
        );
    }

    #[test]
    fn decodes_mixed_ascii_and_kanji() {
        // Common shape in Aozora corpora: explanatory text in ASCII
        // mixed with Japanese quotations.
        let mut bytes = Vec::from(*b"about ");
        bytes.extend_from_slice(&[0x93, 0xFA, 0x96, 0x7B]); // 日本
        bytes.extend_from_slice(b" !");
        assert_eq!(decode_sjis(&bytes).unwrap(), "about 日本 !");
    }

    #[test]
    fn decodes_hiragana_sjis() {
        // 「こんにちは」 — lead bytes in the 0x82 range.
        let bytes = &[
            0x82, 0xB1, // こ
            0x82, 0xF1, // ん
            0x82, 0xC9, // に
            0x82, 0xBF, // ち
            0x82, 0xCD, // は
        ];
        assert_eq!(decode_sjis(bytes).unwrap(), "こんにちは");
    }

    #[test]
    fn decodes_fullwidth_digits() {
        // Full-width digits 0, 1, 2 (U+FF10..=U+FF12) — full-width digits
        // are common in Aozora ruby delimiters. They must decode to the
        // full-width code points, not to ASCII digits.
        let bytes = &[0x82, 0x4F, 0x82, 0x50, 0x82, 0x51];
        assert_eq!(decode_sjis(bytes).unwrap(), "\u{FF10}\u{FF11}\u{FF12}");
    }

    // ------------------------------------------------------------------
    // decode_auto — encoding-agnostic entry point
    // ------------------------------------------------------------------

    #[test]
    fn decode_auto_passes_utf8_through_borrowed() {
        let bytes = "青空文庫".as_bytes();
        let out = decode_auto(bytes).unwrap();
        assert!(matches!(out, Cow::Borrowed(_)), "UTF-8 must be zero-copy");
        assert_eq!(out, "青空文庫");
    }

    #[test]
    fn decode_auto_falls_back_to_sjis_owned() {
        // 「青空文庫」 in Shift_JIS — not valid UTF-8, so it decodes.
        let bytes = &[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9];
        let out = decode_auto(bytes).unwrap();
        assert!(
            matches!(out, Cow::Owned(_)),
            "SJIS must be decoded to owned"
        );
        assert_eq!(out, "青空文庫");
    }

    #[test]
    fn decode_auto_borrows_ascii() {
        // ASCII is valid in both encodings; UTF-8-first means borrowed.
        let out = decode_auto(b"hello").unwrap();
        assert!(matches!(out, Cow::Borrowed(_)));
        assert_eq!(out, "hello");
    }

    #[test]
    fn decode_auto_prefers_utf8_over_ambiguous_sjis() {
        // UTF-8 「日本語」 = E6 97 A5 E6 9C AC E8 AA 9E. The leading byte
        // 0xE6 is *also* a valid Shift_JIS lead byte, so sniffing SJIS
        // first could misread input like this; UTF-8-first returns the
        // correct text, borrowed.
        let bytes = "日本語".as_bytes();
        let out = decode_auto(bytes).unwrap();
        assert!(matches!(out, Cow::Borrowed(_)));
        assert_eq!(out, "日本語");
    }

    #[test]
    fn decode_auto_errors_when_neither_encoding_fits() {
        // 0xFF is neither a valid UTF-8 byte nor an assigned Shift_JIS byte.
        assert!(matches!(
            decode_auto(&[0xFF, 0xFF]),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn decode_auto_empty_is_borrowed_empty() {
        let out = decode_auto(b"").unwrap();
        assert!(matches!(out, Cow::Borrowed(_)));
        assert_eq!(out, "");
    }

    #[test]
    fn decode_auto_into_appends_both_encodings() {
        let mut buf = String::new();
        decode_auto_into("青空".as_bytes(), &mut buf).unwrap(); // UTF-8
        decode_auto_into(&[0x95, 0xB6, 0x8C, 0xC9], &mut buf).unwrap(); // 文庫 in SJIS
        assert_eq!(buf, "青空文庫");
    }

    // ------------------------------------------------------------------
    // decode_sjis_into — buffer-reuse path equivalence
    // ------------------------------------------------------------------
    //
    // The `into_equivalent_*` tests below verify the contract that
    // `decode_sjis(b) == decode_sjis_into(b, &mut buf)` byte-for-byte
    // (and for the strict-error case, that both return `Err`).
    // `decode_sjis_into` is the buffer-reuse entry point used by the
    // bench `parallel_size_bands` thread-local pool; the production
    // `decode_sjis` is a thin wrapper that calls `decode_sjis_into`
    // with a fresh `String`.

    /// Assert that `decode_sjis` and `decode_sjis_into` (with an empty
    /// buffer) agree on `input`: same text on success, both failing on
    /// error.
    fn check_equivalent(input: &[u8]) {
        let owned = decode_sjis(input);
        let mut buf = String::new();
        let into_result = decode_sjis_into(input, &mut buf);
        match (owned, into_result) {
            (Ok(s), Ok(())) => assert_eq!(s, buf, "decode_sjis output != decode_sjis_into output"),
            (Err(_), Err(_)) => {} // both fail — identical strict error contract
            (Ok(s), Err(e)) => panic!("owned succeeded ({s:?}) but _into failed ({e:?})"),
            (Err(e), Ok(())) => panic!("owned failed ({e:?}) but _into succeeded ({buf:?})"),
        }
    }

    #[test]
    fn into_equivalent_on_ascii() {
        check_equivalent(b"hello world");
    }

    #[test]
    fn into_equivalent_on_japanese() {
        check_equivalent(&[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9]);
    }

    #[test]
    fn into_equivalent_on_empty() {
        check_equivalent(b"");
    }

    #[test]
    fn into_equivalent_on_halfwidth_katakana() {
        check_equivalent(&[0xB1, 0xB2, 0xB3, 0xB4, 0xB5]);
    }

    #[test]
    fn into_equivalent_on_invalid_lead_byte() {
        check_equivalent(&[0xFF, 0xFF]);
    }

    #[test]
    fn into_equivalent_on_lone_lead_byte() {
        check_equivalent(&[b'o', b'k', 0x82]);
    }

    #[test]
    fn into_reuses_buffer_capacity_across_calls() {
        // The buffer-reuse contract: a `dst` String that already has
        // enough capacity should not allocate again on the second
        // decode. We verify this by asserting capacity is preserved
        // across `clear() + decode_sjis_into` cycles. (Pinning the
        // exact byte count would couple the test to allocator /
        // encoding_rs internals; the load-bearing invariant is "no
        // shrink".)
        let mut buf = String::with_capacity(4096);
        let cap_before = buf.capacity();
        decode_sjis_into(b"hello", &mut buf).unwrap();
        let cap_after_first = buf.capacity();
        assert!(
            cap_after_first >= cap_before,
            "capacity must not shrink on small decode"
        );
        buf.clear();
        decode_sjis_into(b"world", &mut buf).unwrap();
        assert!(
            buf.capacity() >= cap_after_first,
            "capacity must not shrink on a buffer-reuse cycle"
        );
    }

    #[test]
    fn into_appends_when_dst_not_cleared() {
        // Documented contract: callers must `clear()` before each
        // decode if they want a fresh result. This test pins that
        // shape so future "convenience clear inside the function"
        // changes break loudly.
        let mut buf = String::from("PRE:");
        decode_sjis_into(b"hi", &mut buf).unwrap();
        assert_eq!(buf, "PRE:hi");
    }

    // ------------------------------------------------------------------
    // SJIS error surfaces
    // ------------------------------------------------------------------

    #[test]
    fn rejects_invalid_lead_byte() {
        let bytes = &[0xFF, 0xFF];
        assert!(matches!(
            decode_sjis(bytes),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn rejects_lone_lead_byte_at_end_of_input() {
        // 0x82 alone is a truncated two-byte sequence (expects trail).
        let bytes = &[b'o', b'k', 0x82];
        assert!(matches!(
            decode_sjis(bytes),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn rejects_invalid_trail_byte() {
        // Lead 0x82 with an invalid trail 0x00 (trails must be 0x40..=0xFC, != 0x7F).
        let bytes = &[0x82, 0x00];
        assert!(matches!(
            decode_sjis(bytes),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn error_message_is_japanese_and_carries_miette_code() {
        // The project-wide rule is that user-facing errors are in
        // Japanese. Pin that and the miette diagnostic code both.
        let err = decode_sjis(&[0xFF, 0xFF]).unwrap_err();

        let message = err.to_string();
        assert!(
            message.contains("Shift_JIS"),
            "error message must contain Shift_JIS for locatability, got {message:?}",
        );
        assert!(
            message.contains("変換に失敗しました"),
            "error message must be in Japanese, got {message:?}",
        );

        // `Diagnostic::code` is what miette renders as the error code.
        let code = err.code().map(|c| c.to_string());
        assert_eq!(
            code.as_deref(),
            Some("aozora::encoding::sjis_invalid"),
            "miette diagnostic code must stay stable",
        );
    }

    // ------------------------------------------------------------------
    // UTF-8 BOM detection
    // ------------------------------------------------------------------

    #[test]
    fn detects_utf8_bom() {
        assert!(has_utf8_bom(b"\xEF\xBB\xBFtext"));
    }

    #[test]
    fn no_utf8_bom_on_plain_input() {
        assert!(!has_utf8_bom(b"text"));
    }

    #[test]
    fn no_utf8_bom_on_shorter_than_bom() {
        assert!(!has_utf8_bom(b"\xEF\xBB"));
    }

    #[test]
    fn no_utf8_bom_on_empty_input() {
        assert!(!has_utf8_bom(b""));
    }

    #[test]
    fn detects_utf8_bom_on_exactly_three_bytes() {
        // Boundary: the slice is exactly `EF BB BF` with no trailing
        // content. A slice pattern with a `..` rest accepts empty tails.
        assert!(has_utf8_bom(&[0xEF, 0xBB, 0xBF]));
    }

    #[test]
    fn bom_detection_rejects_near_misses() {
        // Off-by-one patterns that are NOT the UTF-8 BOM.
        assert!(!has_utf8_bom(&[0xEF, 0xBB, 0xBE])); // last byte wrong
        assert!(!has_utf8_bom(&[0xEE, 0xBB, 0xBF])); // first byte wrong
        assert!(!has_utf8_bom(&[0xEF, 0xBC, 0xBF])); // middle byte wrong
        assert!(!has_utf8_bom(&[0xFE, 0xFF])); // UTF-16 BE BOM — not ours
        assert!(!has_utf8_bom(&[0xFF, 0xFE])); // UTF-16 LE BOM — not ours
    }

    // ------------------------------------------------------------------
    // Gaiji resolution (via primitive `gaiji::lookup`)
    // ------------------------------------------------------------------

    #[test]
    fn gaiji_lookup_echoes_existing_ucs_when_set() {
        // When a UCS code point is already supplied, the lookup is expected
        // to hand it back unchanged.
        // NOTE(review): U+6798 is assumed to be the character described by
        // 第3水準1-85-54 — confirm against the gaiji table.
        let ucs = '\u{6798}';
        assert_eq!(
            gaiji::lookup(Some(ucs), Some("第3水準1-85-54"), "木+吶のつくり"),
            Some(gaiji::Resolved::Char(ucs))
        );
    }

    #[test]
    fn gaiji_lookup_returns_none_when_unresolvable() {
        assert_eq!(gaiji::lookup(None, None, "第3水準1-85-54"), None);
    }
}