aozora_encoding/lib.rs
//! Encoding utilities for Aozora Bunko source material.
//!
//! The `aozora` parser itself is strictly UTF-8. Anything that decodes `Shift_JIS` or
//! resolves gaiji (外字) mappings lives here, so the parser stays free of encoding
//! concerns and the same logic is available to the CLI, editor integrations, and
//! downstream tools.
7
8#![forbid(unsafe_code)]
9
10use std::borrow::Cow;
11use std::str::from_utf8;
12
13use encoding_rs::{DecoderResult, SHIFT_JIS};
14use miette::Diagnostic;
15use thiserror::Error;
16
17/// Errors surfaced by the decode pipeline.
18#[derive(Debug, Error, Diagnostic)]
19#[non_exhaustive]
20pub enum DecodeError {
21 #[error("Shift_JIS からの変換に失敗しました (不正なバイト列)")]
22 #[diagnostic(code(aozora::encoding::sjis_invalid))]
23 ShiftJisInvalid,
24}
25
26/// Decode a `Shift_JIS` byte slice into UTF-8 (NFC normalisation is applied by the
27/// caller after decoding).
28///
29/// # Errors
30///
31/// Returns [`DecodeError::ShiftJisInvalid`] if `encoding_rs` reports a malformed byte
32/// sequence. Lossy replacement is deliberately not offered — callers need to know
33/// when they're looking at corrupted source material rather than silently absorbing
34/// the damage.
35///
36/// Allocates a fresh `String` per call. For workloads that decode many
37/// documents in succession, prefer [`decode_sjis_into`] with a reusable
38/// buffer to avoid the per-call allocation.
39pub fn decode_sjis(input: &[u8]) -> Result<String, DecodeError> {
40 let mut out = String::new();
41 decode_sjis_into(input, &mut out)?;
42 Ok(out)
43}
44
45/// Decode a `Shift_JIS` byte slice into the caller-owned `dst` buffer.
46///
47/// Pre-sizes `dst` exactly via
48/// `encoding_rs::Decoder::max_utf8_buffer_length_without_replacement`
49/// so the decode inner loop does no growth-realloc. The buffer is
50/// **not** cleared first — callers that want a fresh decode should
51/// `dst.clear()` before calling. This is intentional so the same
52/// buffer can be reused across many decodes in a thread-local /
53/// per-worker pool without paying the allocator per iteration.
54///
55/// Strict — same error contract as [`decode_sjis`]. Bypasses
56/// `encoding_rs`'s public `decode` shape, which always allocates a
57/// worst-case-sized `String` internally and `Cow::into_owned`s the
58/// result; this entry point goes straight through the
59/// `Decoder::decode_to_string_without_replacement` API the bench
60/// pipeline needs.
61///
62/// # Errors
63///
64/// Returns [`DecodeError::ShiftJisInvalid`] on malformed input or if
65/// the encoder reports overflow (which `max_utf8_buffer_length_…`
66/// should make unreachable, but is still surfaced rather than
67/// silently truncated).
68pub fn decode_sjis_into(input: &[u8], dst: &mut String) -> Result<(), DecodeError> {
69 let mut decoder = SHIFT_JIS.new_decoder_without_bom_handling();
70 let needed = decoder
71 .max_utf8_buffer_length_without_replacement(input.len())
72 .ok_or(DecodeError::ShiftJisInvalid)?;
73 dst.reserve(needed);
74 let (result, _read) = decoder.decode_to_string_without_replacement(input, dst, true);
75 match result {
76 DecoderResult::InputEmpty => Ok(()),
77 DecoderResult::Malformed(_, _) | DecoderResult::OutputFull => {
78 Err(DecodeError::ShiftJisInvalid)
79 }
80 }
81}
82
83/// Decode Aozora source bytes to UTF-8, detecting the encoding.
84///
85/// Aozora material reaches this crate in two shapes: the canonical
86/// `Shift_JIS` archive files, and already-decoded UTF-8 mirrors (e.g. a
87/// corpus that has been pre-converted). Forcing every caller to commit
88/// to one encoding up front is the wrong default — it is why corpus
89/// tooling historically hard-coded [`decode_sjis`] and silently broke on
90/// UTF-8 input. This entry point removes that concern: hand it bytes,
91/// get back UTF-8.
92///
93/// - Valid UTF-8 is returned **borrowed**, zero-copy.
94/// - Otherwise the bytes are decoded as `Shift_JIS` (owned).
95///
96/// UTF-8 is tried first on purpose. Valid UTF-8 is a near-unambiguous
97/// signal — `Shift_JIS` Japanese text essentially never forms a wholly
98/// valid UTF-8 sequence — whereas the converse does not hold: a UTF-8
99/// document can contain byte runs that decode as *some* `Shift_JIS`
100/// without erroring, so sniffing `Shift_JIS` first risks mojibake on
101/// UTF-8 input.
102///
103/// BOM stripping, CRLF folding and NFC normalisation are the parser's
104/// Phase-0 responsibility and are deliberately not applied here.
105///
106/// # Errors
107///
108/// Returns [`DecodeError::ShiftJisInvalid`] when the bytes are neither
109/// valid UTF-8 nor valid `Shift_JIS`.
110pub fn decode_auto(input: &[u8]) -> Result<Cow<'_, str>, DecodeError> {
111 if let Ok(text) = from_utf8(input) {
112 return Ok(Cow::Borrowed(text));
113 }
114 decode_sjis(input).map(Cow::Owned)
115}
116
117/// Encoding-agnostic counterpart to [`decode_sjis_into`]: append the
118/// decoded UTF-8 to `dst`, detecting the source encoding.
119///
120/// Same sniffing rule as [`decode_auto`] (valid UTF-8 wins, else
121/// `Shift_JIS`), but writes into a caller-owned buffer so corpus
122/// loaders can reuse one allocation across many documents. The buffer
123/// is **not** cleared first — see [`decode_sjis_into`].
124///
125/// # Errors
126///
127/// Returns [`DecodeError::ShiftJisInvalid`] when the bytes are neither
128/// valid UTF-8 nor valid `Shift_JIS`.
129pub fn decode_auto_into(input: &[u8], dst: &mut String) -> Result<(), DecodeError> {
130 if let Ok(text) = from_utf8(input) {
131 dst.push_str(text);
132 return Ok(());
133 }
134 decode_sjis_into(input, dst)
135}
136
/// Report whether `input` begins with the UTF-8 byte-order mark
/// (`EF BB BF`).
///
/// The CLI uses this to strip the BOM before passing input to the parser.
/// The BOM is the one signal that even [`decode_auto`] leaves to the
/// caller: being valid UTF-8, it passes through `decode_auto` unchanged
/// and is removed by the parser's Phase-0 sanitiser.
#[must_use]
pub const fn has_utf8_bom(input: &[u8]) -> bool {
    // Fewer than three bytes cannot hold a BOM.
    let [first, second, third, ..] = input else {
        return false;
    };
    *first == 0xEF && *second == 0xBB && *third == 0xBF
}
147
148pub mod gaiji;
149/// PHF tables (single, combo, description) emitted by `build.rs`
150/// at compile time via `phf_codegen`. Lives in `OUT_DIR` so it's
151/// regenerated automatically when any input TSV changes; the
152/// committed source tree carries only the data, not the perfect-
153/// hash output. See `build.rs` for the generator.
154#[allow(
155 clippy::unreadable_literal,
156 reason = "phf_codegen emits 64-bit perfect-hash keys without separators; \
157 we cannot reformat them without forking the codegen crate"
158)]
159mod jisx0213_table {
160 include!(concat!(env!("OUT_DIR"), "/jisx0213_table.rs"));
161}
162
/// Unit tests for the decode entry points, BOM detection and the
/// primitive gaiji lookup.
///
/// Expected strings that contain compatibility characters (halfwidth
/// katakana, fullwidth digits) are spelled as `\u{…}` escapes so the
/// assertions cannot be altered by Unicode normalisation of this file.
#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------
    // SJIS happy-path decoding
    // ------------------------------------------------------------------

    #[test]
    fn decodes_plain_ascii_sjis() {
        assert_eq!(decode_sjis(b"hello").unwrap(), "hello");
    }

    #[test]
    fn decodes_japanese_sjis() {
        // 「青空文庫」 in Shift_JIS.
        let bytes = &[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9];
        assert_eq!(decode_sjis(bytes).unwrap(), "青空文庫");
    }

    #[test]
    fn decodes_empty_input_to_empty_string() {
        assert_eq!(decode_sjis(b"").unwrap(), "");
    }

    #[test]
    fn decodes_ascii_control_characters_verbatim() {
        // LF / CR / tab are 1:1 identity in SJIS since the lead byte
        // range avoids ASCII. Exercising these locks in that the pipeline
        // doesn't mangle them before the sanitize pass.
        assert_eq!(decode_sjis(b"a\nb\rc\td").unwrap(), "a\nb\rc\td");
    }

    #[test]
    fn decodes_halfwidth_katakana() {
        // Halfwidth katakana (0xA1..=0xDF) is a single byte each in SJIS.
        // Bytes 0xB1..=0xB5 are halfwidth A, I, U, E, O, i.e.
        // U+FF71..=U+FF75 — not the fullwidth U+30A2.. forms.
        let bytes = &[0xB1, 0xB2, 0xB3, 0xB4, 0xB5];
        assert_eq!(
            decode_sjis(bytes).unwrap(),
            "\u{FF71}\u{FF72}\u{FF73}\u{FF74}\u{FF75}"
        );
    }

    #[test]
    fn decodes_mixed_ascii_and_kanji() {
        // Common shape in Aozora corpora: explanatory text in ASCII
        // mixed with Japanese quotations.
        let mut bytes = Vec::from(*b"about ");
        bytes.extend_from_slice(&[0x93, 0xFA, 0x96, 0x7B]); // 日本
        bytes.extend_from_slice(b" !");
        assert_eq!(decode_sjis(&bytes).unwrap(), "about 日本 !");
    }

    #[test]
    fn decodes_hiragana_sjis() {
        // 「こんにちは」 — lead bytes in the 0x82 range.
        let bytes = &[
            0x82, 0xB1, // こ
            0x82, 0xF1, // ん
            0x82, 0xC9, // に
            0x82, 0xBF, // ち
            0x82, 0xCD, // は
        ];
        assert_eq!(decode_sjis(bytes).unwrap(), "こんにちは");
    }

    #[test]
    fn decodes_fullwidth_digits() {
        // Fullwidth digits zero, one, two (U+FF10..=U+FF12) — fullwidth
        // digits are common in Aozora ruby delimiters. They must stay
        // fullwidth; decoding does not fold them to ASCII.
        let bytes = &[0x82, 0x4F, 0x82, 0x50, 0x82, 0x51];
        assert_eq!(decode_sjis(bytes).unwrap(), "\u{FF10}\u{FF11}\u{FF12}");
    }

    // ------------------------------------------------------------------
    // decode_auto — encoding-agnostic entry point
    // ------------------------------------------------------------------

    #[test]
    fn decode_auto_passes_utf8_through_borrowed() {
        let bytes = "青空文庫".as_bytes();
        let out = decode_auto(bytes).unwrap();
        assert!(matches!(out, Cow::Borrowed(_)), "UTF-8 must be zero-copy");
        assert_eq!(out, "青空文庫");
    }

    #[test]
    fn decode_auto_falls_back_to_sjis_owned() {
        // 「青空文庫」 in Shift_JIS — not valid UTF-8, so it decodes.
        let bytes = &[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9];
        let out = decode_auto(bytes).unwrap();
        assert!(
            matches!(out, Cow::Owned(_)),
            "SJIS must be decoded to owned"
        );
        assert_eq!(out, "青空文庫");
    }

    #[test]
    fn decode_auto_borrows_ascii() {
        // ASCII is valid in both encodings; UTF-8-first means borrowed.
        let out = decode_auto(b"hello").unwrap();
        assert!(matches!(out, Cow::Borrowed(_)));
        assert_eq!(out, "hello");
    }

    #[test]
    fn decode_auto_prefers_utf8_over_ambiguous_sjis() {
        // UTF-8 「日本語」 = E6 97 A5 E6 9C AC E8 AA 9E, whose leading
        // bytes are *also* valid Shift_JIS lead bytes. Sniffing SJIS first
        // would mojibake this; UTF-8-first returns the correct text,
        // borrowed.
        let bytes = "日本語".as_bytes();
        let out = decode_auto(bytes).unwrap();
        assert!(matches!(out, Cow::Borrowed(_)));
        assert_eq!(out, "日本語");
    }

    #[test]
    fn decode_auto_errors_when_neither_encoding_fits() {
        // 0xFF is neither a valid UTF-8 byte nor an assigned Shift_JIS byte.
        assert!(matches!(
            decode_auto(&[0xFF, 0xFF]),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn decode_auto_empty_is_borrowed_empty() {
        let out = decode_auto(b"").unwrap();
        assert!(matches!(out, Cow::Borrowed(_)));
        assert_eq!(out, "");
    }

    #[test]
    fn decode_auto_into_appends_both_encodings() {
        let mut buf = String::new();
        decode_auto_into("青空".as_bytes(), &mut buf).unwrap(); // UTF-8
        decode_auto_into(&[0x95, 0xB6, 0x8C, 0xC9], &mut buf).unwrap(); // 文庫 in SJIS
        assert_eq!(buf, "青空文庫");
    }

    // ------------------------------------------------------------------
    // decode_sjis_into — buffer-reuse path equivalence
    // ------------------------------------------------------------------
    //
    // The `into_equivalent_*` tests verify the contract that
    // `decode_sjis(b) == decode_sjis_into(b, &mut buf)` byte-for-byte
    // (and for the strict-error case, that both return `Err`).
    // `decode_sjis_into` is the buffer-reuse entry point used by the
    // bench `parallel_size_bands` thread-local pool; the production
    // `decode_sjis` is a thin wrapper that calls `decode_sjis_into`
    // with a fresh `String`.

    /// Assert that `decode_sjis` and `decode_sjis_into` (starting from an
    /// empty buffer) agree on `input`: same text on success, or both fail.
    fn check_equivalent(input: &[u8]) {
        let owned = decode_sjis(input);
        let mut buf = String::new();
        let into_result = decode_sjis_into(input, &mut buf);
        match (owned, into_result) {
            (Ok(s), Ok(())) => assert_eq!(s, buf, "decode_sjis output != decode_sjis_into output"),
            (Err(_), Err(_)) => {} // both fail — identical strict error contract
            (Ok(s), Err(e)) => panic!("owned succeeded ({s:?}) but _into failed ({e:?})"),
            (Err(e), Ok(())) => panic!("owned failed ({e:?}) but _into succeeded ({buf:?})"),
        }
    }

    #[test]
    fn into_equivalent_on_ascii() {
        check_equivalent(b"hello world");
    }

    #[test]
    fn into_equivalent_on_japanese() {
        check_equivalent(&[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9]);
    }

    #[test]
    fn into_equivalent_on_empty() {
        check_equivalent(b"");
    }

    #[test]
    fn into_equivalent_on_halfwidth_katakana() {
        check_equivalent(&[0xB1, 0xB2, 0xB3, 0xB4, 0xB5]);
    }

    #[test]
    fn into_equivalent_on_invalid_lead_byte() {
        check_equivalent(&[0xFF, 0xFF]);
    }

    #[test]
    fn into_equivalent_on_lone_lead_byte() {
        check_equivalent(&[b'o', b'k', 0x82]);
    }

    #[test]
    fn into_reuses_buffer_capacity_across_calls() {
        // The buffer-reuse contract: a `dst` String that already has
        // enough capacity should not allocate again on the second
        // decode. We verify this by asserting capacity is preserved
        // across `clear() + decode_sjis_into` cycles. (Pinning the
        // exact byte count would couple the test to allocator /
        // encoding_rs internals; the load-bearing invariant is "no
        // shrink".)
        let mut buf = String::with_capacity(4096);
        let cap_before = buf.capacity();
        decode_sjis_into(b"hello", &mut buf).unwrap();
        let cap_after_first = buf.capacity();
        assert!(
            cap_after_first >= cap_before,
            "capacity must not shrink on small decode"
        );
        buf.clear();
        decode_sjis_into(b"world", &mut buf).unwrap();
        assert!(
            buf.capacity() >= cap_after_first,
            "capacity must not shrink on a buffer-reuse cycle"
        );
    }

    #[test]
    fn into_appends_when_dst_not_cleared() {
        // Documented contract: callers must `clear()` before each
        // decode if they want a fresh result. This test pins that
        // shape so future "convenience clear inside the function"
        // changes break loudly.
        let mut buf = String::from("PRE:");
        decode_sjis_into(b"hi", &mut buf).unwrap();
        assert_eq!(buf, "PRE:hi");
    }

    // ------------------------------------------------------------------
    // SJIS error surfaces
    // ------------------------------------------------------------------

    #[test]
    fn rejects_invalid_lead_byte() {
        let bytes = &[0xFF, 0xFF];
        assert!(matches!(
            decode_sjis(bytes),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn rejects_lone_lead_byte_at_end_of_input() {
        // 0x82 alone is a truncated two-byte sequence (expects trail).
        let bytes = &[b'o', b'k', 0x82];
        assert!(matches!(
            decode_sjis(bytes),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn rejects_invalid_trail_byte() {
        // Lead 0x82 with an invalid trail 0x00 (trails must be 0x40..=0xFC, != 0x7F).
        let bytes = &[0x82, 0x00];
        assert!(matches!(
            decode_sjis(bytes),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn error_message_is_japanese_and_carries_miette_code() {
        // The project-wide rule is that user-facing errors are in
        // Japanese. Pin that and the miette diagnostic code both.
        let err = decode_sjis(&[0xFF, 0xFF]).unwrap_err();
        let message = err.to_string();
        assert!(
            message.contains("Shift_JIS"),
            "error message must contain Shift_JIS for locatability, got {message:?}",
        );
        assert!(
            message.contains("変換に失敗しました"),
            "error message must be in Japanese, got {message:?}",
        );
        // Fully-qualified trait call so the check does not depend on which
        // traits the glob import happens to bring into scope.
        let code = miette::Diagnostic::code(&err).map(|code| code.to_string());
        assert_eq!(code.as_deref(), Some("aozora::encoding::sjis_invalid"));
    }

    // ------------------------------------------------------------------
    // UTF-8 BOM detection
    // ------------------------------------------------------------------

    #[test]
    fn detects_utf8_bom() {
        assert!(has_utf8_bom(b"\xEF\xBB\xBFtext"));
    }

    #[test]
    fn no_utf8_bom_on_plain_input() {
        assert!(!has_utf8_bom(b"text"));
    }

    #[test]
    fn no_utf8_bom_on_shorter_than_bom() {
        assert!(!has_utf8_bom(b"\xEF\xBB"));
    }

    #[test]
    fn no_utf8_bom_on_empty_input() {
        assert!(!has_utf8_bom(b""));
    }

    #[test]
    fn detects_utf8_bom_on_exactly_three_bytes() {
        // Boundary: the slice is exactly `EF BB BF` with no trailing
        // content; an empty tail after the BOM must still be accepted.
        assert!(has_utf8_bom(&[0xEF, 0xBB, 0xBF]));
    }

    #[test]
    fn bom_detection_rejects_near_misses() {
        // Off-by-one patterns that are NOT the UTF-8 BOM.
        assert!(!has_utf8_bom(&[0xEF, 0xBB, 0xBE])); // last byte wrong
        assert!(!has_utf8_bom(&[0xEE, 0xBB, 0xBF])); // first byte wrong
        assert!(!has_utf8_bom(&[0xEF, 0xBC, 0xBF])); // middle byte wrong
        assert!(!has_utf8_bom(&[0xFE, 0xFF])); // UTF-16 BE BOM — not ours
        assert!(!has_utf8_bom(&[0xFF, 0xFE])); // UTF-16 LE BOM — not ours
    }

    // ------------------------------------------------------------------
    // Gaiji resolution (via primitive `gaiji::lookup`)
    // ------------------------------------------------------------------

    #[test]
    fn gaiji_lookup_echoes_existing_ucs_when_set() {
        assert_eq!(
            gaiji::lookup(Some('吶'), Some("第3水準1-85-54"), "木+吶のつくり"),
            Some(gaiji::Resolved::Char('吶'))
        );
    }

    #[test]
    fn gaiji_lookup_returns_none_when_unresolvable() {
        assert_eq!(gaiji::lookup(None, None, "第3水準1-85-54"), None);
    }
}