use std::borrow::Cow;
/// Every ASCII accent pattern known to this module, paired with the single
/// character it stands for.
///
/// Each pattern is pure ASCII and comes in one of two shapes:
///
/// * two letters followed by `&` (three bytes) for the ligatures `æ Æ œ Œ`;
/// * one base letter followed by one marker byte (two bytes) for everything
///   else, e.g. ``e` `` for `è` or `n~` for `ñ`.
///
/// The decoder itself does not scan this slice: `match_ligature` and
/// `ACCENT_DIGRAPHS` hold the same pairs in lookup-friendly form and have to be
/// kept in sync with it by hand. Entry order is part of the public value, so
/// new entries should not reshuffle existing ones.
#[rustfmt::skip]
pub const ACCENT_TABLE: &[(&str, char)] = &[
    // Three-byte ligatures.
    ("ae&", 'æ'), ("AE&", 'Æ'), ("oe&", 'œ'), ("OE&", 'Œ'),

    // Two-byte patterns, lowercase bases (one base letter per line).
    ("s&", 'ß'),
    ("a`", 'à'), ("a'", 'á'), ("a^", 'â'), ("a~", 'ã'), ("a:", 'ä'), ("a&", 'å'), ("a_", 'ā'),
    ("c,", 'ç'), ("c'", 'ć'), ("c^", 'ĉ'),
    ("d/", 'đ'),
    ("e`", 'è'), ("e'", 'é'), ("e^", 'ê'), ("e:", 'ë'), ("e_", 'ē'), ("e~", 'ẽ'),
    ("g^", 'ĝ'),
    ("h^", 'ĥ'), ("h/", 'ħ'),
    ("i`", 'ì'), ("i'", 'í'), ("i^", 'î'), ("i:", 'ï'), ("i_", 'ī'), ("i/", 'ɨ'), ("i~", 'ĩ'),
    ("j^", 'ĵ'),
    ("l/", 'ł'), ("l'", 'ĺ'),
    ("m'", 'ḿ'),
    ("n`", 'ǹ'), ("n~", 'ñ'), ("n'", 'ń'),
    ("o`", 'ò'), ("o'", 'ó'), ("o^", 'ô'), ("o~", 'õ'), ("o:", 'ö'), ("o/", 'ø'), ("o_", 'ō'),
    ("r'", 'ŕ'),
    ("s'", 'ś'), ("s,", 'ş'), ("s^", 'ŝ'),
    ("t,", 'ţ'),
    ("u`", 'ù'), ("u'", 'ú'), ("u^", 'û'), ("u:", 'ü'), ("u_", 'ū'), ("u&", 'ů'), ("u~", 'ũ'),
    ("y'", 'ý'), ("y:", 'ÿ'),
    ("z'", 'ź'),

    // Two-byte patterns, uppercase bases. Note there is no `H/`, `I/` or `Y:`.
    ("A`", 'À'), ("A'", 'Á'), ("A^", 'Â'), ("A~", 'Ã'), ("A:", 'Ä'), ("A&", 'Å'), ("A_", 'Ā'),
    ("C,", 'Ç'), ("C'", 'Ć'), ("C^", 'Ĉ'),
    ("D/", 'Đ'),
    ("E`", 'È'), ("E'", 'É'), ("E^", 'Ê'), ("E:", 'Ë'), ("E_", 'Ē'), ("E~", 'Ẽ'),
    ("G^", 'Ĝ'),
    ("H^", 'Ĥ'),
    ("I`", 'Ì'), ("I'", 'Í'), ("I^", 'Î'), ("I:", 'Ï'), ("I_", 'Ī'), ("I~", 'Ĩ'),
    ("J^", 'Ĵ'),
    ("L/", 'Ł'), ("L'", 'Ĺ'),
    ("M'", 'Ḿ'),
    ("N`", 'Ǹ'), ("N~", 'Ñ'), ("N'", 'Ń'),
    ("O`", 'Ò'), ("O'", 'Ó'), ("O^", 'Ô'), ("O~", 'Õ'), ("O:", 'Ö'), ("O/", 'Ø'), ("O_", 'Ō'),
    ("R'", 'Ŕ'),
    ("S'", 'Ś'), ("S,", 'Ş'), ("S^", 'Ŝ'),
    ("T,", 'Ţ'),
    ("U`", 'Ù'), ("U'", 'Ú'), ("U^", 'Û'), ("U:", 'Ü'), ("U_", 'Ū'), ("U&", 'Ů'), ("U~", 'Ũ'),
    ("Y'", 'Ý'),
    ("Z'", 'Ź'),
];
/// The ASCII bytes that can terminate an accent pattern.
///
/// Every pattern in `ACCENT_TABLE` ends in one of these, so a fragment that
/// contains none of them cannot contain any pattern at all.
pub const ACCENT_MARKERS: &[u8] = &[
    b'\'', // a' → á
    b'`',  // a` → à
    b'^',  // a^ → â
    b':',  // a: → ä
    b'~',  // a~ → ã
    b'&',  // a& → å, s& → ß, and the ae&/oe& ligatures
    b',',  // c, → ç
    b'/',  // o/ → ø
    b'_',  // a_ → ā
];
const ACCENT_MARKER_MASK: u128 = {
let mut m: u128 = 0;
let bs = ACCENT_MARKERS;
let mut i = 0;
while i < bs.len() {
m |= 1u128 << bs[i];
i += 1;
}
m
};
const _: () = {
let bs = ACCENT_MARKERS;
let mut i = 0;
while i < bs.len() {
assert!(bs[i] < 128, "ACCENT_MARKERS must stay ASCII-only");
i += 1;
}
};
/// Returns `true` when `b` is one of the bytes listed in `ACCENT_MARKERS`.
///
/// The check is a single bit test against `ACCENT_MARKER_MASK`; any byte
/// outside the ASCII range is rejected up front.
#[inline]
#[must_use]
pub const fn is_accent_marker(b: u8) -> bool {
    // The mask only has 128 bits; shifting by a larger amount would overflow.
    if b >= 128 {
        return false;
    }
    (ACCENT_MARKER_MASK & (1u128 << b)) != 0
}
/// Maps a three-byte ligature pattern (`ae&`, `AE&`, `oe&`, `OE&`) to its
/// character, or returns `None` for any other input.
///
/// Matching is case-sensitive. Callers are expected to pass exactly three
/// bytes; this is checked in debug builds only.
#[inline]
fn match_ligature(head: &[u8]) -> Option<char> {
    debug_assert_eq!(head.len(), 3, "match_ligature requires exactly 3 bytes");
    let ligature = match head {
        [b'a', b'e', b'&'] => 'æ',
        [b'A', b'E', b'&'] => 'Æ',
        [b'o', b'e', b'&'] => 'œ',
        [b'O', b'E', b'&'] => 'Œ',
        _ => return None,
    };
    Some(ligature)
}
/// Lookup map for the two-byte accent patterns, generated by `phf::phf_map!`.
///
/// Holds the same pairs as the two-byte entries of `ACCENT_TABLE`; the
/// three-byte ligatures are not stored here and are handled by
/// `match_ligature` instead. Keys are byte slices so that `try_match` can look
/// up a two-byte window of the input directly.
///
/// This list is maintained by hand alongside `ACCENT_TABLE`: an entry added to
/// one must be added to the other.
static ACCENT_DIGRAPHS: phf::Map<&'static [u8], char> = phf::phf_map! {
// Lowercase bases.
b"s&" => 'ß',
b"a`" => 'à', b"a'" => 'á', b"a^" => 'â', b"a~" => 'ã',
b"a:" => 'ä', b"a&" => 'å', b"a_" => 'ā',
b"c," => 'ç', b"c'" => 'ć', b"c^" => 'ĉ',
b"d/" => 'đ',
b"e`" => 'è', b"e'" => 'é', b"e^" => 'ê', b"e:" => 'ë',
b"e_" => 'ē', b"e~" => 'ẽ',
b"g^" => 'ĝ',
b"h^" => 'ĥ', b"h/" => 'ħ',
b"i`" => 'ì', b"i'" => 'í', b"i^" => 'î', b"i:" => 'ï',
b"i_" => 'ī', b"i/" => 'ɨ', b"i~" => 'ĩ',
b"j^" => 'ĵ',
b"l/" => 'ł', b"l'" => 'ĺ',
b"m'" => 'ḿ',
b"n`" => 'ǹ', b"n~" => 'ñ', b"n'" => 'ń',
b"o`" => 'ò', b"o'" => 'ó', b"o^" => 'ô', b"o~" => 'õ',
b"o:" => 'ö', b"o/" => 'ø', b"o_" => 'ō',
b"r'" => 'ŕ',
b"s'" => 'ś', b"s," => 'ş', b"s^" => 'ŝ',
b"t," => 'ţ',
b"u`" => 'ù', b"u'" => 'ú', b"u^" => 'û', b"u:" => 'ü',
b"u_" => 'ū', b"u&" => 'ů', b"u~" => 'ũ',
b"y'" => 'ý', b"y:" => 'ÿ',
b"z'" => 'ź',
// Uppercase bases.
b"A`" => 'À', b"A'" => 'Á', b"A^" => 'Â', b"A~" => 'Ã',
b"A:" => 'Ä', b"A&" => 'Å', b"A_" => 'Ā',
b"C," => 'Ç', b"C'" => 'Ć', b"C^" => 'Ĉ',
b"D/" => 'Đ',
b"E`" => 'È', b"E'" => 'É', b"E^" => 'Ê', b"E:" => 'Ë',
b"E_" => 'Ē', b"E~" => 'Ẽ',
b"G^" => 'Ĝ',
b"H^" => 'Ĥ',
b"I`" => 'Ì', b"I'" => 'Í', b"I^" => 'Î', b"I:" => 'Ï',
b"I_" => 'Ī', b"I~" => 'Ĩ',
b"J^" => 'Ĵ',
b"L/" => 'Ł', b"L'" => 'Ĺ',
b"M'" => 'Ḿ',
b"N`" => 'Ǹ', b"N~" => 'Ñ', b"N'" => 'Ń',
b"O`" => 'Ò', b"O'" => 'Ó', b"O^" => 'Ô', b"O~" => 'Õ',
b"O:" => 'Ö', b"O/" => 'Ø', b"O_" => 'Ō',
b"R'" => 'Ŕ',
b"S'" => 'Ś', b"S," => 'Ş', b"S^" => 'Ŝ',
b"T," => 'Ţ',
b"U`" => 'Ù', b"U'" => 'Ú', b"U^" => 'Û', b"U:" => 'Ü',
b"U_" => 'Ū', b"U&" => 'Ů', b"U~" => 'Ũ',
b"Y'" => 'Ý',
b"Z'" => 'Ź',
};
// Compile-time guard: the digraph map must hold every two-byte pattern and
// nothing else. A mismatch here means the map and `ACCENT_TABLE` have drifted.
const _: () = assert!(
    ACCENT_DIGRAPHS.len() == 110,
    "ACCENT_DIGRAPHS must contain exactly 110 entries (114 spec − 4 ligatures)"
);
#[must_use]
pub fn decompose_fragment(fragment: &str) -> Cow<'_, str> {
let bytes = fragment.as_bytes();
if !bytes.iter().any(|b| is_accent_marker(*b)) {
return Cow::Borrowed(fragment);
}
let mut out = String::with_capacity(fragment.len());
let mut i = 0;
while i < bytes.len() {
if let Some((pat_len, ch)) = try_match(bytes, i) {
out.push(ch);
i += pat_len;
} else {
let Some(ch) = fragment.get(i..).and_then(|s| s.chars().next()) else {
break;
};
out.push(ch);
i += ch.len_utf8();
}
}
Cow::Owned(out)
}
#[inline]
fn try_match(bytes: &[u8], i: usize) -> Option<(usize, char)> {
if i + 3 <= bytes.len()
&& let Some(ch) = match_ligature(&bytes[i..i + 3])
{
return Some((3, ch));
}
if i + 2 <= bytes.len()
&& let Some(&ch) = ACCENT_DIGRAPHS.get(&bytes[i..i + 2])
{
return Some((2, ch));
}
None
}
// Unit tests for the accent table and for `decompose_fragment`.
#[cfg(test)]
mod tests {
    use super::*;

    /// The table length is pinned so that accidental additions or removals
    /// are noticed.
    #[test]
    fn table_size_is_pinned_to_spec_count() {
        const EXPECTED: usize = 114;
        assert_eq!(
            ACCENT_TABLE.len(),
            EXPECTED,
            "spec count drift — see docs/specs/aozora/accent_separation.html"
        );
    }

    /// Every pattern must be typeable ASCII and either two or three bytes long.
    #[test]
    fn every_table_entry_is_representable_ascii_source() {
        for (pat, _) in ACCENT_TABLE {
            assert!(
                pat.is_ascii(),
                "digraph {pat:?} must be pure ASCII per spec"
            );
            assert!(
                pat.len() == 2 || pat.len() == 3,
                "digraph {pat:?} must be 2 or 3 bytes"
            );
        }
    }

    /// No pattern may appear twice in the table.
    #[test]
    fn every_table_entry_has_unique_pattern() {
        use std::collections::HashSet;
        let mut seen: HashSet<&str> = HashSet::new();
        for (pat, _) in ACCENT_TABLE {
            assert!(seen.insert(pat), "duplicate digraph {pat:?}");
        }
    }

    /// `decompose_fragment` returns early when the input holds no byte from
    /// `ACCENT_MARKERS`, so a pattern whose final byte is missing from that
    /// list would never be decoded on its own. Check the invariant directly.
    #[test]
    fn every_table_pattern_ends_in_a_registered_marker() {
        for (pat, _) in ACCENT_TABLE {
            let last = pat.as_bytes().last().copied();
            assert!(
                last.is_some_and(is_accent_marker),
                "pattern {pat:?} does not end in a byte from ACCENT_MARKERS"
            );
        }
    }

    /// A replacement character may be at most one UTF-8 byte longer than the
    /// pattern it replaces.
    #[test]
    fn digraph_size_growth_stays_within_one_extra_byte() {
        for (pat, ch) in ACCENT_TABLE {
            let out_len = ch.len_utf8();
            let in_len = pat.len();
            let growth = out_len.saturating_sub(in_len);
            assert!(
                growth <= 1,
                "digraph {pat:?} → {ch} grew by {growth} bytes (cap is 1)"
            );
        }
    }

    #[test]
    fn spec_point_e_grave() {
        assert_eq!(decompose_fragment("fune`bre"), "funèbre");
    }

    #[test]
    fn spec_point_acute_accents() {
        assert_eq!(decompose_fragment("ve'rite'"), "vérité");
    }

    #[test]
    fn spec_point_circumflex_and_cedilla_together() {
        assert_eq!(decompose_fragment("C,a va^"), "Ça vâ");
    }

    #[test]
    fn spec_point_all_vowel_graves() {
        assert_eq!(decompose_fragment("a` e` i` o` u`"), "à è ì ò ù");
    }

    #[test]
    fn spec_point_uppercase_accents() {
        assert_eq!(decompose_fragment("A` E' N~"), "À É Ñ");
    }

    /// `&` serves both the two-byte patterns (`s&`, `a&`, …) and the
    /// three-byte ligatures; both forms must decode.
    #[test]
    fn spec_point_ligatures_beat_ring_above() {
        assert_eq!(decompose_fragment("stras&e"), "straße");
        assert_eq!(decompose_fragment("ae&on"), "æon");
        assert_eq!(decompose_fragment("OE&uvre"), "Œuvre");
    }

    #[test]
    fn spec_point_stroke_and_macron() {
        assert_eq!(decompose_fragment("d/o_g"), "đōg");
    }

    /// Input without any marker byte must take the non-allocating path.
    #[test]
    fn input_without_any_marker_byte_is_borrowed() {
        let input = "plain Japanese prose ここはテストです 春夏秋冬";
        let out = decompose_fragment(input);
        assert!(
            matches!(out, Cow::Borrowed(_)),
            "expected zero-alloc path for {input:?}"
        );
        assert_eq!(out, input);
    }

    /// A marker that does not follow a base letter from the table is copied
    /// through untouched.
    #[test]
    fn isolated_markers_not_preceded_by_table_base_are_preserved() {
        // Leading apostrophe: nothing precedes it.
        assert_eq!(decompose_fragment("'tis"), "'tis");
        // A digit is not a base letter.
        assert_eq!(decompose_fragment("5^2"), "5^2");
        // `q` has no entries in the table.
        assert_eq!(decompose_fragment("q^"), "q^");
    }

    /// The decoder has no notion of context: any base+marker pair from the
    /// table is replaced, even where the marker was meant as punctuation.
    #[test]
    fn markers_are_greedy_for_any_valid_preceding_base() {
        // The leading backtick has no base; the trailing one follows `o`.
        assert_eq!(decompose_fragment("`hello`"), "`hellò");
        // The comma after `t` is read as a cedilla.
        assert_eq!(decompose_fragment("text,"), "texţ");
    }

    #[test]
    fn unknown_base_letters_stay_unchanged() {
        assert_eq!(decompose_fragment("f'x"), "f'x");
        assert_eq!(decompose_fragment("q^"), "q^");
    }

    /// Multi-byte text around a pattern must come through unchanged.
    #[test]
    fn mixed_japanese_and_accents_round_trip_on_japanese() {
        assert_eq!(
            decompose_fragment("ここは fune`bre です"),
            "ここは funèbre です"
        );
    }

    #[test]
    fn empty_input_is_borrowed() {
        let out = decompose_fragment("");
        assert!(matches!(out, Cow::Borrowed("")));
    }

    /// A three-byte ligature pattern becomes a two-byte character.
    #[test]
    fn three_byte_ligatures_shrink_output_byte_length() {
        for (input, expected) in [("ae&on", "æon"), ("OE&uvre", "Œuvre")] {
            let out = decompose_fragment(input);
            assert!(
                out.len() < input.len(),
                "3-byte ligature should shrink: {input:?} → {out:?}"
            );
            assert_eq!(out, expected);
        }
    }

    /// `s&` (two bytes) becomes `ß` (two bytes in UTF-8).
    #[test]
    fn two_byte_eszett_preserves_output_byte_length() {
        let out = decompose_fragment("stras&e");
        assert_eq!(out, "straße");
        assert_eq!(out.len(), "stras&e".len());
    }

    /// `ḿ` needs three UTF-8 bytes, so the output is longer than the input.
    #[test]
    fn bmp_above_u1e00_digraphs_may_grow_output() {
        let out = decompose_fragment("m'a");
        assert_eq!(out, "ḿa");
        assert!(out.len() > "m'a".len());
    }

    /// Every table entry decodes when embedded between two underscores.
    ///
    /// The underscores are marker bytes themselves, so this always gets past
    /// the marker pre-scan; the bare-pattern test below covers the case where
    /// the pattern's own marker is the only one in the input.
    #[test]
    fn property_all_table_entries_round_trip() {
        for (pat, ch) in ACCENT_TABLE {
            let input = format!("_{pat}_");
            let out = decompose_fragment(&input);
            let expected: String = format!("_{ch}_");
            assert_eq!(*out, *expected, "pattern {pat:?} failed");
        }
    }

    /// Every table entry decodes when it is the entire input, i.e. when its
    /// own final byte is the only marker byte present.
    #[test]
    fn every_table_entry_decodes_without_neighbouring_markers() {
        for (pat, ch) in ACCENT_TABLE {
            assert_eq!(
                decompose_fragment(pat),
                ch.to_string(),
                "bare pattern {pat:?} failed"
            );
        }
    }
}