1use std::io;
2use std::str::Utf8Error;
3
4use crate::ngstr::prims::enc_surrogates;
5use crate::unicode::utf8_char_width;
6use crate::Cesu8Error;
7use crate::Variant;
8
9pub(crate) fn utf8_as_cesu8_spec<const ENCODE_NUL: bool>(
11 text: &str,
12) -> Result<(), Cesu8Error> {
13 let mut i = 0;
14 let text_bytes = text.as_bytes();
15 while i < text_bytes.len() {
16 let b = text_bytes[i];
18 if ENCODE_NUL && b == b'\0' {
19 return Err(Cesu8Error::new(i, Some(1), Ok(())));
20 }
21
22 if b.is_ascii() {
24 i += 1;
25 continue;
26 }
27
28 let w = utf8_char_width(b);
29
30 if w == 4 {
32 return Err(Cesu8Error::new(i, Some(4), Ok(())));
34 }
35
36 assert_ne!(w, 0, "utf8 char length was 0, this is illegal in well-formed utf8 strings (byte {b:x?}, bytes[{i}] from {text_bytes:x?})");
39 i += w;
40 }
41
42 Ok(())
43}
44
45#[inline]
47pub(crate) fn utf8_as_cesu8(
48 text: &str,
49 variant: Variant,
50) -> Result<(), Cesu8Error> {
51 match variant {
52 Variant::Standard => utf8_as_cesu8_spec::<false>(text),
53 Variant::Java => utf8_as_cesu8_spec::<true>(text),
54 }
55}
56
57pub(crate) unsafe fn utf8_to_cesu8_spec<W: io::Write, const ENCODE_NUL: bool>(
68 text: &str,
69 assume_good: usize,
70 encoded: &mut W,
71) -> io::Result<Result<(), Utf8Error>> {
72 if assume_good != 0 {
74 debug_assert_eq!(
76 utf8_as_cesu8_spec::<ENCODE_NUL>(text)
77 .unwrap_err()
78 .valid_up_to(),
79 assume_good,
80 "tried to assume invalid CESU-8 as good"
81 );
82 debug_assert!(
83 assume_good <= text.len(),
84 "tried to assume_good a chunk larger than the source"
85 );
86 }
87
88 #[inline(always)]
89 fn utf8_to_cesu8_prealloc_internal<W: io::Write, const ENCODE_NUL: bool>(
90 text: &str,
91 assume_good: usize,
92 encoded: &mut W,
93 ) -> io::Result<Result<(), Utf8Error>> {
94 let bytes = text.as_bytes();
95
96 encoded.write_all(&bytes[..assume_good])?;
97
98 let mut i = assume_good;
102 let mut utf8_seg = 0;
103 let mut utf8_err = Ok(());
104
105 let mut written = assume_good;
107
108 macro_rules! write_cesu8 {
109 ($cesu8_slice: expr, $text_len: expr) => {
110 let sl: &[u8] = $cesu8_slice;
111 encoded.write_all(sl)?;
112 written += sl.len();
113 i += $text_len;
114 };
115 }
116
117 macro_rules! push_utf8 {
118 ($errlen: expr) => {
119 if utf8_seg > 0 {
120 write_cesu8!(&bytes[i..i + utf8_seg], utf8_seg);
122
123 utf8_seg = 0;
124 }
125
126 if let Some(err) = $errlen {
128 if utf8_err.is_ok() {
129 utf8_err = Err(utf8err_new(written, err));
130 }
131 }
132 };
133 }
134
135 while let Some(&b) = bytes.get(i + utf8_seg) {
137 if ENCODE_NUL && b == b'\0' {
139 push_utf8!(Some(Some(1))); write_cesu8!(&[0xC0, 0x80], 1);
143 } else if b.is_ascii() {
144 utf8_seg += 1;
146 } else {
147 match utf8_char_width(b) {
148 4 => {
149 push_utf8!(Some(Some(1)));
150
151 let s = unsafe { std::str::from_utf8_unchecked(&bytes[i..i + 4]) };
153 let c = s.chars().next().unwrap() as u32;
154
155 write_cesu8!(&enc_surrogates(c), 4);
156 }
157 w => {
158 utf8_seg += w;
160 }
161 }
162 }
163 }
164
165 push_utf8!(None);
166
167 debug_assert_eq!(i, text.len(), "did not fully consume the input text bytes");
169 debug_assert_eq!(
170 utf8_seg, 0,
171 "did not fully consume the current utf8 segment"
172 );
173
174 Ok(utf8_err)
175 }
176
177 utf8_to_cesu8_prealloc_internal::<W, ENCODE_NUL>(text, assume_good, encoded)
178}
179
180#[inline]
181pub(crate) fn utf8_to_cesu8_safe(
182 text: &str,
183 encoded: &mut Vec<u8>,
184 variant: Variant,
185) -> Result<(), Utf8Error> {
186 unsafe {
187 utf8_to_cesu8(text, 0, encoded, variant).expect(
188 "io::Error occured within Vec's io::Write implementation. This should not happen.",
189 )
190 }
191}
192
193#[inline]
194pub(crate) unsafe fn utf8_to_cesu8<W: io::Write>(
195 text: &str,
196 assume_good: usize,
197 encoded: &mut W,
198 variant: Variant,
199) -> io::Result<Result<(), Utf8Error>> {
200 match variant {
201 Variant::Standard => utf8_to_cesu8_spec::<W, false>(text, assume_good, encoded),
202 Variant::Java => utf8_to_cesu8_spec::<W, true>(text, assume_good, encoded),
203 }
204}
205
206
207
/// Builds a [`Utf8Error`] that reports the given `valid_up_to` and `err_len`.
///
/// `Utf8Error` offers no public constructor, so a local struct with the fields it is assumed
/// to hold is transmuted into it. That assumption is not guaranteed by the standard library;
/// debug builds therefore check alignment and size up front and afterwards verify that the
/// produced error really reports the requested values.
///
/// # Panics
///
/// In debug builds, panics when the layout of `Utf8Error` does not match the local mirror or
/// when the produced error does not report `valid_up_to` / `err_len` as requested.
#[inline]
pub(crate) fn utf8err_new(valid_up_to: usize, err_len: Option<u8>) -> Utf8Error {
    // Mirror of the assumed `Utf8Error` fields. They are only ever read through the transmuted
    // value, hence the `dead_code` allowance.
    #[allow(dead_code)]
    struct CustomUtf8Error {
        valid_up_to: usize,
        err_len: Option<u8>,
    }

    // Not a proof of identical layout, but a cheap early warning.
    debug_assert_eq!(
        std::mem::align_of::<CustomUtf8Error>(),
        std::mem::align_of::<Utf8Error>(),
        "std::str::Utf8Error has unexpectedly changed alignment"
    );
    debug_assert_eq!(
        std::mem::size_of::<CustomUtf8Error>(),
        std::mem::size_of::<Utf8Error>(),
        "std::str::Utf8Error has unexpectedly changed size"
    );

    let mirror = CustomUtf8Error {
        valid_up_to,
        err_len,
    };

    // SAFETY: relies on `CustomUtf8Error` and `Utf8Error` sharing one layout. `transmute`
    // enforces equal size at compile time; field placement is NOT guaranteed by the language
    // and is cross-checked by the round-trip assertions below in debug builds.
    let built: Utf8Error = unsafe { std::mem::transmute(mirror) };

    // Size and alignment alone cannot detect reordered or re-encoded fields, so confirm that
    // the public accessors give back exactly what was requested.
    debug_assert_eq!(
        built.valid_up_to(),
        valid_up_to,
        "std::str::Utf8Error no longer stores valid_up_to where expected"
    );
    debug_assert_eq!(
        built.error_len(),
        err_len.map(usize::from),
        "std::str::Utf8Error no longer stores error_len where expected"
    );

    built
}
238
239#[inline]
240pub(crate) fn utf8err_inc(err: &Utf8Error, incby: usize) -> Utf8Error {
241 utf8err_new(incby + err.valid_up_to(), err.error_len().map(|b| b as u8))
242}