1use encoding_rs::{DecoderResult, Encoding, GBK, SHIFT_JIS};
2use std::borrow::Cow;
3use std::str::Utf8Error;
4use std::sync::Arc;
5
6use mel_syntax::{SourceMapEdit, TextRange, range_end, range_start, text_range};
7
8use crate::{DecodeDiagnostic, SourceEncoding};
9
10pub(crate) struct DecodedSource<'a> {
11 pub(crate) encoding: SourceEncoding,
12 pub(crate) text: Cow<'a, str>,
13 pub(crate) offset_map: OffsetMap,
14 pub(crate) diagnostics: Vec<DecodeDiagnostic>,
15}
16
17pub(crate) struct DecodedOwnedSource {
18 pub(crate) encoding: SourceEncoding,
19 pub(crate) text: String,
20 pub(crate) offset_map: OffsetMap,
21 pub(crate) diagnostics: Vec<DecodeDiagnostic>,
22}
23
24#[derive(Debug, Clone)]
25enum OffsetMapKind {
26 Identity {
27 len: usize,
28 },
29 Indexed {
30 decoded_to_source: Box<[u32]>,
31 source_to_decoded: Arc<[u32]>,
32 },
33 Sparse {
34 source_len: usize,
35 display_len: usize,
36 edits: Arc<[SourceMapEdit]>,
37 },
38}
39
40#[derive(Debug, Clone)]
41pub(crate) struct OffsetMap {
42 kind: OffsetMapKind,
43}
44
45impl OffsetMap {
46 fn identity(len: usize) -> Self {
47 Self {
48 kind: OffsetMapKind::Identity { len },
49 }
50 }
51
52 fn from_decoded_text(text: &str, source_len: usize, encoding: SourceEncoding) -> Option<Self> {
53 let mut decoded_to_source = vec![0; text.len() + 1];
54 let mut source_to_decoded = vec![0; source_len + 1];
55 let mut decoded_offset = 0usize;
56 let mut source_offset = 0usize;
57
58 for ch in text.chars() {
59 let decoded_len = ch.len_utf8();
60 let source_char_len = source_len_for_char(ch, encoding)?;
61 let source_end = source_offset.saturating_add(source_char_len);
62 let decoded_end = decoded_offset.saturating_add(decoded_len);
63 for step in 1..=decoded_len {
64 decoded_to_source[decoded_offset + step] =
65 u32::try_from(source_end).unwrap_or(u32::MAX);
66 }
67 for step in 1..=source_char_len {
68 source_to_decoded[source_offset + step] =
69 u32::try_from(decoded_end).unwrap_or(u32::MAX);
70 }
71 decoded_offset += decoded_len;
72 source_offset = source_end;
73 }
74
75 if source_offset != source_len {
76 return None;
77 }
78
79 decoded_to_source[text.len()] = u32::try_from(source_len).unwrap_or(u32::MAX);
80 source_to_decoded[source_len] = u32::try_from(text.len()).unwrap_or(u32::MAX);
81 Some(Self {
82 kind: OffsetMapKind::Indexed {
83 decoded_to_source: decoded_to_source.into_boxed_slice(),
84 source_to_decoded: Arc::from(source_to_decoded),
85 },
86 })
87 }
88
89 fn map_offset(&self, offset: u32) -> u32 {
90 match &self.kind {
91 OffsetMapKind::Identity { len } => {
92 u32::try_from(usize::try_from(offset).unwrap_or(*len).min(*len)).unwrap_or(u32::MAX)
93 }
94 OffsetMapKind::Indexed {
95 decoded_to_source, ..
96 } => decoded_to_source
97 .get(offset as usize)
98 .copied()
99 .or_else(|| decoded_to_source.last().copied())
100 .unwrap_or(offset),
101 OffsetMapKind::Sparse {
102 source_len,
103 display_len,
104 edits,
105 } => sparse_display_to_source(*source_len, *display_len, edits, offset as usize),
106 }
107 }
108
109 pub(crate) fn map_range(&self, range: TextRange) -> TextRange {
110 text_range(
111 self.map_offset(range_start(range)),
112 self.map_offset(range_end(range)),
113 )
114 }
115
116 pub(crate) fn source_map(&self) -> mel_syntax::SourceMap {
117 match &self.kind {
118 OffsetMapKind::Identity { len } => mel_syntax::SourceMap::identity(*len),
119 OffsetMapKind::Indexed {
120 source_to_decoded, ..
121 } => {
122 mel_syntax::SourceMap::from_shared_source_to_display(Arc::clone(source_to_decoded))
123 }
124 OffsetMapKind::Sparse {
125 source_len,
126 display_len,
127 edits,
128 } => mel_syntax::SourceMap::from_sparse_edits(
129 *source_len,
130 *display_len,
131 Arc::clone(edits),
132 ),
133 }
134 }
135}
136
137pub(crate) fn decode_source_auto(input: &[u8]) -> DecodedSource<'_> {
138 match std::str::from_utf8(input) {
139 Ok(text) => DecodedSource {
140 encoding: SourceEncoding::Utf8,
141 text: Cow::Borrowed(text),
142 offset_map: OffsetMap::identity(text.len()),
143 diagnostics: Vec::new(),
144 },
145 Err(error) => decode_source_auto_with_error(input, error),
146 }
147}
148
149pub(crate) fn decode_owned_bytes_auto(input: Vec<u8>) -> DecodedOwnedSource {
150 match String::from_utf8(input) {
151 Ok(text) => {
152 let len = text.len();
153 DecodedOwnedSource {
154 encoding: SourceEncoding::Utf8,
155 text,
156 offset_map: OffsetMap::identity(len),
157 diagnostics: Vec::new(),
158 }
159 }
160 Err(error) => decode_source_auto(error.as_bytes()).into_owned(),
161 }
162}
163
164fn decode_source_auto_with_error(input: &[u8], utf8_error: Utf8Error) -> DecodedSource<'_> {
165 let sample = decode_auto_sample(input, utf8_error.valid_up_to());
166 let utf8_lossy_rank = decode_utf8_lossy_sample_rank(sample);
167 let cp932_rank = decode_non_utf8_sample_rank(sample, SourceEncoding::Cp932);
168 let gbk_rank = decode_non_utf8_sample_rank(sample, SourceEncoding::Gbk);
169 let (best_encoding, best_non_utf8_rank) = if cp932_rank <= gbk_rank {
170 (SourceEncoding::Cp932, cp932_rank)
171 } else {
172 (SourceEncoding::Gbk, gbk_rank)
173 };
174
175 if best_non_utf8_rank.0 == 0 && best_non_utf8_rank.1 < utf8_lossy_rank.1 {
176 let decoded = decode_source_with_encoding(input, best_encoding);
177 if decoded.diagnostics.is_empty() {
178 return decoded;
179 }
180 }
181
182 decode_lossy_utf8_with_error(input, utf8_error.valid_up_to() as u32, utf8_error)
183}
184
185pub(crate) fn decode_source_with_encoding(
186 input: &[u8],
187 encoding: SourceEncoding,
188) -> DecodedSource<'_> {
189 if matches!(encoding, SourceEncoding::Utf8) {
190 return match std::str::from_utf8(input) {
191 Ok(text) => DecodedSource {
192 encoding,
193 text: Cow::Borrowed(text),
194 offset_map: OffsetMap::identity(text.len()),
195 diagnostics: Vec::new(),
196 },
197 Err(error) => decode_lossy_utf8_with_error(input, error.valid_up_to() as u32, error),
198 };
199 }
200
201 let encoding_rs = encoding_rs_encoding(encoding);
202 if Encoding::ascii_valid_up_to(input) == input.len() {
203 let text = std::str::from_utf8(input).unwrap_or_default();
204 return DecodedSource {
205 encoding,
206 text: Cow::Borrowed(text),
207 offset_map: OffsetMap::identity(text.len()),
208 diagnostics: Vec::new(),
209 };
210 }
211
212 let (text, _, had_errors) = encoding_rs.decode(input);
213 let offset_map = if had_errors {
214 OffsetMap::from_decoded_text(text.as_ref(), input.len(), encoding)
215 .unwrap_or_else(|| OffsetMap::identity(text.len()))
216 } else {
217 OffsetMap::from_ascii_compatible_text(input, text.as_ref(), encoding)
218 .or_else(|| OffsetMap::from_decoded_text(text.as_ref(), input.len(), encoding))
219 .unwrap_or_else(|| OffsetMap::identity(text.len()))
220 };
221 let diagnostics = if had_errors {
222 vec![DecodeDiagnostic {
223 message: format!(
224 "source is not valid {}; decoded with replacement",
225 encoding.label()
226 )
227 .into(),
228 range: text_range(0, input.len() as u32),
229 }]
230 } else {
231 Vec::new()
232 };
233
234 DecodedSource {
235 encoding,
236 text,
237 offset_map,
238 diagnostics,
239 }
240}
241
242pub(crate) fn decode_owned_bytes_with_encoding(
243 input: Vec<u8>,
244 encoding: SourceEncoding,
245) -> DecodedOwnedSource {
246 if matches!(encoding, SourceEncoding::Utf8) {
247 return match String::from_utf8(input) {
248 Ok(text) => {
249 let len = text.len();
250 DecodedOwnedSource {
251 encoding,
252 text,
253 offset_map: OffsetMap::identity(len),
254 diagnostics: Vec::new(),
255 }
256 }
257 Err(error) => {
258 decode_source_with_encoding(error.as_bytes(), SourceEncoding::Utf8).into_owned()
259 }
260 };
261 }
262
263 if Encoding::ascii_valid_up_to(&input) == input.len() {
264 let text = String::from_utf8(input).unwrap_or_default();
265 let len = text.len();
266 return DecodedOwnedSource {
267 encoding,
268 text,
269 offset_map: OffsetMap::identity(len),
270 diagnostics: Vec::new(),
271 };
272 }
273
274 decode_source_with_encoding(&input, encoding).into_owned()
275}
276
277impl DecodedSource<'_> {
278 fn into_owned(self) -> DecodedOwnedSource {
279 DecodedOwnedSource {
280 encoding: self.encoding,
281 text: self.text.into_owned(),
282 offset_map: self.offset_map,
283 diagnostics: self.diagnostics,
284 }
285 }
286}
287
288fn sparse_display_to_source(
289 source_len: usize,
290 display_len: usize,
291 edits: &[SourceMapEdit],
292 offset: usize,
293) -> u32 {
294 let clamped = offset.min(display_len) as u32;
295 let Some(index) = edits
296 .partition_point(|edit| edit.display_start() <= clamped)
297 .checked_sub(1)
298 else {
299 return clamped;
300 };
301 let edit = edits[index];
302 if clamped == edit.display_start() {
303 return edit.source_start();
304 }
305 if clamped <= edit.display_end() {
306 return edit.source_end();
307 }
308 let mapped = (clamped as i64 - (edit.display_end() as i64 - edit.source_end() as i64))
309 .clamp(0, source_len as i64);
310 mapped as u32
311}
312
/// Selects the window of `input` used for encoding detection.
///
/// The window starts up to 256 bytes before `valid_up_to` (the first byte
/// that failed UTF-8 validation) and spans at most 64 KiB, cut short at the
/// end of `input`.
///
/// NOTE(review): both window edges are plain byte positions, so the end may
/// fall in the middle of a multi-byte sequence — confirm the scoring
/// functions tolerate a truncated trailing character.
fn decode_auto_sample(input: &[u8], valid_up_to: usize) -> &[u8] {
    const SAMPLE_PREFIX_CONTEXT: usize = 256;
    const SAMPLE_MAX_BYTES: usize = 64 * 1024;

    let window_start = valid_up_to.saturating_sub(SAMPLE_PREFIX_CONTEXT);
    let window_end = window_start
        .saturating_add(SAMPLE_MAX_BYTES)
        .min(input.len());

    &input[window_start..window_end]
}
321
322fn decode_utf8_lossy_sample_rank(sample: &[u8]) -> (u8, usize, u8) {
323 let text = String::from_utf8_lossy(sample);
324 (
325 1,
326 suspicious_text_score(text.as_ref()),
327 decode_encoding_bias(SourceEncoding::Utf8),
328 )
329}
330
331fn decode_non_utf8_sample_rank(sample: &[u8], encoding: SourceEncoding) -> (u8, usize, u8) {
332 let (text, _, had_errors) = encoding_rs_encoding(encoding).decode(sample);
333 (
334 u8::from(had_errors),
335 suspicious_text_score(text.as_ref()),
336 decode_encoding_bias(encoding),
337 )
338}
339
340fn decode_encoding_bias(encoding: SourceEncoding) -> u8 {
341 match encoding {
342 SourceEncoding::Cp932 => 0,
343 SourceEncoding::Gbk => 1,
344 SourceEncoding::Utf8 => 2,
345 }
346}
347
348fn suspicious_text_score(text: &str) -> usize {
349 text.chars().map(suspicious_char_weight).sum()
350}
351
352fn suspicious_char_weight(ch: char) -> usize {
353 match ch {
354 '\u{FFFD}' => 1,
355 '\u{0080}'..='\u{009F}' => 1,
356 '\u{E000}'..='\u{F8FF}' => 1,
357 '\u{FF61}'..='\u{FF9F}' => 1,
358 _ => 0,
359 }
360}
361
362fn decode_lossy_utf8_with_error(
363 input: &[u8],
364 start: u32,
365 error: std::str::Utf8Error,
366) -> DecodedSource<'_> {
367 let end = error
368 .error_len()
369 .map_or(input.len() as u32, |len| start + len as u32);
370 let (text, offset_map) = decode_lossy_utf8_text_and_offset_map(input);
371
372 DecodedSource {
373 encoding: SourceEncoding::Utf8,
374 offset_map,
375 text: Cow::Owned(text),
376 diagnostics: vec![DecodeDiagnostic {
377 message: "source is not valid UTF-8; decoded lossily".into(),
378 range: text_range(start, end),
379 }],
380 }
381}
382
383fn decode_lossy_utf8_text_and_offset_map(input: &[u8]) -> (String, OffsetMap) {
384 let mut text = String::new();
385 let mut decoded_to_source = vec![0];
386 let mut source_to_decoded = vec![0; input.len() + 1];
387 let mut source_offset = 0usize;
388
389 while source_offset < input.len() {
390 match std::str::from_utf8(&input[source_offset..]) {
391 Ok(valid) => {
392 for ch in valid.chars() {
393 append_decoded_char_mapping(
394 &mut text,
395 &mut decoded_to_source,
396 &mut source_to_decoded,
397 source_offset,
398 ch.len_utf8(),
399 ch,
400 );
401 source_offset += ch.len_utf8();
402 }
403 break;
404 }
405 Err(error) => {
406 let valid_up_to = error.valid_up_to();
407 if valid_up_to > 0 {
408 let valid =
409 std::str::from_utf8(&input[source_offset..source_offset + valid_up_to])
410 .unwrap_or_default();
411 for ch in valid.chars() {
412 append_decoded_char_mapping(
413 &mut text,
414 &mut decoded_to_source,
415 &mut source_to_decoded,
416 source_offset,
417 ch.len_utf8(),
418 ch,
419 );
420 source_offset += ch.len_utf8();
421 }
422 }
423
424 let invalid_len = error.error_len().unwrap_or(input.len() - source_offset);
425 append_decoded_char_mapping(
426 &mut text,
427 &mut decoded_to_source,
428 &mut source_to_decoded,
429 source_offset,
430 invalid_len,
431 char::REPLACEMENT_CHARACTER,
432 );
433 source_offset += invalid_len;
434 }
435 }
436 }
437
438 (
439 text,
440 OffsetMap {
441 kind: OffsetMapKind::Indexed {
442 decoded_to_source: decoded_to_source.into_boxed_slice(),
443 source_to_decoded: Arc::from(source_to_decoded),
444 },
445 },
446 )
447}
448
/// Appends `ch` to `text` and records how it relates to the source bytes
/// `source_start..source_start + source_len`.
///
/// `decoded_to_source` is resized to one entry per decoded byte offset up
/// to the new end of `text`; every offset after the start of `ch` maps to
/// the end of the source span. In `source_to_decoded`, every offset after
/// `source_start` up to the end of the span maps to the new end of `text`;
/// positions beyond the slice are skipped.
fn append_decoded_char_mapping(
    text: &mut String,
    decoded_to_source: &mut Vec<u32>,
    source_to_decoded: &mut [u32],
    source_start: usize,
    source_len: usize,
    ch: char,
) {
    let char_start = text.len();
    let span_end = source_start + source_len;

    text.push(ch);
    let char_end = text.len();

    // After the resize the table ends exactly at `char_end`, so the tail
    // beyond `char_start` is precisely the entries belonging to `ch`.
    decoded_to_source.resize(char_end + 1, span_end as u32);
    decoded_to_source[char_start + 1..].fill(span_end as u32);

    // Clamp the span to the table so an oversized span is ignored rather
    // than indexed out of bounds.
    let upper = (span_end + 1).min(source_to_decoded.len());
    let lower = (source_start + 1).min(upper);
    source_to_decoded[lower..upper].fill(char_end as u32);
}
479
480impl OffsetMap {
481 fn from_ascii_compatible_text(
482 input: &[u8],
483 text: &str,
484 encoding: SourceEncoding,
485 ) -> Option<Self> {
486 let mut source_offset = 0usize;
487 let mut display_offset = 0usize;
488 let mut edits = Vec::new();
489
490 while source_offset < input.len() || display_offset < text.len() {
491 let ascii_run = Encoding::ascii_valid_up_to(&input[source_offset..]);
492 if ascii_run > 0 {
493 source_offset += ascii_run;
494 display_offset += ascii_run;
495 continue;
496 }
497
498 let run_display_end = next_ascii_display_boundary(text, display_offset);
499 let display_len = run_display_end.saturating_sub(display_offset);
500 let display_run = &text[display_offset..run_display_end];
501 let source_len =
502 source_len_for_decoded_run(&input[source_offset..], display_run, encoding)?;
503 if source_len != display_len {
504 edits.push(SourceMapEdit::new(
505 u32::try_from(source_offset).unwrap_or(u32::MAX),
506 u32::try_from(source_offset + source_len).unwrap_or(u32::MAX),
507 u32::try_from(display_offset).unwrap_or(u32::MAX),
508 u32::try_from(run_display_end).unwrap_or(u32::MAX),
509 ));
510 }
511 source_offset += source_len;
512 display_offset = run_display_end;
513 }
514
515 if source_offset != input.len() || display_offset != text.len() {
516 return None;
517 }
518
519 if edits.is_empty() && input.len() == text.len() {
520 return Some(Self::identity(text.len()));
521 }
522
523 Some(Self {
524 kind: OffsetMapKind::Sparse {
525 source_len: input.len(),
526 display_len: text.len(),
527 edits: Arc::from(edits),
528 },
529 })
530 }
531}
532
/// Returns the byte offset at which the run of non-ASCII characters
/// starting at `display_offset` ends: the position of the next ASCII
/// character, or the end of `text` if there is none.
///
/// `display_offset` must lie on a character boundary within `text`.
fn next_ascii_display_boundary(text: &str, display_offset: usize) -> usize {
    let tail = &text[display_offset..];
    let run_len = tail
        .find(|ch: char| ch.is_ascii())
        .unwrap_or(tail.len());
    display_offset + run_len
}
543
544fn source_len_for_decoded_run(
545 input: &[u8],
546 display_run: &str,
547 encoding: SourceEncoding,
548) -> Option<usize> {
549 let mut decoder = encoding_rs_encoding(encoding).new_decoder_without_bom_handling();
550 let mut output = vec![0; display_run.len()];
551 let (result, read, written) =
552 decoder.decode_to_utf8_without_replacement(input, &mut output, false);
553
554 match result {
555 DecoderResult::InputEmpty | DecoderResult::OutputFull => (written == display_run.len()
556 && &output[..written] == display_run.as_bytes())
557 .then_some(read),
558 DecoderResult::Malformed(_, _) => None,
559 }
560}
561
562impl SourceEncoding {
563 #[must_use]
564 pub const fn label(self) -> &'static str {
565 match self {
566 Self::Utf8 => "utf-8",
567 Self::Cp932 => "cp932",
568 Self::Gbk => "gbk",
569 }
570 }
571}
572
573fn encoding_rs_encoding(encoding: SourceEncoding) -> &'static Encoding {
574 match encoding {
575 SourceEncoding::Utf8 => encoding_rs::UTF_8,
576 SourceEncoding::Cp932 => SHIFT_JIS,
577 SourceEncoding::Gbk => GBK,
578 }
579}
580
581fn source_len_for_char(ch: char, encoding: SourceEncoding) -> Option<usize> {
582 if matches!(encoding, SourceEncoding::Utf8) {
583 return Some(ch.len_utf8());
584 }
585
586 let mut text = String::new();
587 text.push(ch);
588 let (encoded, _, had_errors) = encoding_rs_encoding(encoding).encode(&text);
589 (!had_errors).then(|| encoded.len())
590}