Skip to main content

maya_mel/parser/
decode.rs

1use encoding_rs::{DecoderResult, Encoding, GBK, SHIFT_JIS};
2use std::borrow::Cow;
3use std::str::Utf8Error;
4use std::sync::Arc;
5
6use mel_syntax::{SourceMapEdit, TextRange, range_end, range_start, text_range};
7
8use crate::{DecodeDiagnostic, SourceEncoding};
9
/// Result of decoding a byte buffer that is still owned by the caller.
///
/// `text` borrows from the input when the bytes could be used as-is and is
/// owned when decoding had to produce a new string.
pub(crate) struct DecodedSource<'a> {
    /// Encoding the text was decoded with.
    pub(crate) encoding: SourceEncoding,
    /// Decoded text, borrowed from the input where possible.
    pub(crate) text: Cow<'a, str>,
    /// Translation between offsets in `text` and offsets in the original bytes.
    pub(crate) offset_map: OffsetMap,
    /// Problems encountered while decoding; empty when decoding was clean.
    pub(crate) diagnostics: Vec<DecodeDiagnostic>,
}
16
/// Owned counterpart of `DecodedSource`: the decoded text is always a `String`.
pub(crate) struct DecodedOwnedSource {
    /// Encoding the text was decoded with.
    pub(crate) encoding: SourceEncoding,
    /// Decoded text.
    pub(crate) text: String,
    /// Translation between offsets in `text` and offsets in the original bytes.
    pub(crate) offset_map: OffsetMap,
    /// Problems encountered while decoding; empty when decoding was clean.
    pub(crate) diagnostics: Vec<DecodeDiagnostic>,
}
23
/// Internal representation of an `OffsetMap`.
#[derive(Debug, Clone)]
enum OffsetMapKind {
    /// Decoded and source offsets coincide; `len` is the upper bound used for clamping.
    Identity {
        len: usize,
    },
    /// Dense lookup tables with one entry per byte offset (plus one for the end offset).
    Indexed {
        // Indexed by decoded (UTF-8) byte offset, yields a source byte offset.
        decoded_to_source: Box<[u32]>,
        // Indexed by source byte offset, yields a decoded byte offset; shared with
        // the `SourceMap` built in `OffsetMap::source_map`.
        source_to_decoded: Arc<[u32]>,
    },
    /// Only the runs whose source and decoded lengths differ are recorded.
    Sparse {
        source_len: usize,
        display_len: usize,
        edits: Arc<[SourceMapEdit]>,
    },
}
39
/// Maps byte offsets in decoded text back to byte offsets in the original source
/// bytes, and can be converted into a `mel_syntax::SourceMap`.
#[derive(Debug, Clone)]
pub(crate) struct OffsetMap {
    // Representation chosen by whichever constructor built the map.
    kind: OffsetMapKind,
}
44
45impl OffsetMap {
46    fn identity(len: usize) -> Self {
47        Self {
48            kind: OffsetMapKind::Identity { len },
49        }
50    }
51
52    fn from_decoded_text(text: &str, source_len: usize, encoding: SourceEncoding) -> Option<Self> {
53        let mut decoded_to_source = vec![0; text.len() + 1];
54        let mut source_to_decoded = vec![0; source_len + 1];
55        let mut decoded_offset = 0usize;
56        let mut source_offset = 0usize;
57
58        for ch in text.chars() {
59            let decoded_len = ch.len_utf8();
60            let source_char_len = source_len_for_char(ch, encoding)?;
61            let source_end = source_offset.saturating_add(source_char_len);
62            let decoded_end = decoded_offset.saturating_add(decoded_len);
63            for step in 1..=decoded_len {
64                decoded_to_source[decoded_offset + step] =
65                    u32::try_from(source_end).unwrap_or(u32::MAX);
66            }
67            for step in 1..=source_char_len {
68                source_to_decoded[source_offset + step] =
69                    u32::try_from(decoded_end).unwrap_or(u32::MAX);
70            }
71            decoded_offset += decoded_len;
72            source_offset = source_end;
73        }
74
75        if source_offset != source_len {
76            return None;
77        }
78
79        decoded_to_source[text.len()] = u32::try_from(source_len).unwrap_or(u32::MAX);
80        source_to_decoded[source_len] = u32::try_from(text.len()).unwrap_or(u32::MAX);
81        Some(Self {
82            kind: OffsetMapKind::Indexed {
83                decoded_to_source: decoded_to_source.into_boxed_slice(),
84                source_to_decoded: Arc::from(source_to_decoded),
85            },
86        })
87    }
88
89    fn map_offset(&self, offset: u32) -> u32 {
90        match &self.kind {
91            OffsetMapKind::Identity { len } => {
92                u32::try_from(usize::try_from(offset).unwrap_or(*len).min(*len)).unwrap_or(u32::MAX)
93            }
94            OffsetMapKind::Indexed {
95                decoded_to_source, ..
96            } => decoded_to_source
97                .get(offset as usize)
98                .copied()
99                .or_else(|| decoded_to_source.last().copied())
100                .unwrap_or(offset),
101            OffsetMapKind::Sparse {
102                source_len,
103                display_len,
104                edits,
105            } => sparse_display_to_source(*source_len, *display_len, edits, offset as usize),
106        }
107    }
108
109    pub(crate) fn map_range(&self, range: TextRange) -> TextRange {
110        text_range(
111            self.map_offset(range_start(range)),
112            self.map_offset(range_end(range)),
113        )
114    }
115
116    pub(crate) fn source_map(&self) -> mel_syntax::SourceMap {
117        match &self.kind {
118            OffsetMapKind::Identity { len } => mel_syntax::SourceMap::identity(*len),
119            OffsetMapKind::Indexed {
120                source_to_decoded, ..
121            } => {
122                mel_syntax::SourceMap::from_shared_source_to_display(Arc::clone(source_to_decoded))
123            }
124            OffsetMapKind::Sparse {
125                source_len,
126                display_len,
127                edits,
128            } => mel_syntax::SourceMap::from_sparse_edits(
129                *source_len,
130                *display_len,
131                Arc::clone(edits),
132            ),
133        }
134    }
135}
136
137pub(crate) fn decode_source_auto(input: &[u8]) -> DecodedSource<'_> {
138    match std::str::from_utf8(input) {
139        Ok(text) => DecodedSource {
140            encoding: SourceEncoding::Utf8,
141            text: Cow::Borrowed(text),
142            offset_map: OffsetMap::identity(text.len()),
143            diagnostics: Vec::new(),
144        },
145        Err(error) => decode_source_auto_with_error(input, error),
146    }
147}
148
149pub(crate) fn decode_owned_bytes_auto(input: Vec<u8>) -> DecodedOwnedSource {
150    match String::from_utf8(input) {
151        Ok(text) => {
152            let len = text.len();
153            DecodedOwnedSource {
154                encoding: SourceEncoding::Utf8,
155                text,
156                offset_map: OffsetMap::identity(len),
157                diagnostics: Vec::new(),
158            }
159        }
160        Err(error) => decode_source_auto(error.as_bytes()).into_owned(),
161    }
162}
163
164fn decode_source_auto_with_error(input: &[u8], utf8_error: Utf8Error) -> DecodedSource<'_> {
165    let sample = decode_auto_sample(input, utf8_error.valid_up_to());
166    let utf8_lossy_rank = decode_utf8_lossy_sample_rank(sample);
167    let cp932_rank = decode_non_utf8_sample_rank(sample, SourceEncoding::Cp932);
168    let gbk_rank = decode_non_utf8_sample_rank(sample, SourceEncoding::Gbk);
169    let (best_encoding, best_non_utf8_rank) = if cp932_rank <= gbk_rank {
170        (SourceEncoding::Cp932, cp932_rank)
171    } else {
172        (SourceEncoding::Gbk, gbk_rank)
173    };
174
175    if best_non_utf8_rank.0 == 0 && best_non_utf8_rank.1 < utf8_lossy_rank.1 {
176        let decoded = decode_source_with_encoding(input, best_encoding);
177        if decoded.diagnostics.is_empty() {
178            return decoded;
179        }
180    }
181
182    decode_lossy_utf8_with_error(input, utf8_error.valid_up_to() as u32, utf8_error)
183}
184
185pub(crate) fn decode_source_with_encoding(
186    input: &[u8],
187    encoding: SourceEncoding,
188) -> DecodedSource<'_> {
189    if matches!(encoding, SourceEncoding::Utf8) {
190        return match std::str::from_utf8(input) {
191            Ok(text) => DecodedSource {
192                encoding,
193                text: Cow::Borrowed(text),
194                offset_map: OffsetMap::identity(text.len()),
195                diagnostics: Vec::new(),
196            },
197            Err(error) => decode_lossy_utf8_with_error(input, error.valid_up_to() as u32, error),
198        };
199    }
200
201    let encoding_rs = encoding_rs_encoding(encoding);
202    if Encoding::ascii_valid_up_to(input) == input.len() {
203        let text = std::str::from_utf8(input).unwrap_or_default();
204        return DecodedSource {
205            encoding,
206            text: Cow::Borrowed(text),
207            offset_map: OffsetMap::identity(text.len()),
208            diagnostics: Vec::new(),
209        };
210    }
211
212    let (text, _, had_errors) = encoding_rs.decode(input);
213    let offset_map = if had_errors {
214        OffsetMap::from_decoded_text(text.as_ref(), input.len(), encoding)
215            .unwrap_or_else(|| OffsetMap::identity(text.len()))
216    } else {
217        OffsetMap::from_ascii_compatible_text(input, text.as_ref(), encoding)
218            .or_else(|| OffsetMap::from_decoded_text(text.as_ref(), input.len(), encoding))
219            .unwrap_or_else(|| OffsetMap::identity(text.len()))
220    };
221    let diagnostics = if had_errors {
222        vec![DecodeDiagnostic {
223            message: format!(
224                "source is not valid {}; decoded with replacement",
225                encoding.label()
226            )
227            .into(),
228            range: text_range(0, input.len() as u32),
229        }]
230    } else {
231        Vec::new()
232    };
233
234    DecodedSource {
235        encoding,
236        text,
237        offset_map,
238        diagnostics,
239    }
240}
241
242pub(crate) fn decode_owned_bytes_with_encoding(
243    input: Vec<u8>,
244    encoding: SourceEncoding,
245) -> DecodedOwnedSource {
246    if matches!(encoding, SourceEncoding::Utf8) {
247        return match String::from_utf8(input) {
248            Ok(text) => {
249                let len = text.len();
250                DecodedOwnedSource {
251                    encoding,
252                    text,
253                    offset_map: OffsetMap::identity(len),
254                    diagnostics: Vec::new(),
255                }
256            }
257            Err(error) => {
258                decode_source_with_encoding(error.as_bytes(), SourceEncoding::Utf8).into_owned()
259            }
260        };
261    }
262
263    if Encoding::ascii_valid_up_to(&input) == input.len() {
264        let text = String::from_utf8(input).unwrap_or_default();
265        let len = text.len();
266        return DecodedOwnedSource {
267            encoding,
268            text,
269            offset_map: OffsetMap::identity(len),
270            diagnostics: Vec::new(),
271        };
272    }
273
274    decode_source_with_encoding(&input, encoding).into_owned()
275}
276
277impl DecodedSource<'_> {
278    fn into_owned(self) -> DecodedOwnedSource {
279        DecodedOwnedSource {
280            encoding: self.encoding,
281            text: self.text.into_owned(),
282            offset_map: self.offset_map,
283            diagnostics: self.diagnostics,
284        }
285    }
286}
287
288fn sparse_display_to_source(
289    source_len: usize,
290    display_len: usize,
291    edits: &[SourceMapEdit],
292    offset: usize,
293) -> u32 {
294    let clamped = offset.min(display_len) as u32;
295    let Some(index) = edits
296        .partition_point(|edit| edit.display_start() <= clamped)
297        .checked_sub(1)
298    else {
299        return clamped;
300    };
301    let edit = edits[index];
302    if clamped == edit.display_start() {
303        return edit.source_start();
304    }
305    if clamped <= edit.display_end() {
306        return edit.source_end();
307    }
308    let mapped = (clamped as i64 - (edit.display_end() as i64 - edit.source_end() as i64))
309        .clamp(0, source_len as i64);
310    mapped as u32
311}
312
/// Selects the window of `input` used for encoding detection: it starts up to
/// 256 bytes before `valid_up_to` and spans at most 64 KiB.
fn decode_auto_sample(input: &[u8], valid_up_to: usize) -> &[u8] {
    const SAMPLE_PREFIX_CONTEXT: usize = 256;
    const SAMPLE_MAX_BYTES: usize = 64 * 1024;

    let window = &input[valid_up_to.saturating_sub(SAMPLE_PREFIX_CONTEXT)..];
    &window[..window.len().min(SAMPLE_MAX_BYTES)]
}
321
322fn decode_utf8_lossy_sample_rank(sample: &[u8]) -> (u8, usize, u8) {
323    let text = String::from_utf8_lossy(sample);
324    (
325        1,
326        suspicious_text_score(text.as_ref()),
327        decode_encoding_bias(SourceEncoding::Utf8),
328    )
329}
330
331fn decode_non_utf8_sample_rank(sample: &[u8], encoding: SourceEncoding) -> (u8, usize, u8) {
332    let (text, _, had_errors) = encoding_rs_encoding(encoding).decode(sample);
333    (
334        u8::from(had_errors),
335        suspicious_text_score(text.as_ref()),
336        decode_encoding_bias(encoding),
337    )
338}
339
340fn decode_encoding_bias(encoding: SourceEncoding) -> u8 {
341    match encoding {
342        SourceEncoding::Cp932 => 0,
343        SourceEncoding::Gbk => 1,
344        SourceEncoding::Utf8 => 2,
345    }
346}
347
348fn suspicious_text_score(text: &str) -> usize {
349    text.chars().map(suspicious_char_weight).sum()
350}
351
352fn suspicious_char_weight(ch: char) -> usize {
353    match ch {
354        '\u{FFFD}' => 1,
355        '\u{0080}'..='\u{009F}' => 1,
356        '\u{E000}'..='\u{F8FF}' => 1,
357        '\u{FF61}'..='\u{FF9F}' => 1,
358        _ => 0,
359    }
360}
361
362fn decode_lossy_utf8_with_error(
363    input: &[u8],
364    start: u32,
365    error: std::str::Utf8Error,
366) -> DecodedSource<'_> {
367    let end = error
368        .error_len()
369        .map_or(input.len() as u32, |len| start + len as u32);
370    let (text, offset_map) = decode_lossy_utf8_text_and_offset_map(input);
371
372    DecodedSource {
373        encoding: SourceEncoding::Utf8,
374        offset_map,
375        text: Cow::Owned(text),
376        diagnostics: vec![DecodeDiagnostic {
377            message: "source is not valid UTF-8; decoded lossily".into(),
378            range: text_range(start, end),
379        }],
380    }
381}
382
383fn decode_lossy_utf8_text_and_offset_map(input: &[u8]) -> (String, OffsetMap) {
384    let mut text = String::new();
385    let mut decoded_to_source = vec![0];
386    let mut source_to_decoded = vec![0; input.len() + 1];
387    let mut source_offset = 0usize;
388
389    while source_offset < input.len() {
390        match std::str::from_utf8(&input[source_offset..]) {
391            Ok(valid) => {
392                for ch in valid.chars() {
393                    append_decoded_char_mapping(
394                        &mut text,
395                        &mut decoded_to_source,
396                        &mut source_to_decoded,
397                        source_offset,
398                        ch.len_utf8(),
399                        ch,
400                    );
401                    source_offset += ch.len_utf8();
402                }
403                break;
404            }
405            Err(error) => {
406                let valid_up_to = error.valid_up_to();
407                if valid_up_to > 0 {
408                    let valid =
409                        std::str::from_utf8(&input[source_offset..source_offset + valid_up_to])
410                            .unwrap_or_default();
411                    for ch in valid.chars() {
412                        append_decoded_char_mapping(
413                            &mut text,
414                            &mut decoded_to_source,
415                            &mut source_to_decoded,
416                            source_offset,
417                            ch.len_utf8(),
418                            ch,
419                        );
420                        source_offset += ch.len_utf8();
421                    }
422                }
423
424                let invalid_len = error.error_len().unwrap_or(input.len() - source_offset);
425                append_decoded_char_mapping(
426                    &mut text,
427                    &mut decoded_to_source,
428                    &mut source_to_decoded,
429                    source_offset,
430                    invalid_len,
431                    char::REPLACEMENT_CHARACTER,
432                );
433                source_offset += invalid_len;
434            }
435        }
436    }
437
438    (
439        text,
440        OffsetMap {
441            kind: OffsetMapKind::Indexed {
442                decoded_to_source: decoded_to_source.into_boxed_slice(),
443                source_to_decoded: Arc::from(source_to_decoded),
444            },
445        },
446    )
447}
448
/// Appends `ch` to `text` and records that it came from the source bytes
/// `source_start..source_start + source_len`.
///
/// `decoded_to_source` is resized to cover the new end of `text`, and every
/// decoded offset after the character's start maps to the source end.
/// Likewise, every source offset after `source_start` up to the source end
/// maps to the decoded end; positions beyond `source_to_decoded` are ignored.
fn append_decoded_char_mapping(
    text: &mut String,
    decoded_to_source: &mut Vec<u32>,
    source_to_decoded: &mut [u32],
    source_start: usize,
    source_len: usize,
    ch: char,
) {
    let decoded_start = text.len();
    text.push(ch);
    let decoded_end = text.len();
    let source_end = source_start + source_len;

    decoded_to_source.resize(decoded_end + 1, source_end as u32);
    decoded_to_source[decoded_start + 1..].fill(source_end as u32);

    // Clamp to the table so an overshooting source range is silently truncated.
    let upper = source_to_decoded.len().min(source_end + 1);
    let lower = upper.min(source_start + 1);
    source_to_decoded[lower..upper].fill(decoded_end as u32);
}
479
480impl OffsetMap {
481    fn from_ascii_compatible_text(
482        input: &[u8],
483        text: &str,
484        encoding: SourceEncoding,
485    ) -> Option<Self> {
486        let mut source_offset = 0usize;
487        let mut display_offset = 0usize;
488        let mut edits = Vec::new();
489
490        while source_offset < input.len() || display_offset < text.len() {
491            let ascii_run = Encoding::ascii_valid_up_to(&input[source_offset..]);
492            if ascii_run > 0 {
493                source_offset += ascii_run;
494                display_offset += ascii_run;
495                continue;
496            }
497
498            let run_display_end = next_ascii_display_boundary(text, display_offset);
499            let display_len = run_display_end.saturating_sub(display_offset);
500            let display_run = &text[display_offset..run_display_end];
501            let source_len =
502                source_len_for_decoded_run(&input[source_offset..], display_run, encoding)?;
503            if source_len != display_len {
504                edits.push(SourceMapEdit::new(
505                    u32::try_from(source_offset).unwrap_or(u32::MAX),
506                    u32::try_from(source_offset + source_len).unwrap_or(u32::MAX),
507                    u32::try_from(display_offset).unwrap_or(u32::MAX),
508                    u32::try_from(run_display_end).unwrap_or(u32::MAX),
509                ));
510            }
511            source_offset += source_len;
512            display_offset = run_display_end;
513        }
514
515        if source_offset != input.len() || display_offset != text.len() {
516            return None;
517        }
518
519        if edits.is_empty() && input.len() == text.len() {
520            return Some(Self::identity(text.len()));
521        }
522
523        Some(Self {
524            kind: OffsetMapKind::Sparse {
525                source_len: input.len(),
526                display_len: text.len(),
527                edits: Arc::from(edits),
528            },
529        })
530    }
531}
532
/// Returns the byte offset of the first ASCII character at or after
/// `display_offset`, or `text.len()` when the rest of the text has none.
fn next_ascii_display_boundary(text: &str, display_offset: usize) -> usize {
    text[display_offset..]
        .find(|ch: char| ch.is_ascii())
        .map_or(text.len(), |relative| display_offset + relative)
}
543
544fn source_len_for_decoded_run(
545    input: &[u8],
546    display_run: &str,
547    encoding: SourceEncoding,
548) -> Option<usize> {
549    let mut decoder = encoding_rs_encoding(encoding).new_decoder_without_bom_handling();
550    let mut output = vec![0; display_run.len()];
551    let (result, read, written) =
552        decoder.decode_to_utf8_without_replacement(input, &mut output, false);
553
554    match result {
555        DecoderResult::InputEmpty | DecoderResult::OutputFull => (written == display_run.len()
556            && &output[..written] == display_run.as_bytes())
557        .then_some(read),
558        DecoderResult::Malformed(_, _) => None,
559    }
560}
561
impl SourceEncoding {
    /// Returns the lowercase name of this encoding, as used in diagnostics.
    #[must_use]
    pub const fn label(self) -> &'static str {
        match self {
            Self::Utf8 => "utf-8",
            Self::Cp932 => "cp932",
            Self::Gbk => "gbk",
        }
    }
}
572
573fn encoding_rs_encoding(encoding: SourceEncoding) -> &'static Encoding {
574    match encoding {
575        SourceEncoding::Utf8 => encoding_rs::UTF_8,
576        SourceEncoding::Cp932 => SHIFT_JIS,
577        SourceEncoding::Gbk => GBK,
578    }
579}
580
581fn source_len_for_char(ch: char, encoding: SourceEncoding) -> Option<usize> {
582    if matches!(encoding, SourceEncoding::Utf8) {
583        return Some(ch.len_utf8());
584    }
585
586    let mut text = String::new();
587    text.push(ch);
588    let (encoded, _, had_errors) = encoding_rs_encoding(encoding).encode(&text);
589    (!had_errors).then(|| encoded.len())
590}