1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
use super::utils::{char_orig_len, handle_control_char};
use super::{ControlCharStyle, InvalidSeqStyle, Snippet, char_should_be_replaced};
impl Snippet {
    /// Builds a [`Snippet`] from a sequence of UTF-16 words that is not
    /// required to be well-formed.
    ///
    /// # Source units and spans
    ///
    /// One *source unit* for this constructor is one **16-bit word** of the
    /// original `source`. Annotation spans supplied afterwards (as
    /// `Range<usize>`) are therefore measured in words of that original
    /// sequence.
    ///
    /// # Line breaks
    ///
    /// - Both `\n` and `\r\n` end the current line.
    /// - A `\r` that is not immediately followed by `\n` does *not* end the
    ///   line; it goes through the same path as every other control
    ///   character.
    ///
    /// # Control characters
    ///
    /// A character counts as a control character when
    /// [`char_should_be_replaced()`](crate::char_should_be_replaced)
    /// returns `true` for it.
    ///
    /// - A tab (U+0009) becomes `tab_width` spaces.
    /// - ZERO WIDTH JOINER (U+200D) produces no output text, although its
    ///   original source unit length is still accounted for.
    /// - With [`ControlCharStyle::Replacement`], the C0 controls (U+0000 to
    ///   U+001F, tab excluded) and DEL (U+007F) are shown as their Unicode
    ///   Control Pictures (␀, ␁, ...).
    /// - Every other control character, as well as the C0 controls under
    ///   [`ControlCharStyle::Codepoint`], is shown as its code point in
    ///   hexadecimal, at least four digits wide, inside angle brackets
    ///   (`<U+XXXX>`).
    ///
    /// If `control_char_alt` is `true`, the text generated for control
    /// characters is marked as alternate text. Tabs are the exception: the
    /// spaces they expand to are never marked as alternate.
    ///
    /// # Invalid UTF-16
    ///
    /// Every unpaired surrogate word is rendered on its own:
    ///
    /// - With [`InvalidSeqStyle::Replacement`], as the Unicode Replacement
    ///   Character (U+FFFD, `�`).
    /// - With [`InvalidSeqStyle::Hexadecimal`], as its four-digit hexadecimal
    ///   value inside angle brackets (`<XXXX>`).
    ///
    /// These fragments are marked as "alternate" text when `invalid_seq_alt`
    /// is `true`.
    pub fn with_utf16_words<I>(
        start_line: usize,
        source: I,
        tab_width: usize,
        control_char_style: ControlCharStyle,
        control_char_alt: bool,
        invalid_seq_style: InvalidSeqStyle,
        invalid_seq_alt: bool,
    ) -> Self
    where
        I: IntoIterator<Item = u16>,
    {
        let mut builder = Snippet::builder(start_line);
        // A single item of lookahead is all that is needed to tell a `\r\n`
        // pair apart from a lone `\r`.
        let mut decoded = char::decode_utf16(source).peekable();

        while let Some(item) = decoded.next() {
            match item {
                // `\r\n`: a single line break spanning two source words.
                Ok('\r') if matches!(decoded.peek(), Some(Ok('\n'))) => {
                    builder.next_line(2);
                    // Drop the `\n` that was just accounted for.
                    decoded.next();
                }
                Ok('\n') => {
                    builder.next_line(1);
                }
                // Tabs expand to spaces and are never flagged as alternate.
                Ok('\t') => {
                    builder.push_spaces(tab_width, 1, false);
                }
                // Remaining control characters, including a lone `\r`.
                Ok(ctrl) if char_should_be_replaced(ctrl) => {
                    handle_control_char::<16>(
                        &mut builder,
                        ctrl,
                        control_char_style,
                        control_char_alt,
                    );
                }
                // Ordinary character: emitted unchanged.
                Ok(plain) => {
                    let orig_len = char_orig_len::<16>(plain);
                    builder.push_char(plain, orig_len, false);
                }
                // Unpaired surrogate: always exactly one source word.
                Err(err) => match invalid_seq_style {
                    InvalidSeqStyle::Replacement => {
                        builder.push_char('\u{FFFD}', 1, invalid_seq_alt);
                    }
                    InvalidSeqStyle::Hexadecimal => {
                        let word = err.unpaired_surrogate();
                        builder.push_fmt(format_args!("<{word:04X}>"), 1, invalid_seq_alt);
                    }
                },
            }
        }

        builder.finish()
    }
}