1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
//! Emacs internal character encoding.
//!
//! Implements the encoding described in GNU Emacs `character.h` / `character.c`.
//! This is a UTF-8 superset: standard Unicode code points use normal UTF-8,
//! while raw bytes 0x80..0xFF are represented as "eight-bit" characters in the
//! range 0x3FFF80..0x3FFFFF, encoded with overlong 2-byte sequences (C0/C1
//! lead bytes) that are illegal in standard UTF-8.
//!
//! Encoding table:
//!
//! | Character Range | Bytes | Encoding |
//! |-------------------------|-------|-------------------------------------------------|
//! | U+0000 .. U+007F | 1 | 0xxxxxxx (ASCII) |
//! | U+0080 .. U+07FF | 2 | 110xxxxx 10xxxxxx (standard UTF-8) |
//! | U+0800 .. U+FFFF | 3 | 1110xxxx 10xxxxxx 10xxxxxx |
//! | U+10000 .. U+1FFFFF | 4 | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
//! | U+200000 .. U+3FFF7F | 5 | 11111000 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
//! | Raw byte 0x80..0xFF | 2 | 1100000x 10xxxxxx (overlong, NOT valid UTF-8) |
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
/// Largest character code representable in the Emacs internal encoding
/// (0x3FFFFF). This is also the code of raw byte 0xFF, the top of the
/// raw-byte range.
pub const MAX_CHAR: u32 = 0x3F_FFFF;
/// Largest Unicode code point (U+10FFFF).
pub const MAX_UNICODE_CHAR: u32 = 0x10_FFFF;
/// Largest character that uses the 5-byte encoding (0x3FFF7F).
/// Codes above this (0x3FFF80..=0x3FFFFF) are raw-byte ("eight-bit") characters.
pub const MAX_5_BYTE_CHAR: u32 = 0x3F_FF7F;
/// Maximum number of bytes needed to encode any Emacs character in multibyte
/// form.
pub const MAX_MULTIBYTE_LENGTH: usize = 5;
/// Offset added to a raw byte 0x80..=0xFF to obtain its character code, so
/// byte 0x80 maps to 0x3FFF80 and byte 0xFF maps to 0x3FFFFF.
const BYTE8_OFFSET: u32 = 0x3F_FF00;
// ---------------------------------------------------------------------------
// Raw-byte predicates and converters
// ---------------------------------------------------------------------------
/// Return `true` if `c` is a raw-byte ("eight-bit") character, i.e. its code
/// lies above [`MAX_5_BYTE_CHAR`] (0x3FFF80..=0x3FFFFF for valid characters).
///
/// The check is a single comparison and does not reject codes beyond
/// [`MAX_CHAR`]; such out-of-range values also yield `true`.
///
/// Declared `const` so the predicate can be used in constant contexts.
#[inline]
pub const fn char_byte8_p(c: u32) -> bool {
    c > MAX_5_BYTE_CHAR
}
/// Map a single byte to the Emacs character code that represents it.
///
/// ASCII bytes (0x00..=0x7F) map to themselves. High bytes (0x80..=0xFF)
/// map into the eight-bit range by adding `BYTE8_OFFSET`, giving
/// 0x3FFF80..=0x3FFFFF.
#[inline]
pub fn byte8_to_char(byte: u8) -> u32 {
    let code = u32::from(byte);
    match byte {
        0x00..=0x7F => code,
        _ => code + BYTE8_OFFSET,
    }
}
/// Recover the byte value carried by a raw-byte character code.
///
/// Intended for codes for which `char_byte8_p(c)` holds; those yield the
/// original byte 0x80..=0xFF. Any other code is simply truncated to its low
/// eight bits, which is the identity for ASCII.
#[inline]
pub fn char_to_byte8(c: u32) -> u8 {
    if !char_byte8_p(c) {
        // Not an eight-bit character: fall back to the low byte.
        return c as u8;
    }
    (c - BYTE8_OFFSET) as u8
}
// ---------------------------------------------------------------------------
// Encoding helpers
// ---------------------------------------------------------------------------
/// Length in bytes of the Emacs multibyte encoding of character `c`.
///
/// Codes up to [`MAX_5_BYTE_CHAR`] follow the UTF-8-style length ladder
/// (1 to 5 bytes). Everything above that is a raw-byte character, stored as
/// an overlong 2-byte sequence.
#[inline]
pub fn char_bytes(c: u32) -> usize {
    match c {
        0..=0x7F => 1,
        0x80..=0x7FF => 2,
        0x800..=0xFFFF => 3,
        0x1_0000..=0x1F_FFFF => 4,
        0x20_0000..=MAX_5_BYTE_CHAR => 5,
        // Raw byte: overlong 2-byte form.
        _ => 2,
    }
}
/// Write the Emacs multibyte encoding of `c` to the front of `buf` and return
/// how many bytes were written (1 to [`MAX_MULTIBYTE_LENGTH`]).
///
/// Counterpart of GNU `CHAR_STRING`. Raw-byte characters are emitted in their
/// overlong 2-byte form (`C0`/`C1` lead).
///
/// # Panics
///
/// Panics if `buf` is shorter than the encoding of `c`. A buffer of
/// [`MAX_MULTIBYTE_LENGTH`] bytes is always sufficient.
pub fn char_string(c: u32, buf: &mut [u8]) -> usize {
    // Continuation byte carrying the six bits of `c` that start at bit `shift`.
    let cont = |shift: u32| 0x80 | ((c >> shift) & 0x3F) as u8;

    match c {
        // ASCII.
        0..=0x7F => {
            buf[0] = c as u8;
            1
        }
        // Standard 2-byte UTF-8.
        0x80..=0x7FF => {
            buf[0] = 0xC0 | (c >> 6) as u8;
            buf[1] = cont(0);
            2
        }
        // 3-byte.
        0x800..=0xFFFF => {
            buf[0] = 0xE0 | (c >> 12) as u8;
            buf[1] = cont(6);
            buf[2] = cont(0);
            3
        }
        // 4-byte.
        0x1_0000..=0x1F_FFFF => {
            buf[0] = 0xF0 | (c >> 18) as u8;
            buf[1] = cont(12);
            buf[2] = cont(6);
            buf[3] = cont(0);
            4
        }
        // 5-byte Emacs extension (not a raw byte); the lead is always F8.
        0x20_0000..=MAX_5_BYTE_CHAR => {
            buf[0] = 0xF8;
            buf[1] = cont(18);
            buf[2] = cont(12);
            buf[3] = cont(6);
            buf[4] = cont(0);
            5
        }
        // Raw byte: the byte minus 0x80 is laid out like an overlong 2-byte
        // sequence, so 0x80 → [C0,80], 0xBF → [C0,BF], 0xC0 → [C1,80],
        // 0xFF → [C1,BF].
        _ => {
            let low7 = char_to_byte8(c).wrapping_sub(0x80);
            buf[0] = 0xC0 | (low7 >> 6);
            buf[1] = 0x80 | (low7 & 0x3F);
            2
        }
    }
}
// ---------------------------------------------------------------------------
// Decoding helpers
// ---------------------------------------------------------------------------
/// Tell whether `lead` is a lead byte of a raw-byte overlong sequence.
///
/// Raw bytes are stored as `[0xC0, 0x80..=0xBF]` or `[0xC1, 0x80..=0xBF]`.
/// In standard UTF-8 these would be the (illegal) overlong forms of
/// U+0000..U+003F and U+0040..U+007F respectively. In the Emacs encoding
/// they stand for raw bytes 0x80..=0xFF.
#[inline]
fn is_raw_byte_lead(lead: u8) -> bool {
    matches!(lead, 0xC0 | 0xC1)
}
/// Decode one character from `bytes`, returning `(char_code, bytes_consumed)`.
///
/// This mirrors GNU `STRING_CHAR_AND_LENGTH`. The input must be a valid Emacs
/// multibyte sequence (no bounds checking on continuation bytes beyond what the
/// lead byte promises).
pub fn string_char(bytes: &[u8]) -> (u32, usize) {
debug_assert!(!bytes.is_empty());
let b0 = bytes[0];
if b0 < 0x80 {
// ASCII
(b0 as u32, 1)
} else if is_raw_byte_lead(b0) && bytes.len() >= 2 && (bytes[1] & 0xC0) == 0x80 {
// Raw-byte overlong: 2 bytes → eight-bit character.
let raw = ((b0 & 0x01) << 6) | (bytes[1] & 0x3F);
(byte8_to_char(raw | 0x80), 2)
} else if b0 < 0xE0 && bytes.len() >= 2 && (bytes[1] & 0xC0) == 0x80 {
// 2-byte standard UTF-8
let c = ((b0 as u32 & 0x1F) << 6) | (bytes[1] as u32 & 0x3F);
(c, 2)
} else if b0 < 0xF0
&& bytes.len() >= 3
&& (bytes[1] & 0xC0) == 0x80
&& (bytes[2] & 0xC0) == 0x80
{
// 3-byte
let c =
((b0 as u32 & 0x0F) << 12) | ((bytes[1] as u32 & 0x3F) << 6) | (bytes[2] as u32 & 0x3F);
(c, 3)
} else if b0 < 0xF8
&& bytes.len() >= 4
&& (bytes[1] & 0xC0) == 0x80
&& (bytes[2] & 0xC0) == 0x80
&& (bytes[3] & 0xC0) == 0x80
{
// 4-byte
let c = ((b0 as u32 & 0x07) << 18)
| ((bytes[1] as u32 & 0x3F) << 12)
| ((bytes[2] as u32 & 0x3F) << 6)
| (bytes[3] as u32 & 0x3F);
(c, 4)
} else if b0 == 0xF8
&& bytes.len() >= 5
&& (bytes[1] & 0xC0) == 0x80
&& (bytes[2] & 0xC0) == 0x80
&& (bytes[3] & 0xC0) == 0x80
&& (bytes[4] & 0xC0) == 0x80
{
// 5-byte (F8 lead, Emacs extension)
let c = ((bytes[1] as u32 & 0x3F) << 18)
| ((bytes[2] as u32 & 0x3F) << 12)
| ((bytes[3] as u32 & 0x3F) << 6)
| (bytes[4] as u32 & 0x3F);
(c, 5)
} else {
// Invalid or truncated sequence — treat lead byte as raw byte
(byte8_to_char(b0), 1)
}
}
// ---------------------------------------------------------------------------
// Higher-level utilities
// ---------------------------------------------------------------------------
/// Number of characters encoded in the multibyte sequence `bytes`.
///
/// Walks the sequence one decoded character at a time with [`string_char`].
/// An empty slice contains zero characters.
pub fn chars_in_multibyte(bytes: &[u8]) -> usize {
    let mut remaining = bytes;
    let mut total = 0usize;
    while !remaining.is_empty() {
        let (_, width) = string_char(remaining);
        remaining = &remaining[width..];
        total += 1;
    }
    total
}
/// Translate a 0-based character index into a byte offset within `bytes`.
///
/// The result is the offset at which the `char_idx`-th character starts.
/// When `char_idx` is at or past the number of characters, the scan stops at
/// the end of the data and `bytes.len()` is returned.
pub fn char_to_byte_pos(bytes: &[u8], char_idx: usize) -> usize {
    let mut offset = 0usize;
    for _ in 0..char_idx {
        if offset >= bytes.len() {
            break;
        }
        let (_, width) = string_char(&bytes[offset..]);
        offset += width;
    }
    offset
}
/// Translate a byte offset within `bytes` into a character index.
///
/// Counts the characters that start before `byte_pos`. `byte_pos` is
/// expected to sit on a character boundary; if it falls inside a character,
/// that character is included in the count. Offsets past the end are treated
/// as `bytes.len()`.
pub fn byte_to_char_pos(bytes: &[u8], byte_pos: usize) -> usize {
    let limit = byte_pos.min(bytes.len());
    let mut offset = 0usize;
    let mut seen = 0usize;
    while offset < limit {
        let (_, width) = string_char(&bytes[offset..]);
        offset += width;
        seen += 1;
    }
    seen
}
/// Borrow `bytes` as a `&str` when it is well-formed standard UTF-8, or
/// return `None` otherwise.
///
/// No separate scan for raw-byte sequences is needed: their `C0`/`C1` lead
/// bytes are never valid in UTF-8, so the standard validator already rejects
/// them.
pub fn try_as_utf8(bytes: &[u8]) -> Option<&str> {
    match std::str::from_utf8(bytes) {
        Ok(text) => Some(text),
        Err(_) => None,
    }
}
/// Convert an Emacs-encoded byte sequence to a UTF-8 `String`.
///
/// Each character is decoded with [`string_char`]. Anything that is not a
/// Unicode scalar value is replaced with U+FFFD (REPLACEMENT CHARACTER):
/// raw-byte characters, surrogate code points, and codes above U+10FFFF.
pub fn to_utf8_lossy(bytes: &[u8]) -> String {
    // Plain UTF-8 input converts byte-for-byte, so the input length is a good
    // initial capacity and avoids repeated regrowth for the common case.
    let mut out = String::with_capacity(bytes.len());
    let mut rest = bytes;
    while !rest.is_empty() {
        let (code, width) = string_char(rest);
        let ch = if char_byte8_p(code) {
            char::REPLACEMENT_CHARACTER
        } else {
            // `None` here means a surrogate or a code beyond Unicode.
            char::from_u32(code).unwrap_or(char::REPLACEMENT_CHARACTER)
        };
        out.push(ch);
        rest = &rest[width..];
    }
    out
}
/// Produce the Emacs internal encoding of a UTF-8 string.
///
/// Standard UTF-8 is a subset of the Emacs encoding, so no transformation is
/// performed: the returned bytes are an owned, byte-for-byte copy of `s`.
pub fn utf8_to_emacs(s: &str) -> Vec<u8> {
    let mut encoded = Vec::with_capacity(s.len());
    encoded.extend_from_slice(s.as_bytes());
    encoded
}
// ---------------------------------------------------------------------------
// Tests (in separate file per project convention)
// ---------------------------------------------------------------------------
#[cfg(test)]
#[path = "emacs_char_test.rs"]
mod tests;