Skip to main content

rumdl_lib/utils/
utf8_offsets.rs

1//! UTF-8 byte offset to character offset conversion utilities.
2//!
3//! JavaScript uses UTF-16 code units for string indexing, while Rust uses bytes.
4//! This module provides functions to convert between byte offsets and character
5//! offsets for proper interoperability with JavaScript/WASM environments.
6
/// Convert a byte offset to a character offset in a UTF-8 string.
///
/// JavaScript uses UTF-16 code units for string indexing, while Rust uses bytes.
/// For most characters this is the same, but multi-byte UTF-8 characters
/// (like `æ` = 2 bytes, emoji = 4 bytes) need conversion.
///
/// The result is the number of `char`s (Unicode scalar values) that start
/// before `byte_offset`. Consequently:
/// * an offset at or past the end of `content` yields the total character count;
/// * an offset that falls inside a multi-byte character counts that character.
///
/// NOTE(review): characters outside the Basic Multilingual Plane (e.g. most
/// emoji) are one `char` here but two UTF-16 code units, so for such text the
/// result is not a UTF-16 index — confirm this matches what the JavaScript
/// side expects.
///
/// # Arguments
/// * `content` - The UTF-8 string
/// * `byte_offset` - The byte offset to convert
///
/// # Returns
/// The corresponding character offset
///
/// # Examples
/// ```
/// use rumdl::utils::utf8_offsets::byte_offset_to_char_offset;
///
/// // ASCII: bytes == characters
/// assert_eq!(byte_offset_to_char_offset("Hello", 5), 5);
///
/// // Norwegian æ is 2 bytes in UTF-8, 1 character
/// assert_eq!(byte_offset_to_char_offset("æ", 2), 1);
///
/// // Mixed content
/// let content = "Hello æ"; // 6 bytes + 2 bytes = 8 bytes, 7 characters
/// assert_eq!(byte_offset_to_char_offset(content, 8), 7);
/// ```
pub fn byte_offset_to_char_offset(content: &str, byte_offset: usize) -> usize {
    // Counting the characters whose starting byte lies before the offset covers
    // every case without special-casing: offset 0 stops immediately, and an
    // offset at or beyond `content.len()` consumes the whole string.
    content
        .char_indices()
        .take_while(|&(start, _)| start < byte_offset)
        .count()
}

/// Convert a 1-indexed byte column to a 1-indexed character column within a line.
///
/// This is used to convert column positions in warnings from byte offsets
/// to character offsets for JavaScript compatibility.
///
/// Byte columns `0` and `1` both map to character column `1`. A byte column
/// past the end of the line maps to one past the last character column.
///
/// # Arguments
/// * `line_content` - The content of the specific line
/// * `byte_column` - The 1-indexed byte column within the line
///
/// # Returns
/// The corresponding 1-indexed character column
///
/// # Examples
/// ```
/// use rumdl::utils::utf8_offsets::byte_column_to_char_column;
///
/// // Each of æ, ø, å is 2 bytes, so byte column 3 is the second character
/// assert_eq!(byte_column_to_char_column("æøå", 3), 2);
/// ```
pub fn byte_column_to_char_column(line_content: &str, byte_column: usize) -> usize {
    // 1-indexed column -> 0-indexed offset. `saturating_sub` folds the
    // out-of-range column 0 onto offset 0, so it is reported as column 1 too.
    let byte_offset = byte_column.saturating_sub(1);

    // 0-indexed character offset -> 1-indexed character column.
    byte_offset_to_char_offset(line_content, byte_offset) + 1
}

/// Get the content of a specific line (1-indexed) from the full content.
///
/// Lines are split with [`str::lines`], so the returned slice does not include
/// the line terminator.
///
/// # Returns
/// `Some(line)` for an existing line, or `None` when `line_number` is `0` or
/// greater than the number of lines in `content`.
pub fn get_line_content(content: &str, line_number: usize) -> Option<&str> {
    // `checked_sub` yields `None` for line 0, which `?` propagates directly.
    let index = line_number.checked_sub(1)?;
    content.lines().nth(index)
}
84
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared fixture: the last line of the document from the original bug report.
    const NORWEGIAN_LINE: &str = "Content with Norwegian letter \"æ\".";

    /// Shared fixture: the full document from the original bug report.
    const NORWEGIAN_DOC: &str = "# Heading\n\nContent with Norwegian letter \"æ\".";

    /// Assert the UTF-8 byte length and the `char` count of `text`.
    fn check_size(text: &str, bytes: usize, chars: usize) {
        assert_eq!(text.len(), bytes, "byte length of {text:?}");
        assert_eq!(text.chars().count(), chars, "char count of {text:?}");
    }

    /// Assert every `(byte offset, expected char offset)` pair against `text`.
    fn check_offsets(text: &str, pairs: &[(usize, usize)]) {
        for &(byte_offset, char_offset) in pairs {
            assert_eq!(
                byte_offset_to_char_offset(text, byte_offset),
                char_offset,
                "byte offset {byte_offset} in {text:?}"
            );
        }
    }

    /// Assert every `(byte column, expected char column)` pair against `line`.
    fn check_columns(line: &str, pairs: &[(usize, usize)]) {
        for &(byte_column, char_column) in pairs {
            assert_eq!(
                byte_column_to_char_column(line, byte_column),
                char_column,
                "byte column {byte_column} in {line:?}"
            );
        }
    }

    /// Assert every `(1-indexed line number, expected line)` pair against `text`.
    fn check_lines(text: &str, pairs: &[(usize, Option<&str>)]) {
        for &(line_number, expected) in pairs {
            assert_eq!(
                get_line_content(text, line_number),
                expected,
                "line {line_number} of {text:?}"
            );
        }
    }

    #[test]
    fn test_byte_offset_to_char_offset_empty() {
        check_offsets("", &[(0, 0), (1, 0)]);
    }

    #[test]
    fn test_byte_offset_to_char_offset_ascii() {
        // Pure ASCII: byte offsets and character offsets coincide;
        // the last pair lies beyond the end of the string.
        check_offsets("Hello World", &[(0, 0), (5, 5), (11, 11), (100, 11)]);
    }

    #[test]
    fn test_byte_offset_to_char_offset_norwegian() {
        // A lone æ: two bytes, one character.
        let text = "æ";
        check_size(text, 2, 1);
        check_offsets(text, &[(0, 0), (2, 1)]);
    }

    #[test]
    fn test_byte_offset_to_char_offset_mixed() {
        // ASCII around a single two-byte æ (bytes 6..8, char index 6).
        let text = "Hello æ world";
        check_size(text, 14, 13);
        check_offsets(
            text,
            &[
                (6, 6),   // start of æ
                (8, 7),   // space following æ
                (14, 13), // end of string
            ],
        );
    }

    #[test]
    fn test_byte_offset_to_char_offset_emoji() {
        // "Hi " is 3 bytes, the waving hand is 4 bytes.
        let text = "Hi 👋";
        check_size(text, 7, 4);
        check_offsets(
            text,
            &[
                (3, 3), // just before the emoji
                (7, 4), // end of string
            ],
        );
    }

    #[test]
    fn test_byte_offset_to_char_offset_norwegian_sentence() {
        // The exact bug case: a Norwegian letter near the end of the file.
        check_size(NORWEGIAN_DOC, 46, 45);

        // End of file: byte offset 46 is character offset 45.
        check_offsets(NORWEGIAN_DOC, &[(46, 45)]);
    }

    #[test]
    fn test_byte_offset_to_char_offset_multiple_multibyte() {
        // Three é (2 bytes each) among eight ASCII characters.
        let text = "café résumé";
        check_size(text, 14, 11);
        check_offsets(
            text,
            &[
                (0, 0),
                (3, 3),   // before the first é
                (5, 4),   // after the first é
                (14, 11), // end of string
            ],
        );
    }

    #[test]
    fn test_byte_column_to_char_column() {
        // 35 bytes, 34 characters: æ accounts for the difference.
        check_size(NORWEGIAN_LINE, 35, 34);
        check_columns(
            NORWEGIAN_LINE,
            &[
                (1, 1),   // column 1 stays 1
                (30, 30), // still in the ASCII prefix
                (32, 32), // æ itself (char index 31)
                (34, 33), // closing quote after æ (char index 32)
                (36, 35), // one past the end of the line
            ],
        );
    }

    #[test]
    fn test_byte_column_to_char_column_edge_cases() {
        // Empty line: both column 1 and the out-of-range column 0 give 1.
        check_columns("", &[(1, 1), (0, 1)]);

        // ASCII only: nothing to convert, including one past the end.
        check_columns("Hello World", &[(1, 1), (6, 6), (12, 12)]);

        // Consecutive two-byte characters.
        let nordic = "æøå";
        check_size(nordic, 6, 3);
        check_columns(
            nordic,
            &[
                (1, 1), // start of æ
                (3, 2), // start of ø
                (5, 3), // start of å
                (7, 4), // past the end
            ],
        );

        // A four-byte emoji followed by ASCII.
        let wave = "Hi 👋!";
        check_size(wave, 8, 5);
        check_columns(
            wave,
            &[
                (4, 4), // start of the emoji
                (8, 5), // the "!"
                (9, 6), // past the end
            ],
        );

        // Only three-byte characters.
        let kanji = "日本語";
        check_size(kanji, 9, 3);
        check_columns(kanji, &[(1, 1), (4, 2), (7, 3), (10, 4)]);
    }

    #[test]
    fn test_byte_column_to_char_column_bug_scenario() {
        // Scenario from issue #4: a warning at the end of a line containing æ.
        // MD047 reports byte column 36, which must become character column 35.
        let byte_column_at_end = NORWEGIAN_LINE.len() + 1; // 36
        let expected_char_column = NORWEGIAN_LINE.chars().count() + 1; // 35

        let actual_char_column = byte_column_to_char_column(NORWEGIAN_LINE, byte_column_at_end);
        assert_eq!(
            actual_char_column, expected_char_column,
            "End-of-line column should be converted from byte {byte_column_at_end} to char {expected_char_column}"
        );

        // Combined with the line's start position in the full document
        // ("# Heading\n\n" is 11 characters), the fix must land on 45, not 46.
        let line_from = 11_usize;
        let column_offset = expected_char_column - 1;
        let from_position = line_from + column_offset;
        assert_eq!(from_position, 45, "Fix position should be 45, not 46");
    }

    #[test]
    fn test_get_line_content() {
        check_lines(
            NORWEGIAN_DOC,
            &[
                (1, Some("# Heading")),
                (2, Some("")),
                (3, Some("Content with Norwegian letter \"æ\".")),
                (4, None),
                (0, None),
            ],
        );
    }

    #[test]
    fn test_get_line_content_edge_cases() {
        // Empty content has no lines at all.
        check_lines("", &[(1, None), (0, None)]);

        // A single line without a trailing newline.
        check_lines("Hello", &[(1, Some("Hello")), (2, None)]);

        // Nothing but newlines: three empty lines.
        check_lines(
            "\n\n\n",
            &[(1, Some("")), (2, Some("")), (3, Some("")), (4, None)],
        );

        // Lines made of various multi-byte characters.
        check_lines(
            "Line 1\næøå\n日本語\n👋🎉",
            &[
                (1, Some("Line 1")),
                (2, Some("æøå")),
                (3, Some("日本語")),
                (4, Some("👋🎉")),
                (5, None),
            ],
        );
    }
}