Skip to main content

perl_position_tracking/
convert.rs

1//! UTF-8/UTF-16 position conversion functions.
2//!
3//! The helpers in this module follow Language Server Protocol (LSP) semantics,
4//! where lines and columns are zero-based and columns are measured in UTF-16
5//! code units.
6
/// Converts a byte offset into `(line, column_utf16)` coordinates.
///
/// Lines and columns are zero-based. Columns count UTF-16 code units from the
/// start of the line and never include the line terminator (`\n` or `\r\n`).
///
/// The returned position is always a valid one:
///
/// - Offsets beyond the end of the document are clamped to the end of the
///   document, so they produce the same result as `text.len()`.
/// - The end of a document that finishes with a newline is reported as
///   column 0 of the following (empty) line.
/// - An offset inside a multi-byte UTF-8 sequence is moved back to the start
///   of that character, so a surrogate pair is never split.
/// - An offset inside a `\r\n` terminator is reported as the end of the
///   line's content.
pub fn offset_to_utf16_line_col(text: &str, offset: usize) -> (u32, u32) {
    // Clamp up front so every out-of-range offset behaves exactly like the
    // end-of-document offset, including when the text ends with a newline.
    let offset = offset.min(text.len());

    let mut line_start = 0usize;
    let mut line_idx = 0usize;
    for line in text.split_inclusive('\n') {
        let line_end = line_start + line.len();

        // `None` means the segment has no `\n`: it is the final line of a
        // document without a trailing newline, so it also owns `text.len()`.
        let terminated_content = line
            .strip_suffix('\n')
            .map(|body| body.strip_suffix('\r').unwrap_or(body));

        if offset < line_end || terminated_content.is_none() {
            let content = terminated_content.unwrap_or(line);

            // Keep the column within the line's content; bytes of the
            // terminator itself are not addressable columns.
            let mut rel = (offset - line_start).min(content.len());

            // Clamp to the previous Unicode scalar boundary. Returning a
            // half-surrogate UTF-16 column would violate LSP invariants.
            while !content.is_char_boundary(rel) {
                rel -= 1;
            }

            let column = content[..rel].encode_utf16().count();
            return (line_idx as u32, column as u32);
        }

        line_start = line_end;
        line_idx += 1;
    }

    // Reached only when the text is empty or ends with a newline: the offset
    // sits at the start of the (empty) line following the last terminator.
    (line_idx as u32, 0)
}
48
/// Converts `(line, column_utf16)` coordinates into a byte offset.
///
/// `line` and `col` are zero-based; `col` counts UTF-16 code units from the
/// start of the line.
///
/// Out-of-range input is clamped rather than rejected:
///
/// - A `line` past the last line yields `text.len()`.
/// - A `col` past the end of the line yields the offset of the end of the
///   line's content, i.e. just before its `\n` or `\r\n` terminator. The
///   result never lands between the `\r` and `\n` of a CRLF pair.
/// - A `col` that points into the middle of a surrogate pair yields the
///   offset of the start of that character.
///
/// The returned offset is always on a UTF-8 character boundary of `text`.
pub fn utf16_line_col_to_offset(text: &str, line: u32, col: u32) -> usize {
    // Widen the inputs instead of narrowing the counters, so neither the line
    // index nor the running UTF-16 count can wrap.
    let target_line = line as usize;
    let target_col = col as usize;

    let mut line_start = 0usize;
    for (idx, raw) in text.split_inclusive('\n').enumerate() {
        if idx != target_line {
            line_start += raw.len();
            continue;
        }

        // Columns address the line's content only; strip `\n` or `\r\n`.
        let content = raw
            .strip_suffix('\n')
            .map_or(raw, |body| body.strip_suffix('\r').unwrap_or(body));

        let mut units = 0usize;
        for (byte_idx, ch) in content.char_indices() {
            let next_units = units + ch.len_utf16();
            // True both when `col` is exactly at this character and when it
            // falls on the low half of this character's surrogate pair.
            if target_col < next_units {
                return line_start + byte_idx;
            }
            units = next_units;
        }

        // Column is at or beyond the end of the line: clamp to content end.
        return line_start + content.len();
    }

    // Requested line does not exist: clamp to the end of the document.
    text.len()
}
80
#[cfg(test)]
mod tests {
    use super::{offset_to_utf16_line_col, utf16_line_col_to_offset};

    /// Checks `offset_to_utf16_line_col` against `(offset, expected)` pairs.
    fn check_offsets(text: &str, cases: &[(usize, (u32, u32))]) {
        for &(offset, expected) in cases {
            assert_eq!(
                offset_to_utf16_line_col(text, offset),
                expected,
                "offset {} in {:?}",
                offset,
                text
            );
        }
    }

    /// Checks `utf16_line_col_to_offset` against `((line, col), expected)` pairs.
    fn check_positions(text: &str, cases: &[((u32, u32), usize)]) {
        for &((line, col), expected) in cases {
            assert_eq!(
                utf16_line_col_to_offset(text, line, col),
                expected,
                "position ({}, {}) in {:?}",
                line,
                col,
                text
            );
        }
    }

    #[test]
    fn offset_to_utf16_clamps_mid_codepoint_offsets_to_previous_boundary() {
        // Byte offsets 1..=3 all fall inside the emoji's four-byte UTF-8
        // encoding, so each must snap back to column 0 rather than report a
        // column that would split the surrogate pair.
        check_offsets("💖z", &[(1, (0, 0)), (2, (0, 0)), (3, (0, 0))]);
    }

    #[test]
    fn offset_to_utf16_handles_multibyte_and_surrogate_pairs() {
        let text = "aé💖z";
        check_offsets(
            text,
            &[
                (0, (0, 0)),
                (1, (0, 1)),
                (3, (0, 2)),
                (7, (0, 4)),
                (text.len(), (0, 5)),
            ],
        );
    }

    #[test]
    fn offset_to_utf16_clamps_out_of_bounds_to_last_position() {
        let text = "alpha\nbeta";
        check_offsets(text, &[(text.len() + 25, (1, 4))]);
    }

    #[test]
    fn offset_to_utf16_reports_new_empty_line_for_terminal_newline() {
        let text = "one\ntwo\n";
        check_offsets(text, &[(text.len(), (2, 0))]);
    }

    #[test]
    fn utf16_line_col_to_offset_handles_split_surrogate_column() {
        // Column 2 points at the low surrogate of the emoji and must resolve
        // to the start of that character (byte 1).
        check_positions(
            "x💖y",
            &[((0, 0), 0), ((0, 1), 1), ((0, 2), 1), ((0, 3), 5), ((0, 4), 6)],
        );
    }

    #[test]
    fn utf16_line_col_to_offset_clamps_when_column_or_line_is_too_large() {
        let text = "abc\nw💡";
        check_positions(text, &[((1, 99), text.len()), ((99, 0), text.len())]);
    }

    #[test]
    fn utf16_helpers_handle_crlf_and_surrogates_together() {
        let text = "a\r\nb💖c\r\n";
        check_offsets(text, &[(3, (1, 0)), (8, (1, 3))]);
        check_positions(text, &[((1, 2), 4), ((1, 3), 8)]);
    }
}
145}