// perl_position_tracking/mapper.rs

//! Centralized position mapping for correct LSP position handling
//!
//! Handles:
//! - CRLF/LF/CR line endings
//! - UTF-16 code units (LSP protocol)
//! - Byte offsets (parser)
//! - Efficient conversions using rope data structure

use crate::WirePosition as Position;
use ropey::Rope;
use serde_json::Value;

13/// Centralized position mapper using rope for efficiency.
14///
15/// Converts between byte offsets (used by the parser) and LSP positions
16/// (line/character in UTF-16 code units) while handling mixed line endings.
17///
18/// # Examples
19///
20/// ```
21/// use perl_position_tracking::PositionMapper;
22///
23/// let text = "my $x = 1;\nmy $y = 2;\n";
24/// let mapper = PositionMapper::new(text);
25///
26/// // Convert byte offset 0 → LSP position (line 0, char 0)
27/// let pos = mapper.byte_to_lsp_pos(0);
28/// assert_eq!(pos.line, 0);
29/// assert_eq!(pos.character, 0);
30///
31/// // Second line starts at byte 11
32/// let pos = mapper.byte_to_lsp_pos(11);
33/// assert_eq!(pos.line, 1);
34/// assert_eq!(pos.character, 0);
35/// ```
36pub struct PositionMapper {
37    /// The rope containing the document text
38    rope: Rope,
39    /// Cache of line ending style
40    line_ending: LineEnding,
41}
42
/// Line ending style detected in a document.
///
/// `Mixed` is reported whenever more than one of the other styles occurs in
/// the same text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LineEnding {
    /// Unix-style line endings (LF only)
    Lf,
    /// Windows-style line endings (CRLF)
    CrLf,
    /// Classic Mac line endings (CR only)
    Cr,
    /// Mixed line endings detected
    Mixed,
}

56impl PositionMapper {
57    /// Create a new position mapper from text.
58    ///
59    /// Detects line endings and builds an internal rope for efficient
60    /// position conversions.
61    ///
62    /// # Examples
63    ///
64    /// ```
65    /// use perl_position_tracking::PositionMapper;
66    ///
67    /// let mapper = PositionMapper::new("print 'hello';\n");
68    /// let pos = mapper.byte_to_lsp_pos(6);
69    /// assert_eq!(pos.line, 0);
70    /// assert_eq!(pos.character, 6);
71    /// ```
72    pub fn new(text: &str) -> Self {
73        let rope = Rope::from_str(text);
74        let line_ending = detect_line_ending(text);
75        Self { rope, line_ending }
76    }
77
78    /// Update the text content
79    pub fn update(&mut self, text: &str) {
80        self.rope = Rope::from_str(text);
81        self.line_ending = detect_line_ending(text);
82    }
83
84    /// Apply an incremental edit
85    pub fn apply_edit(&mut self, start_byte: usize, end_byte: usize, new_text: &str) {
86        // Clamp to valid range
87        let start_byte = start_byte.min(self.rope.len_bytes());
88        let end_byte = end_byte.min(self.rope.len_bytes());
89
90        // Convert byte offsets to char indices (rope uses chars!)
91        let start_char = self.rope.byte_to_char(start_byte);
92        let end_char = self.rope.byte_to_char(end_byte);
93
94        // Remove old text
95        if end_char > start_char {
96            self.rope.remove(start_char..end_char);
97        }
98
99        // Insert new text
100        if !new_text.is_empty() {
101            self.rope.insert(start_char, new_text);
102        }
103
104        // Update line ending detection
105        self.line_ending = detect_line_ending(&self.rope.to_string());
106    }
107
108    /// Convert LSP position to byte offset.
109    ///
110    /// Takes a line/character position (UTF-16 code units, as specified by the
111    /// LSP protocol) and returns the corresponding byte offset in the source.
112    ///
113    /// # Examples
114    ///
115    /// ```
116    /// use perl_position_tracking::{PositionMapper, WirePosition};
117    ///
118    /// let mapper = PositionMapper::new("my $x = 1;\nmy $y = 2;\n");
119    /// // Line 1, character 3 → "$y"
120    /// let byte = mapper.lsp_pos_to_byte(WirePosition { line: 1, character: 3 });
121    /// assert_eq!(byte, Some(14));
122    /// ```
123    pub fn lsp_pos_to_byte(&self, pos: Position) -> Option<usize> {
124        let line_idx = pos.line as usize;
125        if line_idx >= self.rope.len_lines() {
126            return None;
127        }
128
129        let line_start_byte = self.rope.line_to_byte(line_idx);
130        let line = self.rope.line(line_idx);
131
132        // Convert UTF-16 code units to byte offset
133        let mut utf16_offset = 0u32;
134        let mut byte_offset = 0;
135
136        for ch in line.chars() {
137            if utf16_offset >= pos.character {
138                break;
139            }
140
141            let ch_utf16_len = if ch as u32 > 0xFFFF { 2 } else { 1 };
142            let next_utf16 = utf16_offset + ch_utf16_len;
143
144            // Clamp positions inside a surrogate pair to the start of the
145            // code point, matching `utf16_line_col_to_offset`.
146            if next_utf16 > pos.character {
147                break;
148            }
149
150            utf16_offset = next_utf16;
151            byte_offset += ch.len_utf8();
152        }
153
154        Some(line_start_byte + byte_offset)
155    }
156
157    /// Convert byte offset to LSP position.
158    ///
159    /// Returns line/character (UTF-16 code units) suitable for LSP responses.
160    ///
161    /// # Examples
162    ///
163    /// ```
164    /// use perl_position_tracking::PositionMapper;
165    ///
166    /// let mapper = PositionMapper::new("sub foo {\n    return 1;\n}\n");
167    /// let pos = mapper.byte_to_lsp_pos(14);  // points into "return"
168    /// assert_eq!(pos.line, 1);
169    /// assert_eq!(pos.character, 4);
170    /// ```
171    pub fn byte_to_lsp_pos(&self, byte_offset: usize) -> Position {
172        let byte_offset = byte_offset.min(self.rope.len_bytes());
173
174        let line_idx = self.rope.byte_to_line(byte_offset);
175        let line_start_byte = self.rope.line_to_byte(line_idx);
176        let byte_in_line = byte_offset - line_start_byte;
177
178        // Convert byte offset to UTF-16 code units
179        let line = self.rope.line(line_idx);
180        let mut utf16_offset = 0u32;
181        let mut current_byte = 0;
182
183        for ch in line.chars() {
184            if current_byte >= byte_in_line {
185                break;
186            }
187            let ch_len = ch.len_utf8();
188            if current_byte + ch_len > byte_in_line {
189                // We're in the middle of this character
190                break;
191            }
192            current_byte += ch_len;
193            let ch_utf16_len = if ch as u32 > 0xFFFF { 2 } else { 1 };
194            utf16_offset += ch_utf16_len;
195        }
196
197        Position { line: line_idx as u32, character: utf16_offset }
198    }
199
200    /// Get the text content
201    pub fn text(&self) -> String {
202        self.rope.to_string()
203    }
204
205    /// Get a slice of text
206    pub fn slice(&self, start_byte: usize, end_byte: usize) -> String {
207        let start = start_byte.min(self.rope.len_bytes());
208        let end = end_byte.min(self.rope.len_bytes());
209        self.rope.slice(self.rope.byte_to_char(start)..self.rope.byte_to_char(end)).to_string()
210    }
211
212    /// Get total byte length
213    pub fn len_bytes(&self) -> usize {
214        self.rope.len_bytes()
215    }
216
217    /// Get total number of lines
218    pub fn len_lines(&self) -> usize {
219        self.rope.len_lines()
220    }
221
222    /// Convert LSP position to char index (for rope operations)
223    pub fn lsp_pos_to_char(&self, pos: Position) -> Option<usize> {
224        self.lsp_pos_to_byte(pos).map(|byte| self.rope.byte_to_char(byte))
225    }
226
227    /// Convert char index to LSP position
228    pub fn char_to_lsp_pos(&self, char_idx: usize) -> Position {
229        let byte_offset = self.rope.char_to_byte(char_idx);
230        self.byte_to_lsp_pos(byte_offset)
231    }
232
233    /// Check if empty
234    pub fn is_empty(&self) -> bool {
235        self.rope.len_bytes() == 0
236    }
237
238    /// Get line ending style
239    pub fn line_ending(&self) -> LineEnding {
240        self.line_ending
241    }
242}
243
244/// Convert JSON LSP position to our Position type.
245///
246/// Extracts line and character fields from a JSON object.
247pub fn json_to_position(pos: &Value) -> Option<Position> {
248    Some(Position {
249        line: pos["line"].as_u64()? as u32,
250        character: pos["character"].as_u64()? as u32,
251    })
252}
253
254/// Convert Position to JSON for LSP.
255///
256/// Creates a JSON object with line and character fields.
257pub fn position_to_json(pos: Position) -> Value {
258    serde_json::json!({
259        "line": pos.line,
260        "character": pos.character
261    })
262}
263
264/// Detect the predominant line ending style
265fn detect_line_ending(text: &str) -> LineEnding {
266    let mut crlf_count = 0;
267    let mut lf_count = 0;
268    let mut cr_count = 0;
269
270    let bytes = text.as_bytes();
271    let mut i = 0;
272    while i < bytes.len() {
273        if i + 1 < bytes.len() && bytes[i] == b'\r' && bytes[i + 1] == b'\n' {
274            crlf_count += 1;
275            i += 2;
276        } else if bytes[i] == b'\n' {
277            lf_count += 1;
278            i += 1;
279        } else if bytes[i] == b'\r' {
280            cr_count += 1;
281            i += 1;
282        } else {
283            i += 1;
284        }
285    }
286
287    // Determine predominant style
288    if crlf_count > 0 && lf_count == 0 && cr_count == 0 {
289        LineEnding::CrLf
290    } else if lf_count > 0 && crlf_count == 0 && cr_count == 0 {
291        LineEnding::Lf
292    } else if cr_count > 0 && crlf_count == 0 && lf_count == 0 {
293        LineEnding::Cr
294    } else if crlf_count > 0 || lf_count > 0 || cr_count > 0 {
295        LineEnding::Mixed
296    } else {
297        LineEnding::Lf // Default
298    }
299}
300
/// Apply UTF-8 edit to a string.
///
/// Replaces the byte range `start_byte..old_end_byte` with `replacement`.
///
/// The edit is silently skipped, leaving `text` untouched, when the range is
/// inverted, extends past the end of `text`, or does not start and end on
/// UTF-8 character boundaries.
pub fn apply_edit_utf8(
    text: &mut String,
    start_byte: usize,
    old_end_byte: usize,
    replacement: &str,
) {
    // `is_char_boundary` is false for indices past the end, so this also
    // rejects out-of-range offsets. The ordering check prevents the panic
    // `replace_range` raises on an inverted range.
    let range_is_valid = start_byte <= old_end_byte
        && text.is_char_boundary(start_byte)
        && text.is_char_boundary(old_end_byte);

    if range_is_valid {
        text.replace_range(start_byte..old_end_byte, replacement);
    }
}

/// Count newlines in text.
///
/// Returns the number of LF (`\n`) characters in the string. A CRLF pair
/// counts once; a lone CR is not counted.
pub fn newline_count(text: &str) -> usize {
    // Counting bytes avoids UTF-8 decoding; 0x0A never appears inside a
    // multi-byte sequence, so the result equals the per-char count.
    text.bytes().filter(|&byte| byte == b'\n').count()
}

/// Get the column (in UTF-8 bytes) of the last line.
///
/// Returns the number of bytes after the last `\n` in the string, or the
/// whole length when there is no `\n`. The result saturates at `u32::MAX`
/// instead of wrapping for extremely long lines.
pub fn last_line_column_utf8(text: &str) -> u32 {
    let last_line = text.rfind('\n').map_or(text, |newline_at| &text[newline_at + 1..]);
    u32::try_from(last_line.len()).unwrap_or(u32::MAX)
}

/// Unit tests for position conversion, line ending detection and edits.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_lf_positions() {
        let text = "line 1\nline 2\nline 3";
        let mapper = PositionMapper::new(text);

        // Start of document
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 0 }), Some(0));
        assert_eq!(mapper.byte_to_lsp_pos(0), Position { line: 0, character: 0 });

        // Middle of first line
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 3 }), Some(3));
        assert_eq!(mapper.byte_to_lsp_pos(3), Position { line: 0, character: 3 });

        // Start of second line
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 1, character: 0 }), Some(7));
        assert_eq!(mapper.byte_to_lsp_pos(7), Position { line: 1, character: 0 });
    }

    #[test]
    fn test_crlf_positions() {
        let text = "line 1\r\nline 2\r\nline 3";
        let mapper = PositionMapper::new(text);

        assert_eq!(mapper.line_ending(), LineEnding::CrLf);

        // Start of second line (after \r\n)
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 1, character: 0 }), Some(8));
        assert_eq!(mapper.byte_to_lsp_pos(8), Position { line: 1, character: 0 });

        // Start of third line
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 2, character: 0 }), Some(16));
        assert_eq!(mapper.byte_to_lsp_pos(16), Position { line: 2, character: 0 });
    }

    #[test]
    fn test_utf16_positions() {
        let text = "hello 😀 world"; // Emoji is 2 UTF-16 code units
        let mapper = PositionMapper::new(text);

        // Before emoji
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 6 }), Some(6));

        // After emoji (6 + 2 UTF-16 units = 8)
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 8 }), Some(10)); // 6 + 4 bytes for emoji

        // Convert back
        assert_eq!(mapper.byte_to_lsp_pos(10), Position { line: 0, character: 8 });
    }

    #[test]
    fn test_utf16_positions_clamp_mid_surrogate_to_char_start() {
        let text = "a😀b";
        let mapper = PositionMapper::new(text);

        // UTF-16 position 2 lands inside 😀 (which spans code units 1..3).
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 2 }), Some(1));
    }

    #[test]
    fn test_utf16_surrogate_pair_boundaries() {
        // 💖 (U+1F496) is a non-BMP char requiring a surrogate pair.
        // Byte layout: 'x'=1 byte, '💖'=4 bytes (U+1F496), 'y'=1 byte.
        // UTF-16 layout: 'x'=1 unit, '💖'=2 units (surrogate pair), 'y'=1 unit.
        let text = "x💖y";
        let mapper = PositionMapper::new(text);

        // Before surrogate pair (column 0, 1)
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 0 }), Some(0));
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 1 }), Some(1));

        // Mid-surrogate (column 2) — must clamp to start of emoji (byte 1),
        // matching `utf16_line_col_to_offset` behavior.
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 2 }), Some(1));

        // End of surrogate pair (column 3) — points just past emoji.
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 3 }), Some(5));

        // After 'y' (column 4) — end of string.
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 4 }), Some(6));
    }

    #[test]
    fn test_utf16_max_code_point() {
        // U+10FFFF is the highest valid Unicode code point.
        // Encoded as UTF-8 it's 4 bytes; in UTF-16 it's a surrogate pair (2 units).
        let max_char = '\u{10FFFF}';
        let text = format!("a{max_char}b");
        let mapper = PositionMapper::new(&text);

        // 'a' is col 0, U+10FFFF occupies cols 1..3, 'b' is col 3.
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 0 }), Some(0));
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 1 }), Some(1));
        // Mid-surrogate clamp
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 2 }), Some(1));
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 3 }), Some(5));
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 4 }), Some(6));

        // Round-trip the byte offsets back to positions (non-mid-surrogate).
        assert_eq!(mapper.byte_to_lsp_pos(0), Position { line: 0, character: 0 });
        assert_eq!(mapper.byte_to_lsp_pos(1), Position { line: 0, character: 1 });
        assert_eq!(mapper.byte_to_lsp_pos(5), Position { line: 0, character: 3 });
        assert_eq!(mapper.byte_to_lsp_pos(6), Position { line: 0, character: 4 });
    }

    #[test]
    fn test_utf16_mixed_bmp_and_supplementary_plane() {
        // é (U+00E9, BMP, 2 bytes UTF-8, 1 UTF-16 unit)
        // 💖 (U+1F496, supplementary, 4 bytes UTF-8, 2 UTF-16 units)
        // ñ (U+00F1, BMP, 2 bytes UTF-8, 1 UTF-16 unit)
        // 🎉 (U+1F389, supplementary, 4 bytes UTF-8, 2 UTF-16 units)
        let text = "aé💖ñ🎉b";
        let mapper = PositionMapper::new(text);

        // Columns:
        //   a  = 0
        //   é  = 1
        //   💖 = 2..4 (surrogate pair)
        //   ñ  = 4
        //   🎉 = 5..7 (surrogate pair)
        //   b  = 7
        //   end = 8
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 0 }), Some(0)); // a
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 1 }), Some(1)); // é
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 2 }), Some(3)); // 💖 start
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 3 }), Some(3)); // mid-surrogate clamp
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 4 }), Some(7)); // ñ
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 5 }), Some(9)); // 🎉 start
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 6 }), Some(9)); // mid-surrogate clamp
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 7 }), Some(13)); // b
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 8 }), Some(14)); // end
    }

    #[test]
    fn test_utf16_zero_length_input() {
        let text = "";
        let mapper = PositionMapper::new(text);

        // An empty rope has one logical line (line 0) of length 0.
        // Position (0, 0) should map to byte 0.
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 0 }), Some(0));
        // Any character beyond 0 should clamp to byte 0 (end of empty line).
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 5 }), Some(0));

        // Line past end of document returns None.
        assert!(mapper.lsp_pos_to_byte(Position { line: 1, character: 0 }).is_none());

        // Reverse direction: byte 0 should map to (0, 0).
        assert_eq!(mapper.byte_to_lsp_pos(0), Position { line: 0, character: 0 });
    }

    #[test]
    fn test_utf16_consecutive_surrogate_pairs() {
        // Back-to-back supplementary-plane chars to ensure mid-surrogate
        // clamping doesn't advance past the current char.
        let text = "💖💖";
        let mapper = PositionMapper::new(text);

        // First 💖 is cols 0..2, second 💖 is cols 2..4.
        // Bytes: first = 0..4, second = 4..8.
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 0 }), Some(0));
        // Mid first surrogate pair — clamp to start of first emoji (byte 0).
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 1 }), Some(0));
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 2 }), Some(4));
        // Mid second surrogate pair — clamp to start of second emoji (byte 4).
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 3 }), Some(4));
        assert_eq!(mapper.lsp_pos_to_byte(Position { line: 0, character: 4 }), Some(8));
    }

    #[test]
    fn test_utf16_clamp_matches_convert_helper() {
        // Parity: PositionMapper::lsp_pos_to_byte should agree with the
        // convert::utf16_line_col_to_offset helper at every column, including
        // mid-surrogate positions. These are the two canonical UTF-16 -> byte
        // converters and they must never disagree.
        use crate::convert::utf16_line_col_to_offset;

        let text = "a😀b💖c\nx💡y";
        let mapper = PositionMapper::new(text);

        // Line 0: "a😀b💖c"
        //   a=0, 😀=1..3, b=3, 💖=4..6, c=6, end=7
        for col in 0..=7 {
            let mapper_byte =
                mapper.lsp_pos_to_byte(Position { line: 0, character: col }).unwrap_or(usize::MAX);
            let helper_byte = utf16_line_col_to_offset(text, 0, col);
            assert_eq!(
                mapper_byte, helper_byte,
                "disagreement at line 0 col {col}: mapper={mapper_byte} helper={helper_byte}"
            );
        }
    }

    #[test]
    fn test_mixed_line_endings() {
        let text = "line 1\r\nline 2\nline 3\rline 4";
        let mapper = PositionMapper::new(text);

        assert_eq!(mapper.line_ending(), LineEnding::Mixed);

        // Each line start
        assert_eq!(mapper.byte_to_lsp_pos(0), Position { line: 0, character: 0 });
        assert_eq!(mapper.byte_to_lsp_pos(8), Position { line: 1, character: 0 });
        assert_eq!(mapper.byte_to_lsp_pos(15), Position { line: 2, character: 0 });
        assert_eq!(mapper.byte_to_lsp_pos(22), Position { line: 3, character: 0 });
    }

    #[test]
    fn test_incremental_edit() {
        let mut mapper = PositionMapper::new("hello world");

        // Replace "world" with "Rust"
        mapper.apply_edit(6, 11, "Rust");
        assert_eq!(mapper.text(), "hello Rust");

        // Insert in middle
        mapper.apply_edit(5, 5, " beautiful");
        assert_eq!(mapper.text(), "hello beautiful Rust");

        // Delete "beautiful " (keep one space)
        mapper.apply_edit(5, 16, " ");
        assert_eq!(mapper.text(), "hello Rust");
    }
}