mcpls_core/bridge/
encoding.rs

//! Position encoding conversion utilities.
//!
//! Handles conversion between MCP (1-based) and LSP (0-based) positions,
//! as well as UTF-8/UTF-16/UTF-32 encoding conversions.
5
6use lsp_types::Position;
7
/// Supported position encodings per LSP 3.17.
///
/// The encoding determines which unit a position's `character` field counts.
/// The [`Default`] value is [`PositionEncoding::Utf8`].
// NOTE(review): the derived default is UTF-8 while the variant docs call UTF-16
// the LSP default — confirm this is intentional for callers relying on `Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum PositionEncoding {
    /// UTF-8 code units.
    #[default]
    Utf8,
    /// UTF-16 code units (LSP default).
    Utf16,
    /// UTF-32 code units (Unicode code points).
    Utf32,
}
19
20impl PositionEncoding {
21    /// Parse from LSP position encoding kind string.
22    #[must_use]
23    pub fn from_lsp(kind: &str) -> Option<Self> {
24        match kind {
25            "utf-8" => Some(Self::Utf8),
26            "utf-16" => Some(Self::Utf16),
27            "utf-32" => Some(Self::Utf32),
28            _ => None,
29        }
30    }
31
32    /// Convert to LSP position encoding kind string.
33    #[must_use]
34    pub const fn to_lsp(&self) -> &'static str {
35        match self {
36            Self::Utf8 => "utf-8",
37            Self::Utf16 => "utf-16",
38            Self::Utf32 => "utf-32",
39        }
40    }
41}
42
43/// Convert MCP position (1-based) to LSP position (0-based).
44///
45/// MCP tools use 1-based line and column numbers for human readability.
46/// LSP uses 0-based positions internally.
47#[must_use]
48pub const fn mcp_to_lsp_position(line: u32, character: u32) -> Position {
49    Position {
50        line: line.saturating_sub(1),
51        character: character.saturating_sub(1),
52    }
53}
54
55/// Convert LSP position (0-based) to MCP position (1-based).
56#[must_use]
57pub const fn lsp_to_mcp_position(pos: Position) -> (u32, u32) {
58    (pos.line + 1, pos.character + 1)
59}
60
61/// Position encoding converter for handling UTF-8/UTF-16/UTF-32 conversions.
62///
63/// Different LSP servers may use different character encodings. This converter
64/// handles the conversion between byte offsets and character offsets based on
65/// the negotiated encoding.
66#[derive(Debug, Clone)]
67pub struct EncodingConverter {
68    encoding: PositionEncoding,
69}
70
71#[allow(dead_code)] // Will be used when LSP client integration is complete
72impl EncodingConverter {
73    /// Create a new encoding converter with the specified encoding.
74    #[must_use]
75    pub const fn new(encoding: PositionEncoding) -> Self {
76        Self { encoding }
77    }
78
79    /// Convert byte offset to character offset in the configured encoding.
80    ///
81    /// # Errors
82    ///
83    /// Returns an error if:
84    /// - The byte offset is not on a character boundary
85    /// - The encoding is unsupported
86    #[allow(clippy::cast_possible_truncation)] // LSP positions use u32, truncation acceptable
87    pub fn byte_offset_to_character(&self, text: &str, byte_offset: usize) -> Result<u32, String> {
88        if byte_offset > text.len() {
89            let text_len = text.len();
90            return Err(format!(
91                "Byte offset {byte_offset} exceeds text length {text_len}"
92            ));
93        }
94
95        match self.encoding {
96            PositionEncoding::Utf8 => Ok(byte_offset as u32),
97            PositionEncoding::Utf16 => {
98                let utf16_units = text[..byte_offset].encode_utf16().count();
99                Ok(utf16_units as u32)
100            }
101            PositionEncoding::Utf32 => {
102                let code_points = text[..byte_offset].chars().count();
103                Ok(code_points as u32)
104            }
105        }
106    }
107
108    /// Convert character offset to byte offset in the configured encoding.
109    ///
110    /// # Errors
111    ///
112    /// Returns an error if:
113    /// - The character offset is out of bounds
114    /// - The encoding is unsupported
115    #[allow(clippy::cast_possible_truncation)] // LSP positions use u32, truncation acceptable
116    pub fn character_to_byte_offset(
117        &self,
118        text: &str,
119        character_offset: u32,
120    ) -> Result<usize, String> {
121        match self.encoding {
122            PositionEncoding::Utf8 => {
123                let byte_offset = character_offset as usize;
124                if byte_offset > text.len() {
125                    let text_len = text.len();
126                    return Err(format!(
127                        "Character offset {character_offset} exceeds text length {text_len}"
128                    ));
129                }
130                Ok(byte_offset)
131            }
132            PositionEncoding::Utf16 => {
133                let mut utf16_count = 0u32;
134                for (byte_idx, ch) in text.char_indices() {
135                    if utf16_count >= character_offset {
136                        return Ok(byte_idx);
137                    }
138                    utf16_count += ch.len_utf16() as u32;
139                }
140                if utf16_count == character_offset {
141                    Ok(text.len())
142                } else {
143                    Err(format!(
144                        "Character offset {character_offset} out of bounds (max UTF-16 units: {utf16_count})"
145                    ))
146                }
147            }
148            PositionEncoding::Utf32 => text
149                .char_indices()
150                .nth(character_offset as usize)
151                .map(|(byte_idx, _)| byte_idx)
152                .or_else(|| {
153                    if character_offset == text.chars().count() as u32 {
154                        Some(text.len())
155                    } else {
156                        None
157                    }
158                })
159                .ok_or_else(|| {
160                    let max_code_points = text.chars().count();
161                    format!(
162                        "Character offset {character_offset} out of bounds (max code points: {max_code_points})"
163                    )
164                }),
165        }
166    }
167}
168
/// Unit tests for the MCP/LSP position conversions and the encoding converter.
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    #[test]
    fn test_mcp_to_lsp_position() {
        let lsp_pos = mcp_to_lsp_position(1, 1);
        assert_eq!(lsp_pos.line, 0);
        assert_eq!(lsp_pos.character, 0);

        let lsp_pos = mcp_to_lsp_position(10, 5);
        assert_eq!(lsp_pos.line, 9);
        assert_eq!(lsp_pos.character, 4);
    }

    #[test]
    fn test_lsp_to_mcp_position() {
        let (line, char) = lsp_to_mcp_position(Position {
            line: 0,
            character: 0,
        });
        assert_eq!(line, 1);
        assert_eq!(char, 1);

        let (line, char) = lsp_to_mcp_position(Position {
            line: 9,
            character: 4,
        });
        assert_eq!(line, 10);
        assert_eq!(char, 5);
    }

    #[test]
    fn test_roundtrip() {
        for line in 1..100 {
            for char in 1..100 {
                let lsp_pos = mcp_to_lsp_position(line, char);
                let (mcp_line, mcp_char) = lsp_to_mcp_position(lsp_pos);
                assert_eq!(line, mcp_line);
                assert_eq!(char, mcp_char);
            }
        }
    }

    #[test]
    fn test_saturating_sub_zero() {
        // Edge case: MCP position 0 should not underflow
        let lsp_pos = mcp_to_lsp_position(0, 0);
        assert_eq!(lsp_pos.line, 0);
        assert_eq!(lsp_pos.character, 0);
    }

    #[test]
    fn test_position_encoding_parsing() {
        assert_eq!(
            PositionEncoding::from_lsp("utf-8"),
            Some(PositionEncoding::Utf8)
        );
        assert_eq!(
            PositionEncoding::from_lsp("utf-16"),
            Some(PositionEncoding::Utf16)
        );
        assert_eq!(
            PositionEncoding::from_lsp("utf-32"),
            Some(PositionEncoding::Utf32)
        );
        assert_eq!(PositionEncoding::from_lsp("invalid"), None);
    }

    #[test]
    fn test_position_encoding_to_lsp_roundtrip() {
        // Every variant must survive a to_lsp -> from_lsp round trip.
        for encoding in [
            PositionEncoding::Utf8,
            PositionEncoding::Utf16,
            PositionEncoding::Utf32,
        ] {
            assert_eq!(PositionEncoding::from_lsp(encoding.to_lsp()), Some(encoding));
        }
    }

    #[test]
    fn test_utf8_encoding() {
        let converter = EncodingConverter::new(PositionEncoding::Utf8);
        let text = "Hello, world!";

        let char_offset = converter.byte_offset_to_character(text, 7).unwrap();
        assert_eq!(char_offset, 7);

        let byte_offset = converter.character_to_byte_offset(text, 7).unwrap();
        assert_eq!(byte_offset, 7);
    }

    #[test]
    fn test_utf16_encoding_with_emoji() {
        let converter = EncodingConverter::new(PositionEncoding::Utf16);
        let text = "Hello 😀 world";

        let char_offset = converter.byte_offset_to_character(text, 6).unwrap();
        assert_eq!(char_offset, 6);

        // The emoji occupies 4 UTF-8 bytes but 2 UTF-16 code units.
        let char_offset = converter.byte_offset_to_character(text, 10).unwrap();
        assert_eq!(char_offset, 8);

        let byte_offset = converter.character_to_byte_offset(text, 6).unwrap();
        assert_eq!(byte_offset, 6);

        let byte_offset = converter.character_to_byte_offset(text, 8).unwrap();
        assert_eq!(byte_offset, 10);
    }

    #[test]
    fn test_utf16_encoding_roundtrip() {
        let converter = EncodingConverter::new(PositionEncoding::Utf16);
        let text = "Hello 🌍 world!";

        for byte_idx in [0, 6, 10, 11] {
            let char_offset = converter.byte_offset_to_character(text, byte_idx).unwrap();
            let back_to_byte = converter
                .character_to_byte_offset(text, char_offset)
                .unwrap();
            assert_eq!(byte_idx, back_to_byte);
        }
    }

    #[test]
    fn test_utf32_encoding() {
        let converter = EncodingConverter::new(PositionEncoding::Utf32);
        let text = "Hello 😀 world";

        let char_offset = converter.byte_offset_to_character(text, 6).unwrap();
        assert_eq!(char_offset, 6);

        // The emoji occupies 4 UTF-8 bytes but is a single code point.
        let char_offset = converter.byte_offset_to_character(text, 10).unwrap();
        assert_eq!(char_offset, 7);

        let byte_offset = converter.character_to_byte_offset(text, 7).unwrap();
        assert_eq!(byte_offset, 10);
    }

    #[test]
    fn test_utf32_end_of_text_and_out_of_bounds() {
        let converter = EncodingConverter::new(PositionEncoding::Utf32);
        let text = "Hello 😀 world";

        // 13 code points, 16 bytes: the end of the text is a valid position.
        let byte_offset = converter.character_to_byte_offset(text, 13).unwrap();
        assert_eq!(byte_offset, 16);

        assert!(converter.character_to_byte_offset(text, 14).is_err());
    }

    #[test]
    fn test_encoding_edge_cases() {
        let converter = EncodingConverter::new(PositionEncoding::Utf8);

        assert!(converter.byte_offset_to_character("test", 100).is_err());
        assert!(converter.character_to_byte_offset("test", 100).is_err());

        let end_offset = converter.byte_offset_to_character("test", 4).unwrap();
        assert_eq!(end_offset, 4);
    }
}