Skip to main content

perl_symbol_cursor/
lib.rs

1//! Cursor-oriented symbol extraction for Perl source text.
2//!
3//! This microcrate focuses on a single responsibility: extracting symbol names
4//! and ranges around a cursor position.
5
/// Kind of Perl symbol recognised at a cursor position.
///
/// Each variant is named after the sigil that selects it during cursor
/// extraction.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CursorSymbolKind {
    /// A name introduced by the `$` sigil.
    Scalar,
    /// A name introduced by the `@` sigil.
    Array,
    /// A name introduced by the `%` sigil.
    Hash,
    /// A name introduced by the `&` sigil; also the kind reported by
    /// `extract_symbol_from_source` for a name that carries no sigil.
    Subroutine,
}
14
15/// Extract a symbol and its kind from `source` at `position`.
16pub fn extract_symbol_from_source(
17    position: usize,
18    source: &str,
19) -> Option<(String, CursorSymbolKind)> {
20    let chars: Vec<char> = source.chars().collect();
21    if position >= chars.len() {
22        return None;
23    }
24
25    let (sigil, name_start) = if position > 0 {
26        match chars.get(position - 1) {
27            Some('$') => (Some(CursorSymbolKind::Scalar), position),
28            Some('@') => (Some(CursorSymbolKind::Array), position),
29            Some('%') => (Some(CursorSymbolKind::Hash), position),
30            Some('&') => (Some(CursorSymbolKind::Subroutine), position),
31            _ => (None, position),
32        }
33    } else {
34        (None, position)
35    };
36
37    let (sigil, name_start) = if sigil.is_none() && position < chars.len() {
38        match chars[position] {
39            '$' => (Some(CursorSymbolKind::Scalar), position + 1),
40            '@' => (Some(CursorSymbolKind::Array), position + 1),
41            '%' => (Some(CursorSymbolKind::Hash), position + 1),
42            '&' => (Some(CursorSymbolKind::Subroutine), position + 1),
43            _ => (sigil, name_start),
44        }
45    } else {
46        (sigil, name_start)
47    };
48
49    let mut end = name_start;
50    while end < chars.len() && (chars[end].is_alphanumeric() || chars[end] == '_') {
51        end += 1;
52    }
53
54    if end > name_start {
55        let name: String = chars[name_start..end].iter().collect();
56        let kind = sigil.unwrap_or(CursorSymbolKind::Subroutine);
57        Some((name, kind))
58    } else {
59        None
60    }
61}
62
/// Get the range of the symbol at `position`, including a leading sigil.
///
/// `position` and the returned `(start, end)` pair are character indices into
/// `source` (not byte offsets); `end` is exclusive. When the cursor is on an
/// identifier character (alphanumeric or `_`), the range covers the whole
/// identifier, wherever inside it the cursor sits. A sigil (`$`, `@`, `%`,
/// `&`) directly in front of the range start is included.
///
/// When the cursor is not on an identifier character the identifier part is
/// empty: the result is `(position, position)`, or `(position - 1, position)`
/// if a sigil precedes the cursor.
///
/// Returns `None` when `position` is past the last character.
pub fn get_symbol_range_at_position(position: usize, source: &str) -> Option<(usize, usize)> {
    let chars: Vec<char> = source.chars().collect();
    let under_cursor = *chars.get(position)?;

    let is_word = |c: char| c.is_alphanumeric() || c == '_';

    // Walk back to the first character of the identifier under the cursor.
    // `start > 0` is checked before every decrement, so this cannot underflow.
    let mut start = position;
    if is_word(under_cursor) {
        while start > 0 && is_word(chars[start - 1]) {
            start -= 1;
        }
    }

    // Pull a directly preceding sigil into the range.
    if start > 0 && matches!(chars[start - 1], '$' | '@' | '%' | '&') {
        start -= 1;
    }

    let end = position
        + chars[position..]
            .iter()
            .take_while(|&&c| is_word(c))
            .count();

    Some((start, end))
}
89
/// Report whether `byte` may appear in a module or symbol name.
///
/// The accepted set is ASCII letters, ASCII digits, `_`, and `:` (so that
/// `::` package separators stay part of the token).
#[inline]
pub fn is_modchar(byte: u8) -> bool {
    matches!(byte, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b':')
}
95
/// Convert a UTF-16 column index to a byte offset within a single line.
///
/// Walks `line_text` character by character, counting UTF-16 code units, and
/// returns the byte offset of the character that contains code unit
/// `col_utf16`. A column that points at the second half of a surrogate pair
/// therefore resolves to the start of that character instead of skipping to
/// the end of the line. Columns at or beyond the end of the line yield
/// `line_text.len()`.
///
/// The returned offset is always a `char` boundary of `line_text`.
#[inline]
pub fn byte_offset_utf16(line_text: &str, col_utf16: usize) -> usize {
    let mut units = 0usize;
    for (offset, ch) in line_text.char_indices() {
        // After this addition `units` is the column just past `ch`, so `ch`
        // occupies every column in `units - ch.len_utf16() .. units`.
        units += ch.len_utf16();
        if units > col_utf16 {
            return offset;
        }
    }
    line_text.len()
}
108
109/// Extract the module/symbol token under the cursor (UTF-16 aware).
110pub fn token_under_cursor(text: &str, line: usize, col_utf16: usize) -> Option<String> {
111    let line_text = text.lines().nth(line)?;
112    let byte_pos = byte_offset_utf16(line_text, col_utf16);
113    let bytes = line_text.as_bytes();
114
115    if byte_pos >= bytes.len() {
116        return None;
117    }
118
119    let mut start = byte_pos;
120    let mut end = byte_pos;
121
122    while start > 0 && is_modchar(bytes[start - 1]) {
123        start -= 1;
124    }
125    if start > 0 && matches!(bytes[start - 1], b'$' | b'@' | b'%' | b'&' | b'*') {
126        start -= 1;
127    }
128
129    while end < bytes.len() && is_modchar(bytes[end]) {
130        end += 1;
131    }
132
133    Some(line_text[start..end].to_string())
134}
135
136/// Check if a match at `pos..pos+word_len` is bounded by non-word chars.
137pub fn is_word_boundary(text: &[u8], pos: usize, word_len: usize) -> bool {
138    if pos > 0 && is_modchar(text[pos - 1]) {
139        return false;
140    }
141
142    let end_pos = pos + word_len;
143    if end_pos < text.len() && is_modchar(text[end_pos]) {
144        return false;
145    }
146
147    true
148}
149
#[cfg(test)]
mod tests {
    use super::{byte_offset_utf16, is_word_boundary, token_under_cursor};

    /// A cursor inside a `::`-qualified name selects the full module path.
    #[test]
    fn token_under_cursor_extracts_perl_module_token() {
        let found = token_under_cursor("use Demo::Worker;\n", 0, 8);
        assert_eq!(found.as_deref(), Some("Demo::Worker"));
    }

    /// The sigil in front of a variable name is part of the token.
    #[test]
    fn token_under_cursor_supports_sigils() {
        let found = token_under_cursor("my $value = 1;\n", 0, 5);
        assert_eq!(found.as_deref(), Some("$value"));
    }

    /// `😀` takes two UTF-16 code units and four UTF-8 bytes.
    #[test]
    fn utf16_col_to_byte_offset_handles_surrogate_pairs() {
        let cases = [(0, 0), (1, 1), (3, 5), (4, 6)];
        for (col_utf16, expected_byte) in cases {
            assert_eq!(byte_offset_utf16("A😀B", col_utf16), expected_byte);
        }
    }

    /// A match glued to neighbouring name characters is not a whole word.
    #[test]
    fn word_boundary_detects_embedded_word() {
        let word_len = "Demo::Worker".len();

        let embedded = b"fooDemo::Workerbar";
        assert!(!is_word_boundary(embedded, 3, word_len));

        let standalone = b" Demo::Worker ";
        assert!(is_word_boundary(standalone, 1, word_len));
    }
}