Skip to main content

php_lsp/
util.rs

1use tower_lsp::lsp_types::Position;
2
3/// Convert a UTF-16 LSP `Position` to a byte offset in `source`.
4pub(crate) fn utf16_pos_to_byte(source: &str, position: Position) -> usize {
5    let mut byte_off = 0usize;
6    for (line_idx, line) in source.split('\n').enumerate() {
7        // `split('\n')` keeps any '\r', so line.len() includes it — correct for byte accounting.
8        // Strip '\r' only when iterating chars for UTF-16 column counting.
9        let line_content = line.strip_suffix('\r').unwrap_or(line);
10        if line_idx == position.line as usize {
11            let mut col_utf16 = 0u32;
12            for ch in line_content.chars() {
13                if col_utf16 >= position.character {
14                    break;
15                }
16                col_utf16 += ch.len_utf16() as u32;
17                byte_off += ch.len_utf8();
18            }
19            return byte_off;
20        }
21        byte_off += line.len() + 1; // +1 for '\n'; line.len() includes '\r' when present
22    }
23    byte_off
24}
25
/// Returns `true` if `query` matches `candidate` using camelCase/underscore
/// abbreviation rules.
///
/// The checks are tried in this order; the first one that succeeds wins:
/// 1. An empty `query` matches everything.
/// 2. The lowercased `candidate` starts with the lowercased `query`.
/// 3. Abbreviation: walking `candidate` left to right, each query character
///    (compared case-insensitively) must be consumed, in order, by a
///    *word start*. A word start is the first character, any character that
///    directly follows `_`, or an uppercase letter directly preceded by a
///    lowercase letter. Word starts may be skipped; non-word-start characters
///    never consume a query character.
///
/// Examples:
/// - `"GRF"` matches `"getRecentFiles"` (`g`, `R`, `F` are word starts)
/// - `"str_r"` matches `"str_replace"` (plain prefix)
/// - `"sr"` matches `"str_replace"` (`s` and the `r` after `_`)
/// - `"srp"` does not match `"str_replace"` (`p` is not a word start)
pub(crate) fn fuzzy_camel_match(query: &str, candidate: &str) -> bool {
    if query.is_empty() {
        return true;
    }

    let needle = query.to_lowercase();
    if candidate.to_lowercase().starts_with(&needle) {
        return true;
    }

    // Abbreviation pass: `pending` holds the query characters not yet matched.
    let mut pending = needle.chars().peekable();
    let mut previous: Option<char> = None;
    for current in candidate.chars() {
        let wanted = match pending.peek() {
            Some(&c) => c,
            None => break,
        };
        let starts_word = match previous {
            None | Some('_') => true,
            Some(p) => current.is_uppercase() && p.is_lowercase(),
        };
        if starts_word && current.to_lowercase().next() == Some(wanted) {
            pending.next();
        }
        previous = Some(current);
    }
    pending.peek().is_none()
}
68
/// Build the sort key used to order completion items for `query`.
///
/// The key is the lowercased `label` prefixed with a rank character: `'0'`
/// when the lowercased label starts with the lowercased query, `'1'`
/// otherwise. Keys compare as plain strings, so prefix matches sort ahead of
/// camel/underscore abbreviation matches (a lower string means higher
/// priority).
pub(crate) fn camel_sort_key(query: &str, label: &str) -> String {
    let label_lower = label.to_lowercase();
    let rank = if label_lower.starts_with(&query.to_lowercase()) {
        '0'
    } else {
        '1'
    };

    let mut key = String::with_capacity(label_lower.len() + 1);
    key.push(rank);
    key.push_str(&label_lower);
    key
}
81
/// Return `true` if `name` is a known PHP built-in function.
/// Used by hover to generate php.net links.
///
/// PHP function names are case-insensitive for ASCII letters, so the lookup
/// ignores ASCII case: `"strlen"`, `"STRLEN"` and `"StrLen"` are all
/// recognised. `name` must be the bare function name (no leading `\`).
pub(crate) fn is_php_builtin(name: &str) -> bool {
    // Lowercase ASCII, sorted in byte order for binary search.
    const BUILTINS: &[&str] = &[
        "abs",
        "acos",
        "addslashes",
        "array_chunk",
        "array_combine",
        "array_diff",
        "array_fill",
        "array_fill_keys",
        "array_filter",
        "array_flip",
        "array_intersect",
        "array_key_exists",
        "array_keys",
        "array_map",
        "array_merge",
        "array_pad",
        "array_pop",
        "array_push",
        "array_reduce",
        "array_replace",
        "array_reverse",
        "array_search",
        "array_shift",
        "array_slice",
        "array_splice",
        "array_unique",
        "array_unshift",
        "array_values",
        "array_walk",
        "array_walk_recursive",
        "arsort",
        "asin",
        "asort",
        "atan",
        "atan2",
        "base64_decode",
        "base64_encode",
        "basename",
        "boolval",
        "call_user_func",
        "call_user_func_array",
        "ceil",
        "checkdate",
        "class_exists",
        "closedir",
        "compact",
        "constant",
        "copy",
        "cos",
        "date",
        "date_add",
        "date_create",
        "date_diff",
        "date_format",
        "date_sub",
        "define",
        "defined",
        "die",
        "dirname",
        "empty",
        "exit",
        "exp",
        "explode",
        "extract",
        "fclose",
        "feof",
        "fgets",
        "file_exists",
        "file_get_contents",
        "file_put_contents",
        "floatval",
        "floor",
        "fmod",
        "fopen",
        "fputs",
        "fread",
        "fseek",
        "ftell",
        "function_exists",
        "get_class",
        "get_parent_class",
        "gettype",
        "glob",
        "hash",
        "header",
        "headers_sent",
        "htmlentities",
        "htmlspecialchars",
        "http_build_query",
        "implode",
        "in_array",
        "intdiv",
        "interface_exists",
        "intval",
        "is_a",
        "is_array",
        "is_bool",
        "is_callable",
        "is_dir",
        "is_double",
        "is_file",
        "is_finite",
        "is_float",
        "is_infinite",
        "is_int",
        "is_integer",
        "is_long",
        "is_nan",
        "is_null",
        "is_numeric",
        "is_object",
        "is_readable",
        "is_string",
        "is_subclass_of",
        "is_writable",
        "isset",
        "join",
        "json_decode",
        "json_encode",
        "krsort",
        "ksort",
        "lcfirst",
        "list",
        "log",
        "ltrim",
        "max",
        "md5",
        "method_exists",
        "microtime",
        "min",
        "mkdir",
        "mktime",
        "mt_rand",
        "nl2br",
        "number_format",
        "ob_end_clean",
        "ob_get_clean",
        "ob_start",
        "opendir",
        "parse_str",
        "parse_url",
        "pathinfo",
        "pi",
        "pow",
        "preg_match",
        "preg_match_all",
        "preg_quote",
        "preg_replace",
        "preg_split",
        "print_r",
        "printf",
        "property_exists",
        "rand",
        "random_int",
        "rawurldecode",
        "rawurlencode",
        "readdir",
        "realpath",
        "rename",
        "rewind",
        "rmdir",
        "round",
        "rsort",
        "rtrim",
        "scandir",
        "serialize",
        "session_destroy",
        "session_start",
        "setcookie",
        "settype",
        "sha1",
        "sin",
        "sleep",
        "sort",
        "sprintf",
        "sqrt",
        "str_contains",
        "str_ends_with",
        "str_pad",
        "str_repeat",
        "str_replace",
        "str_split",
        "str_starts_with",
        "str_word_count",
        "strcasecmp",
        "strcmp",
        "strip_tags",
        "stripslashes",
        "stristr",
        "strlen",
        "strncasecmp",
        "strncmp",
        "strpos",
        "strrpos",
        "strstr",
        "strtolower",
        "strtotime",
        "strtoupper",
        "strval",
        "substr",
        "substr_count",
        "substr_replace",
        "tan",
        "time",
        "trim",
        "uasort",
        "ucfirst",
        "ucwords",
        "uksort",
        "unlink",
        "unserialize",
        "unset",
        "urldecode",
        "urlencode",
        "usleep",
        "usort",
        "var_dump",
        "var_export",
        "vsprintf",
    ];
    debug_assert!(
        BUILTINS.windows(2).all(|w| w[0] <= w[1]),
        "BUILTINS must be sorted for binary_search"
    );
    // Compare each (lowercase) table entry against the ASCII-lowercased bytes
    // of `name`. This keeps the byte ordering the table is sorted by and
    // avoids allocating a lowercased copy of `name`.
    BUILTINS
        .binary_search_by(|entry| {
            entry
                .bytes()
                .cmp(name.bytes().map(|b| b.to_ascii_lowercase()))
        })
        .is_ok()
}
313
/// Build the php.net documentation URL for a built-in function name.
///
/// php.net manual slugs are the lowercase function name with every `_`
/// replaced by `-` (e.g. `str_replace` → `function.str-replace`). PHP function
/// names are case-insensitive, so `name` is ASCII-lowercased here to always
/// produce the canonical URL regardless of how the call site spelled it.
pub(crate) fn php_doc_url(name: &str) -> String {
    let slug: String = name
        .chars()
        .map(|c| if c == '_' { '-' } else { c.to_ascii_lowercase() })
        .collect();
    format!("https://www.php.net/function.{}", slug)
}
320
/// Convert a UTF-16 code unit offset into a UTF-8 byte offset for `s`.
///
/// LSP columns are counted in UTF-16 code units while Rust strings are UTF-8.
/// The string is scanned character by character, keeping a running total of
/// the UTF-16 units seen so far; the result is the byte index of the first
/// character at which that total has reached `utf16_offset`. An offset that
/// falls inside a surrogate pair therefore resolves to the byte just after
/// that character, and an offset past the end of the string yields `s.len()`.
pub(crate) fn utf16_offset_to_byte(s: &str, utf16_offset: usize) -> usize {
    let mut units_before = 0usize;
    s.char_indices()
        .find(|&(_, ch)| {
            // Test against the units *preceding* this char, then account for it.
            let reached = units_before >= utf16_offset;
            units_before += ch.len_utf16();
            reached
        })
        .map_or(s.len(), |(byte_idx, _)| byte_idx)
}
337
/// Convert a UTF-8 byte offset into a UTF-16 code unit count.
///
/// LSP `Position.character` is measured in UTF-16 code units.  Given a string
/// and a byte offset into it, this returns how many UTF-16 units precede that
/// offset — which is the correct LSP character value.
///
/// The offset is made safe before slicing, so this never panics:
/// - an offset past the end of `s` is clamped to `s.len()`;
/// - an offset that falls inside a multi-byte character is rounded down to the
///   start of that character (the partially covered character is not counted).
pub(crate) fn byte_to_utf16(s: &str, byte_offset: usize) -> u32 {
    let mut end = byte_offset.min(s.len());
    // Index 0 is always a char boundary, so this loop terminates.
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    s[..end].chars().map(|c| c.len_utf16() as u32).sum()
}
349
/// Split a parameter list string on commas, respecting bracket nesting and
/// quoted strings.
///
/// A comma only separates parameters when it is outside every `()`, `[]`,
/// `{}` pair and outside any single- or double-quoted string. This avoids
/// splitting inside default values like `array $x = [1, 2, 3]` or
/// `string $sep = ','`. Inside a quoted string a backslash escapes the next
/// character, so `'it\'s'` stays one string, and brackets inside a string do
/// not affect nesting depth.
///
/// Each returned slice is trimmed of leading/trailing whitespace. A trailing
/// segment that is empty after trimming (empty input, or a trailing comma) is
/// omitted.
pub(crate) fn split_params(s: &str) -> Vec<&str> {
    let mut parts = Vec::new();
    let mut depth = 0i32;
    let mut start = 0;
    // `Some(q)` while inside a string literal opened by quote character `q`.
    let mut quote: Option<char> = None;
    // True when the previous character inside a string was an unescaped `\`.
    let mut escaped = false;

    for (i, ch) in s.char_indices() {
        if let Some(open) = quote {
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == open {
                quote = None;
            }
            // Everything inside a string literal is opaque to the splitter.
            continue;
        }
        match ch {
            '\'' | '"' => quote = Some(ch),
            '(' | '[' | '{' => depth += 1,
            ')' | ']' | '}' => depth -= 1,
            ',' if depth == 0 => {
                parts.push(s[start..i].trim());
                // ',' is a single byte, so the next segment starts at i + 1.
                start = i + 1;
            }
            _ => {}
        }
    }

    let last = s[start..].trim();
    if !last.is_empty() {
        parts.push(last);
    }
    parts
}
375
376/// Extract the word (identifier) under the cursor, handling UTF-16 offsets.
377pub(crate) fn word_at(source: &str, position: Position) -> Option<String> {
378    // Use split('\n') rather than lines() so that a trailing newline produces a
379    // final empty entry — lines() silently drops it, causing word_at to return
380    // None for any cursor on the last line of a normally-saved PHP file.
381    let raw = source.split('\n').nth(position.line as usize)?;
382    let line = raw.strip_suffix('\r').unwrap_or(raw);
383    let char_offset = position.character as usize;
384
385    let chars: Vec<char> = line.chars().collect();
386
387    let mut utf16_len = 0usize;
388    let mut char_pos = 0usize;
389    for ch in &chars {
390        if utf16_len >= char_offset {
391            break;
392        }
393        utf16_len += ch.len_utf16();
394        char_pos += 1;
395    }
396
397    let total_utf16: usize = chars.iter().map(|c| c.len_utf16()).sum();
398    if char_offset > total_utf16 {
399        return None;
400    }
401
402    let is_word = |c: char| c.is_alphanumeric() || c == '_' || c == '$' || c == '\\';
403
404    let mut left = char_pos;
405    while left > 0 && is_word(chars[left - 1]) {
406        left -= 1;
407    }
408
409    let mut right = char_pos;
410    while right < chars.len() && is_word(chars[right]) {
411        right += 1;
412    }
413
414    if left == right {
415        return None;
416    }
417
418    let word: String = chars[left..right].iter().collect();
419    if word.is_empty() { None } else { Some(word) }
420}
421
422/// Extract the source text covered by an LSP `Range`.
423///
424/// `Range` positions use UTF-16 code-unit offsets; this function converts them
425/// correctly before slicing the UTF-8 source string.
426pub(crate) fn selected_text_range(source: &str, range: tower_lsp::lsp_types::Range) -> String {
427    let lines: Vec<&str> = source.lines().collect();
428    if range.start.line == range.end.line {
429        let line = match lines.get(range.start.line as usize) {
430            Some(l) => l,
431            None => return String::new(),
432        };
433        let start = utf16_offset_to_byte(line, range.start.character as usize);
434        let end = utf16_offset_to_byte(line, range.end.character as usize);
435        line[start..end].to_string()
436    } else {
437        let mut result = String::new();
438        for i in range.start.line..=range.end.line {
439            let line = match lines.get(i as usize) {
440                Some(l) => *l,
441                None => break,
442            };
443            if i == range.start.line {
444                let start = utf16_offset_to_byte(line, range.start.character as usize);
445                result.push_str(&line[start..]);
446            } else if i == range.end.line {
447                let end = utf16_offset_to_byte(line, range.end.character as usize);
448                result.push_str(&line[..end]);
449            } else {
450                result.push_str(line);
451            }
452            if i < range.end.line {
453                result.push('\n');
454            }
455        }
456        result
457    }
458}
459
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building an LSP position.
    fn at(line: u32, character: u32) -> Position {
        Position { line, character }
    }

    #[test]
    fn byte_to_utf16_ascii() {
        let units = byte_to_utf16("hello", 3);
        assert_eq!(units, 3);
    }

    #[test]
    fn byte_to_utf16_multibyte_bmp() {
        // "é" (U+00E9) takes 2 bytes in UTF-8 but only 1 code unit in UTF-16.
        // Cases: start of string, after the ASCII "caf", and the full string.
        let cases: [(usize, u32); 3] = [(0, 0), (3, 3), (5, 4)];
        for &(byte_offset, expected) in cases.iter() {
            assert_eq!(byte_to_utf16("café", byte_offset), expected);
        }
    }

    #[test]
    fn byte_to_utf16_surrogate_pair() {
        // "😀" (U+1F600) takes 4 bytes in UTF-8 and 2 code units in UTF-16.
        // Cases: after "a", after "a😀", and the full string.
        let cases: [(usize, u32); 3] = [(1, 1), (5, 3), (6, 4)];
        for &(byte_offset, expected) in cases.iter() {
            assert_eq!(byte_to_utf16("a😀b", byte_offset), expected);
        }
    }

    #[test]
    fn byte_to_utf16_past_end_clamps() {
        let units = byte_to_utf16("hi", 100);
        assert_eq!(units, 2);
    }

    #[test]
    fn utf16_offset_to_byte_ascii() {
        let byte = utf16_offset_to_byte("hello", 3);
        assert_eq!(byte, 3);
    }

    #[test]
    fn utf16_offset_to_byte_surrogate_pair() {
        // In "a😀b", UTF-16 offset 1 is the start of the emoji (byte 1) and
        // offset 3 is just after it (byte 5).
        let cases: [(usize, usize); 2] = [(1, 1), (3, 5)];
        for &(utf16_offset, expected_byte) in cases.iter() {
            assert_eq!(utf16_offset_to_byte("a😀b", utf16_offset), expected_byte);
        }
    }

    #[test]
    fn byte_to_utf16_and_back_roundtrip() {
        let text = "café 😀 world";
        // Every char boundary must survive bytes -> UTF-16 -> bytes unchanged.
        for boundary in text.char_indices().map(|(idx, _)| idx) {
            let units = byte_to_utf16(text, boundary) as usize;
            assert_eq!(utf16_offset_to_byte(text, units), boundary);
        }
    }

    #[test]
    fn word_at_last_line_with_trailing_newline() {
        // Regression test: editors save files with a trailing newline, and an
        // implementation based on `lines()` would drop the final empty entry.
        let src = "<?php\necho strlen($x);\n";

        // Column 6 of line 1 is the start of "strlen".
        let found = word_at(src, at(1, 6));
        assert_eq!(
            found.as_deref(),
            Some("strlen"),
            "word_at must work on lines before the trailing newline"
        );

        // The empty line produced by the trailing newline: the result is not
        // asserted, the call only has to complete without panicking.
        let _ = word_at(src, at(2, 0));
    }

    #[test]
    fn word_at_crlf_line_endings() {
        let src = "<?php\r\nfunction foo() {}\r\n";
        // Column 9 of line 1 is the start of "foo".
        let found = word_at(src, at(1, 9));
        assert_eq!(
            found.as_deref(),
            Some("foo"),
            "word_at must handle CRLF line endings"
        );
    }

    #[test]
    fn is_php_builtin_asin_recognized() {
        // Regression test: `asin` was once out of order in BUILTINS, which made
        // binary_search miss it. The other names guard nearby orderings.
        let names = [
            "asin",
            "atan",
            "krsort",
            "strcasecmp",
            "strncasecmp",
            "strip_tags",
        ];
        for name in names.iter() {
            assert!(
                is_php_builtin(name),
                "{} must be recognised as a PHP builtin",
                name
            );
        }
    }
}