php_lsp/
util.rs

1use tower_lsp::lsp_types::Position;
2
/// Decide whether `query` selects `candidate` as a completion, either as a
/// plain prefix or as a camelCase / snake_case abbreviation.
///
/// An empty `query` matches everything.  Otherwise the checks are, in order:
/// 1. Case-insensitive prefix: the lowercased `candidate` starts with the
///    lowercased `query`.
/// 2. Abbreviation: the lowercased characters of `query` are consumed, in
///    order, by *word-start* characters of `candidate`.  A word start is the
///    first character, any character directly after `_`, or an uppercase
///    letter directly after a lowercase letter.  Word starts may be skipped,
///    but every query character must be consumed by one.
///
/// Examples:
/// - `"GRF"` matches `"getRecentFiles"` (g·R·F)
/// - `"str_r"` matches `"str_replace"` (prefix)
/// - `"sr"` matches `"str_replace"` (s·r)
/// - `"srp"` does **not** match `"str_replace"` (`p` is not at a word start)
pub(crate) fn fuzzy_camel_match(query: &str, candidate: &str) -> bool {
    if query.is_empty() {
        return true;
    }
    let needle = query.to_lowercase();
    // Cheap check first: ordinary case-insensitive prefix.
    if candidate.to_lowercase().starts_with(&needle) {
        return true;
    }
    // Abbreviation check: walk the candidate once, consuming query characters
    // only at word starts.
    let mut pending = needle.chars().peekable();
    let mut prev: Option<char> = None;
    for cur in candidate.chars() {
        let wanted = match pending.peek() {
            Some(&c) => c,
            None => break,
        };
        let at_word_start = match prev {
            None => true,
            Some(p) => p == '_' || (cur.is_uppercase() && p.is_lowercase()),
        };
        // Only the first character of the lowercase mapping is compared.
        if at_word_start && cur.to_lowercase().next() == Some(wanted) {
            pending.next();
        }
        prev = Some(cur);
    }
    pending.peek().is_none()
}
45
/// Build the sort key for a completion item.
///
/// The key is the lowercased `label` prefixed with a rank digit: `'0'` when
/// the lowercased label starts with the lowercased `query`, `'1'` otherwise.
/// Sorting keys lexicographically therefore lists plain prefix matches ahead
/// of every other match.
pub(crate) fn camel_sort_key(query: &str, label: &str) -> String {
    let label_lc = label.to_lowercase();
    let rank = if label_lc.starts_with(&query.to_lowercase()) {
        '0'
    } else {
        '1'
    };
    let mut key = String::with_capacity(label_lc.len() + 1);
    key.push(rank);
    key.push_str(&label_lc);
    key
}
58
/// Report whether `name` appears in the table of known PHP built-ins.
///
/// The comparison is exact (case-sensitive).  Hover uses the result to decide
/// whether to generate a php.net link.
pub(crate) fn is_php_builtin(name: &str) -> bool {
    // Must stay in ascending byte order: the lookup below is a binary search.
    const BUILTINS: &[&str] = &[
        "abs",
        "acos",
        "addslashes",
        "array_chunk",
        "array_combine",
        "array_diff",
        "array_fill",
        "array_fill_keys",
        "array_filter",
        "array_flip",
        "array_intersect",
        "array_key_exists",
        "array_keys",
        "array_map",
        "array_merge",
        "array_pad",
        "array_pop",
        "array_push",
        "array_reduce",
        "array_replace",
        "array_reverse",
        "array_search",
        "array_shift",
        "array_slice",
        "array_splice",
        "array_unique",
        "array_unshift",
        "array_values",
        "array_walk",
        "array_walk_recursive",
        "arsort",
        "asin",
        "asort",
        "atan",
        "atan2",
        "base64_decode",
        "base64_encode",
        "basename",
        "boolval",
        "call_user_func",
        "call_user_func_array",
        "ceil",
        "checkdate",
        "class_exists",
        "closedir",
        "compact",
        "constant",
        "copy",
        "cos",
        "date",
        "date_add",
        "date_create",
        "date_diff",
        "date_format",
        "date_sub",
        "define",
        "defined",
        "die",
        "dirname",
        "empty",
        "exit",
        "exp",
        "explode",
        "extract",
        "fclose",
        "feof",
        "fgets",
        "file_exists",
        "file_get_contents",
        "file_put_contents",
        "floatval",
        "floor",
        "fmod",
        "fopen",
        "fputs",
        "fread",
        "fseek",
        "ftell",
        "function_exists",
        "get_class",
        "get_parent_class",
        "gettype",
        "glob",
        "hash",
        "header",
        "headers_sent",
        "htmlentities",
        "htmlspecialchars",
        "http_build_query",
        "implode",
        "in_array",
        "intdiv",
        "interface_exists",
        "intval",
        "is_a",
        "is_array",
        "is_bool",
        "is_callable",
        "is_dir",
        "is_double",
        "is_file",
        "is_finite",
        "is_float",
        "is_infinite",
        "is_int",
        "is_integer",
        "is_long",
        "is_nan",
        "is_null",
        "is_numeric",
        "is_object",
        "is_readable",
        "is_string",
        "is_subclass_of",
        "is_writable",
        "isset",
        "join",
        "json_decode",
        "json_encode",
        "krsort",
        "ksort",
        "lcfirst",
        "list",
        "log",
        "ltrim",
        "max",
        "md5",
        "method_exists",
        "microtime",
        "min",
        "mkdir",
        "mktime",
        "mt_rand",
        "nl2br",
        "number_format",
        "ob_end_clean",
        "ob_get_clean",
        "ob_start",
        "opendir",
        "parse_str",
        "parse_url",
        "pathinfo",
        "pi",
        "pow",
        "preg_match",
        "preg_match_all",
        "preg_quote",
        "preg_replace",
        "preg_split",
        "print_r",
        "printf",
        "property_exists",
        "rand",
        "random_int",
        "rawurldecode",
        "rawurlencode",
        "readdir",
        "realpath",
        "rename",
        "rewind",
        "rmdir",
        "round",
        "rsort",
        "rtrim",
        "scandir",
        "serialize",
        "session_destroy",
        "session_start",
        "setcookie",
        "settype",
        "sha1",
        "sin",
        "sleep",
        "sort",
        "sprintf",
        "sqrt",
        "str_contains",
        "str_ends_with",
        "str_pad",
        "str_repeat",
        "str_replace",
        "str_split",
        "str_starts_with",
        "str_word_count",
        "strcasecmp",
        "strcmp",
        "strip_tags",
        "stripslashes",
        "stristr",
        "strlen",
        "strncasecmp",
        "strncmp",
        "strpos",
        "strrpos",
        "strstr",
        "strtolower",
        "strtotime",
        "strtoupper",
        "strval",
        "substr",
        "substr_count",
        "substr_replace",
        "tan",
        "time",
        "trim",
        "uasort",
        "ucfirst",
        "ucwords",
        "uksort",
        "unlink",
        "unserialize",
        "unset",
        "urldecode",
        "urlencode",
        "usleep",
        "usort",
        "var_dump",
        "var_export",
        "vsprintf",
    ];
    // Debug builds verify the ordering invariant on every call so that a
    // misplaced entry is caught by the test suite rather than silently missed.
    let in_order = || BUILTINS.windows(2).all(|pair| pair[0] <= pair[1]);
    debug_assert!(in_order(), "BUILTINS must be sorted for binary_search");
    BUILTINS.binary_search_by(|entry| entry.cmp(&name)).is_ok()
}
290
/// Build the php.net documentation URL for a built-in function name.
///
/// Every `_` in `name` is replaced with `-` and the result is appended to
/// `https://www.php.net/function.`; no other escaping is applied.
pub(crate) fn php_doc_url(name: &str) -> String {
    const BASE: &str = "https://www.php.net/function.";
    let mut url = String::with_capacity(BASE.len() + name.len());
    url.push_str(BASE);
    // The URL slug is the function name with dashes in place of underscores.
    url.extend(name.chars().map(|c| if c == '_' { '-' } else { c }));
    url
}
297
/// Translate a UTF-16 code-unit offset within `s` into a UTF-8 byte offset.
///
/// LSP positions count UTF-16 code units while Rust strings are indexed by
/// UTF-8 bytes.  The result is the byte index of the first character that has
/// at least `utf16_offset` UTF-16 units before it, so it is always a character
/// boundary.  An offset that points into the middle of a surrogate pair
/// resolves to the byte just after that character.  Offsets at or beyond the
/// end of the string yield `s.len()`.
pub(crate) fn utf16_offset_to_byte(s: &str, utf16_offset: usize) -> usize {
    // Number of UTF-16 units that precede the character being inspected.
    let mut units_before = 0usize;
    s.char_indices()
        .find(|&(_, ch)| {
            let reached = units_before >= utf16_offset;
            units_before += ch.len_utf16();
            reached
        })
        .map_or(s.len(), |(byte_idx, _)| byte_idx)
}
314
/// Convert a UTF-8 byte offset into a UTF-16 code unit count.
///
/// LSP `Position.character` is measured in UTF-16 code units.  Given a string
/// and a byte offset into it, this returns how many UTF-16 units are occupied
/// by the characters that end at or before that offset.
///
/// The function never panics:
/// - an offset past the end of `s` is treated as `s.len()`;
/// - an offset that falls inside a multi-byte character is rounded down to
///   the start of that character (the partial character is not counted).
pub(crate) fn byte_to_utf16(s: &str, byte_offset: usize) -> u32 {
    s.char_indices()
        // Keep only characters that lie entirely before the offset.  Slicing
        // `s[..byte_offset]` instead would panic on a non-boundary offset.
        .take_while(|&(start, ch)| start + ch.len_utf8() <= byte_offset)
        .map(|(_, ch)| ch.len_utf16() as u32)
        .sum()
}
326
/// Split a parameter list string on top-level commas.
///
/// A comma only separates parameters when it is outside every `()`, `[]`,
/// `{}` pair and outside every single- or double-quoted string literal.  This
/// avoids splitting inside default values such as `array $x = [1, 2, 3]` or
/// `string $sep = ','`.  Inside a string literal a backslash escapes the next
/// character, so `'it\'s'` is read as one literal, and brackets inside a
/// literal do not affect nesting.
///
/// Unbalanced closing brackets are tolerated: nesting depth never drops below
/// zero, so a stray `)` cannot suppress later splits.
///
/// Each returned slice borrows from `s` and is trimmed of leading/trailing
/// whitespace.  Empty segments between two commas are kept; an empty (or
/// whitespace-only) final segment is dropped, so `""` yields an empty vector.
pub(crate) fn split_params(s: &str) -> Vec<&str> {
    let mut parts = Vec::new();
    let mut depth = 0usize;
    // `Some(q)` while scanning inside a string literal opened by quote `q`.
    let mut quote: Option<char> = None;
    // True when the previous character inside a literal was an unescaped `\`.
    let mut escaped = false;
    let mut start = 0;
    for (i, ch) in s.char_indices() {
        if let Some(q) = quote {
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == q {
                quote = None;
            }
            continue;
        }
        match ch {
            '\'' | '"' => quote = Some(ch),
            '(' | '[' | '{' => depth += 1,
            // Saturate so malformed input cannot push the depth negative.
            ')' | ']' | '}' => depth = depth.saturating_sub(1),
            ',' if depth == 0 => {
                parts.push(s[start..i].trim());
                start = i + ch.len_utf8();
            }
            _ => {}
        }
    }
    let last = s[start..].trim();
    if !last.is_empty() {
        parts.push(last);
    }
    parts
}
352
353/// Extract the word (identifier) under the cursor, handling UTF-16 offsets.
354pub(crate) fn word_at(source: &str, position: Position) -> Option<String> {
355    // Use split('\n') rather than lines() so that a trailing newline produces a
356    // final empty entry — lines() silently drops it, causing word_at to return
357    // None for any cursor on the last line of a normally-saved PHP file.
358    let raw = source.split('\n').nth(position.line as usize)?;
359    let line = raw.strip_suffix('\r').unwrap_or(raw);
360    let char_offset = position.character as usize;
361
362    let chars: Vec<char> = line.chars().collect();
363
364    let mut utf16_len = 0usize;
365    let mut char_pos = 0usize;
366    for ch in &chars {
367        if utf16_len >= char_offset {
368            break;
369        }
370        utf16_len += ch.len_utf16();
371        char_pos += 1;
372    }
373
374    let total_utf16: usize = chars.iter().map(|c| c.len_utf16()).sum();
375    if char_offset > total_utf16 {
376        return None;
377    }
378
379    let is_word = |c: char| c.is_alphanumeric() || c == '_' || c == '$' || c == '\\';
380
381    let mut left = char_pos;
382    while left > 0 && is_word(chars[left - 1]) {
383        left -= 1;
384    }
385
386    let mut right = char_pos;
387    while right < chars.len() && is_word(chars[right]) {
388        right += 1;
389    }
390
391    if left == right {
392        return None;
393    }
394
395    let word: String = chars[left..right].iter().collect();
396    if word.is_empty() { None } else { Some(word) }
397}
398
399/// Extract the source text covered by an LSP `Range`.
400///
401/// `Range` positions use UTF-16 code-unit offsets; this function converts them
402/// correctly before slicing the UTF-8 source string.
403pub(crate) fn selected_text_range(source: &str, range: tower_lsp::lsp_types::Range) -> String {
404    let lines: Vec<&str> = source.lines().collect();
405    if range.start.line == range.end.line {
406        let line = match lines.get(range.start.line as usize) {
407            Some(l) => l,
408            None => return String::new(),
409        };
410        let start = utf16_offset_to_byte(line, range.start.character as usize);
411        let end = utf16_offset_to_byte(line, range.end.character as usize);
412        line[start..end].to_string()
413    } else {
414        let mut result = String::new();
415        for i in range.start.line..=range.end.line {
416            let line = match lines.get(i as usize) {
417                Some(l) => *l,
418                None => break,
419            };
420            if i == range.start.line {
421                let start = utf16_offset_to_byte(line, range.start.character as usize);
422                result.push_str(&line[start..]);
423            } else if i == range.end.line {
424                let end = utf16_offset_to_byte(line, range.end.character as usize);
425                result.push_str(&line[..end]);
426            } else {
427                result.push_str(line);
428            }
429            if i < range.end.line {
430                result.push('\n');
431            }
432        }
433        result
434    }
435}
436
// Unit tests for the UTF-8/UTF-16 offset helpers, `word_at`, and the
// ordering of the `is_php_builtin` lookup table.
#[cfg(test)]
mod tests {
    use super::*;

    // ASCII text: byte offsets and UTF-16 offsets coincide.
    #[test]
    fn byte_to_utf16_ascii() {
        assert_eq!(byte_to_utf16("hello", 3), 3);
    }

    // A 2-byte UTF-8 character in the BMP counts as a single UTF-16 unit.
    #[test]
    fn byte_to_utf16_multibyte_bmp() {
        // "é" is U+00E9: 2 bytes in UTF-8, 1 code unit in UTF-16.
        let s = "café";
        assert_eq!(byte_to_utf16(s, 0), 0);
        assert_eq!(byte_to_utf16(s, 3), 3); // up to "caf" (all ASCII)
        assert_eq!(byte_to_utf16(s, 5), 4); // full string (é = 2 bytes → 1 UTF-16 unit)
    }

    // A character outside the BMP counts as two UTF-16 units.
    #[test]
    fn byte_to_utf16_surrogate_pair() {
        // "😀" is U+1F600: 4 bytes in UTF-8, 2 code units in UTF-16 (surrogate pair).
        let s = "a😀b";
        assert_eq!(byte_to_utf16(s, 1), 1); // after "a"
        assert_eq!(byte_to_utf16(s, 5), 3); // after "a😀" (emoji = 4 bytes → 2 UTF-16 units)
        assert_eq!(byte_to_utf16(s, 6), 4); // full string
    }

    // A byte offset beyond the string is clamped to the string's length.
    #[test]
    fn byte_to_utf16_past_end_clamps() {
        assert_eq!(byte_to_utf16("hi", 100), 2);
    }

    // ASCII text: UTF-16 offsets and byte offsets coincide.
    #[test]
    fn utf16_offset_to_byte_ascii() {
        assert_eq!(utf16_offset_to_byte("hello", 3), 3);
    }

    // UTF-16 offsets on either side of a surrogate pair map to the byte
    // offsets on either side of the 4-byte UTF-8 encoding.
    #[test]
    fn utf16_offset_to_byte_surrogate_pair() {
        // "a😀b": UTF-16 offset 1 → byte 1 (start of emoji), offset 3 → byte 5 (after emoji)
        let s = "a😀b";
        assert_eq!(utf16_offset_to_byte(s, 1), 1);
        assert_eq!(utf16_offset_to_byte(s, 3), 5);
    }

    // For every character boundary, converting byte → UTF-16 → byte must
    // return the original byte offset.
    #[test]
    fn byte_to_utf16_and_back_roundtrip() {
        let s = "café 😀 world";
        for (byte_idx, _) in s.char_indices() {
            let utf16 = byte_to_utf16(s, byte_idx) as usize;
            assert_eq!(utf16_offset_to_byte(s, utf16), byte_idx);
        }
    }

    // Regression test: `word_at` must find words in a source that ends with
    // a newline, and must not panic on the empty line after that newline.
    #[test]
    fn word_at_last_line_with_trailing_newline() {
        // Editors save files with a trailing newline; lines() drops the final
        // empty entry, making word_at return None for cursors on the last line.
        let src = "<?php\necho strlen($x);\n";
        let pos = Position {
            line: 1,
            character: 6,
        }; // "strlen" on line 1
        let w = word_at(src, pos);
        assert_eq!(
            w.as_deref(),
            Some("strlen"),
            "word_at must work on lines before the trailing newline"
        );
        // Position on the final empty line produced by the trailing newline.
        let last_line = Position {
            line: 2,
            character: 0,
        };
        // Should return None (empty line), but must not panic.
        let _ = word_at(src, last_line);
    }

    // With CRLF line endings the lookup must still return exactly the
    // identifier under the cursor.
    #[test]
    fn word_at_crlf_line_endings() {
        let src = "<?php\r\nfunction foo() {}\r\n";
        let pos = Position {
            line: 1,
            character: 9,
        }; // "foo"
        let w = word_at(src, pos);
        assert_eq!(
            w.as_deref(),
            Some("foo"),
            "word_at must handle CRLF line endings"
        );
    }

    // Regression test for the ordering of the BUILTINS table: each of these
    // names must be found by the binary search.
    #[test]
    fn is_php_builtin_asin_recognized() {
        // asin was out of order in BUILTINS, causing binary_search to miss it.
        assert!(
            is_php_builtin("asin"),
            "asin must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("atan"),
            "atan must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("krsort"),
            "krsort must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("strcasecmp"),
            "strcasecmp must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("strncasecmp"),
            "strncasecmp must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("strip_tags"),
            "strip_tags must be recognised as a PHP builtin"
        );
    }
}
php_lsp/util.rs

php_lsp/
util.rs