php_lsp/
util.rs

1use tower_lsp::lsp_types::{Position, Range};
2
/// Reports whether `query` selects `candidate`, either as a plain prefix or
/// as a camelCase/underscore abbreviation.
///
/// The checks run in this order and the first success wins:
/// 1. An empty `query` matches every candidate.
/// 2. The lowercased `candidate` starts with the lowercased `query`.
/// 3. Abbreviation: the candidate is scanned left to right and the query is
///    consumed one character at a time, but only at word starts. A word start
///    is the first character, a character directly after `_`, or an uppercase
///    letter directly after a lowercase letter. Word starts that do not match
///    the pending query character are skipped; the match succeeds once the
///    whole query has been consumed.
///
/// Examples:
/// - `"GRF"` matches `"getRecentFiles"` (abbreviation)
/// - `"sr"` matches `"str_replace"` (abbreviation)
/// - `"str_r"` matches `"str_replace"` (prefix)
/// - `"srp"` does not match `"str_replace"`: `p` never starts a word
pub(crate) fn fuzzy_camel_match(query: &str, candidate: &str) -> bool {
    if query.is_empty() {
        return true;
    }

    let folded_query = query.to_lowercase();
    if candidate.to_lowercase().starts_with(&folded_query) {
        // Cheapest case: the query is a case-insensitive prefix.
        return true;
    }

    // Abbreviation scan. `pending` holds the query characters still to be
    // matched; `previous` is the candidate character seen just before `current`.
    let mut pending = folded_query.chars().peekable();
    let mut previous: Option<char> = None;
    for current in candidate.chars() {
        let wanted = match pending.peek() {
            Some(&c) => c,
            None => break,
        };
        let starts_word = match previous {
            None => true,
            Some(prev) => prev == '_' || (current.is_uppercase() && prev.is_lowercase()),
        };
        // Only the first character of the lowercase expansion is compared.
        if starts_word && current.to_lowercase().next() == Some(wanted) {
            pending.next();
        }
        previous = Some(current);
    }
    pending.peek().is_none()
}
45
/// Build the sort key used to order completion items for `query`.
///
/// The key is the lowercased `label` prefixed with a one-character rank:
/// `'0'` when the label starts with the query (case-insensitively) and `'1'`
/// for every other label. Keys compare as plain strings, so a smaller key
/// means a higher priority and prefix matches sort ahead of the rest.
pub(crate) fn camel_sort_key(query: &str, label: &str) -> String {
    let folded_label = label.to_lowercase();
    let rank = if folded_label.starts_with(&query.to_lowercase()) {
        '0'
    } else {
        '1'
    };

    let mut key = String::with_capacity(folded_label.len() + 1);
    key.push(rank);
    key.push_str(&folded_label);
    key
}
58
/// Tell whether `name` is one of the PHP built-in functions known to this
/// server.
///
/// The lookup is an exact, case-sensitive comparison against a fixed table.
/// Used by hover to generate php.net links.
pub(crate) fn is_php_builtin(name: &str) -> bool {
    // Kept in ascending byte order: the lookup below is a binary search.
    // Note that `_` sorts before every lowercase letter.
    const BUILTINS: &[&str] = &[
        "abs",
        "acos",
        "addslashes",
        "array_chunk",
        "array_combine",
        "array_diff",
        "array_fill",
        "array_fill_keys",
        "array_filter",
        "array_flip",
        "array_intersect",
        "array_key_exists",
        "array_keys",
        "array_map",
        "array_merge",
        "array_pad",
        "array_pop",
        "array_push",
        "array_reduce",
        "array_replace",
        "array_reverse",
        "array_search",
        "array_shift",
        "array_slice",
        "array_splice",
        "array_unique",
        "array_unshift",
        "array_values",
        "array_walk",
        "array_walk_recursive",
        "arsort",
        "asin",
        "asort",
        "atan",
        "atan2",
        "base64_decode",
        "base64_encode",
        "basename",
        "boolval",
        "call_user_func",
        "call_user_func_array",
        "ceil",
        "checkdate",
        "class_exists",
        "closedir",
        "compact",
        "constant",
        "copy",
        "cos",
        "date",
        "date_add",
        "date_create",
        "date_diff",
        "date_format",
        "date_sub",
        "define",
        "defined",
        "die",
        "dirname",
        "empty",
        "exit",
        "exp",
        "explode",
        "extract",
        "fclose",
        "feof",
        "fgets",
        "file_exists",
        "file_get_contents",
        "file_put_contents",
        "floatval",
        "floor",
        "fmod",
        "fopen",
        "fputs",
        "fread",
        "fseek",
        "ftell",
        "function_exists",
        "get_class",
        "get_parent_class",
        "gettype",
        "glob",
        "hash",
        "header",
        "headers_sent",
        "htmlentities",
        "htmlspecialchars",
        "http_build_query",
        "implode",
        "in_array",
        "intdiv",
        "interface_exists",
        "intval",
        "is_a",
        "is_array",
        "is_bool",
        "is_callable",
        "is_dir",
        "is_double",
        "is_file",
        "is_finite",
        "is_float",
        "is_infinite",
        "is_int",
        "is_integer",
        "is_long",
        "is_nan",
        "is_null",
        "is_numeric",
        "is_object",
        "is_readable",
        "is_string",
        "is_subclass_of",
        "is_writable",
        "isset",
        "join",
        "json_decode",
        "json_encode",
        "krsort",
        "ksort",
        "lcfirst",
        "list",
        "log",
        "ltrim",
        "max",
        "md5",
        "method_exists",
        "microtime",
        "min",
        "mkdir",
        "mktime",
        "mt_rand",
        "nl2br",
        "number_format",
        "ob_end_clean",
        "ob_get_clean",
        "ob_start",
        "opendir",
        "parse_str",
        "parse_url",
        "pathinfo",
        "pi",
        "pow",
        "preg_match",
        "preg_match_all",
        "preg_quote",
        "preg_replace",
        "preg_split",
        "print_r",
        "printf",
        "property_exists",
        "rand",
        "random_int",
        "rawurldecode",
        "rawurlencode",
        "readdir",
        "realpath",
        "rename",
        "rewind",
        "rmdir",
        "round",
        "rsort",
        "rtrim",
        "scandir",
        "serialize",
        "session_destroy",
        "session_start",
        "setcookie",
        "settype",
        "sha1",
        "sin",
        "sleep",
        "sort",
        "sprintf",
        "sqrt",
        "str_contains",
        "str_ends_with",
        "str_pad",
        "str_repeat",
        "str_replace",
        "str_split",
        "str_starts_with",
        "str_word_count",
        "strcasecmp",
        "strcmp",
        "strip_tags",
        "stripslashes",
        "stristr",
        "strlen",
        "strncasecmp",
        "strncmp",
        "strpos",
        "strrpos",
        "strstr",
        "strtolower",
        "strtotime",
        "strtoupper",
        "strval",
        "substr",
        "substr_count",
        "substr_replace",
        "tan",
        "time",
        "trim",
        "uasort",
        "ucfirst",
        "ucwords",
        "uksort",
        "unlink",
        "unserialize",
        "unset",
        "urldecode",
        "urlencode",
        "usleep",
        "usort",
        "var_dump",
        "var_export",
        "vsprintf",
    ];

    // Debug builds verify the ordering invariant so that a misplaced entry is
    // caught immediately instead of silently becoming unfindable.
    debug_assert!(
        BUILTINS
            .iter()
            .zip(BUILTINS.iter().skip(1))
            .all(|(earlier, later)| earlier <= later),
        "BUILTINS must be sorted for binary_search"
    );

    let lookup = BUILTINS.binary_search(&name);
    lookup.is_ok()
}
290
/// Produce the php.net manual URL for the built-in function `name`.
///
/// The URL is `https://www.php.net/function.` followed by `name` with every
/// underscore turned into a dash, which is the slug form php.net uses.
pub(crate) fn php_doc_url(name: &str) -> String {
    let mut url = String::from("https://www.php.net/function.");
    url.extend(name.chars().map(|c| if c == '_' { '-' } else { c }));
    url
}
297
/// Translate a UTF-16 code unit offset within `s` into a UTF-8 byte offset.
///
/// LSP positions count UTF-16 code units while Rust strings are indexed by
/// UTF-8 bytes. The string is walked character by character, summing UTF-16
/// widths, and the byte index of the first character that begins at or after
/// `utf16_offset` is returned. An offset that lands inside a surrogate pair
/// therefore resolves to the character following the pair, and an offset at
/// or beyond the end of the string resolves to `s.len()`.
pub(crate) fn utf16_offset_to_byte(s: &str, utf16_offset: usize) -> usize {
    let mut units_seen = 0usize;
    s.char_indices()
        .find_map(|(byte_idx, ch)| {
            if units_seen >= utf16_offset {
                Some(byte_idx)
            } else {
                units_seen += ch.len_utf16();
                None
            }
        })
        .unwrap_or(s.len())
}
314
/// Convert a UTF-8 byte offset into a UTF-16 code unit count.
///
/// LSP `Position.character` is measured in UTF-16 code units. Given a string
/// and a byte offset into it, this returns how many UTF-16 units precede that
/// offset, which is the correct LSP character value.
///
/// The function never panics:
/// - an offset past the end of `s` is clamped to `s.len()`;
/// - an offset that falls inside a multi-byte character is rounded down to
///   the start of that character, so only characters lying entirely before
///   the offset are counted.
pub(crate) fn byte_to_utf16(s: &str, byte_offset: usize) -> u32 {
    let mut end = byte_offset.min(s.len());
    // Slicing a `str` at a non-boundary index panics, so back up to the
    // nearest boundary first. Index 0 is always a boundary, which bounds the
    // loop.
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    s[..end].chars().map(|c| c.len_utf16() as u32).sum()
}
326
/// Split a parameter list string on top-level commas.
///
/// A comma only separates parameters when it is outside every bracket pair
/// (`()`, `[]`, `{}`) and outside every quoted string, so default values such
/// as `array $x = [1, 2, 3]`, `string $sep = ','` or `$open = "("` stay in one
/// piece. Inside a single- or double-quoted string a backslash escapes the
/// next character, so `'it\'s'` is read as one string. An unterminated string
/// runs to the end of the input.
///
/// Each returned slice borrows from `s` and is trimmed of leading/trailing
/// whitespace. Empty segments between two commas are kept; an empty (or
/// whitespace-only) final segment is dropped, so an empty input yields an
/// empty vector.
pub(crate) fn split_params(s: &str) -> Vec<&str> {
    let mut parts = Vec::new();
    let mut depth = 0i32;
    let mut start = 0;
    // `Some(delimiter)` while scanning inside a string literal.
    let mut quote: Option<char> = None;
    // True when the previous character inside a string was an unescaped `\`.
    let mut escaped = false;

    for (i, ch) in s.char_indices() {
        if let Some(delimiter) = quote {
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == delimiter {
                quote = None;
            }
            // Brackets and commas inside a string are plain text.
            continue;
        }
        match ch {
            '\'' | '"' => quote = Some(ch),
            '(' | '[' | '{' => depth += 1,
            ')' | ']' | '}' => depth -= 1,
            ',' if depth == 0 => {
                parts.push(s[start..i].trim());
                start = i + ch.len_utf8();
            }
            _ => {}
        }
    }

    let last = s[start..].trim();
    if !last.is_empty() {
        parts.push(last);
    }
    parts
}
352
353/// Extract the word (identifier) under the cursor, handling UTF-16 offsets.
354pub(crate) fn word_at(source: &str, position: Position) -> Option<String> {
355    // Use split('\n') rather than lines() so that a trailing newline produces a
356    // final empty entry — lines() silently drops it, causing word_at to return
357    // None for any cursor on the last line of a normally-saved PHP file.
358    let raw = source.split('\n').nth(position.line as usize)?;
359    let line = raw.strip_suffix('\r').unwrap_or(raw);
360    let char_offset = position.character as usize;
361
362    let chars: Vec<char> = line.chars().collect();
363
364    let mut utf16_len = 0usize;
365    let mut char_pos = 0usize;
366    for ch in &chars {
367        if utf16_len >= char_offset {
368            break;
369        }
370        utf16_len += ch.len_utf16();
371        char_pos += 1;
372    }
373
374    let total_utf16: usize = chars.iter().map(|c| c.len_utf16()).sum();
375    if char_offset > total_utf16 {
376        return None;
377    }
378
379    let is_word = |c: char| c.is_alphanumeric() || c == '_' || c == '$' || c == '\\';
380
381    let mut left = char_pos;
382    while left > 0 && is_word(chars[left - 1]) {
383        left -= 1;
384    }
385
386    let mut right = char_pos;
387    while right < chars.len() && is_word(chars[right]) {
388        right += 1;
389    }
390
391    if left == right {
392        return None;
393    }
394
395    let word: String = chars[left..right].iter().collect();
396    if word.is_empty() { None } else { Some(word) }
397}
398
399/// Return the LSP `Range` of the word (identifier) under the cursor.
400/// Uses the same word-boundary rules as `word_at`.
401pub(crate) fn word_range_at(source: &str, position: Position) -> Option<Range> {
402    let raw = source.split('\n').nth(position.line as usize)?;
403    let line = raw.strip_suffix('\r').unwrap_or(raw);
404    let char_offset = position.character as usize;
405
406    let chars: Vec<char> = line.chars().collect();
407
408    let mut utf16_len = 0usize;
409    let mut char_pos = 0usize;
410    for ch in &chars {
411        if utf16_len >= char_offset {
412            break;
413        }
414        utf16_len += ch.len_utf16();
415        char_pos += 1;
416    }
417
418    let total_utf16: usize = chars.iter().map(|c| c.len_utf16()).sum();
419    if char_offset > total_utf16 {
420        return None;
421    }
422
423    let is_word = |c: char| c.is_alphanumeric() || c == '_' || c == '$' || c == '\\';
424
425    let mut left = char_pos;
426    while left > 0 && is_word(chars[left - 1]) {
427        left -= 1;
428    }
429    let mut right = char_pos;
430    while right < chars.len() && is_word(chars[right]) {
431        right += 1;
432    }
433    if left == right {
434        return None;
435    }
436
437    let start_col = chars[..left]
438        .iter()
439        .map(|c| c.len_utf16() as u32)
440        .sum::<u32>();
441    let end_col = chars[..right]
442        .iter()
443        .map(|c| c.len_utf16() as u32)
444        .sum::<u32>();
445    Some(Range {
446        start: Position {
447            line: position.line,
448            character: start_col,
449        },
450        end: Position {
451            line: position.line,
452            character: end_col,
453        },
454    })
455}
456
457/// Extract the source text covered by an LSP `Range`.
458///
459/// `Range` positions use UTF-16 code-unit offsets; this function converts them
460/// correctly before slicing the UTF-8 source string.
461pub(crate) fn selected_text_range(source: &str, range: tower_lsp::lsp_types::Range) -> String {
462    let lines: Vec<&str> = source.lines().collect();
463    if range.start.line == range.end.line {
464        let line = match lines.get(range.start.line as usize) {
465            Some(l) => l,
466            None => return String::new(),
467        };
468        let start = utf16_offset_to_byte(line, range.start.character as usize);
469        let end = utf16_offset_to_byte(line, range.end.character as usize);
470        line[start..end].to_string()
471    } else {
472        let mut result = String::new();
473        for i in range.start.line..=range.end.line {
474            let line = match lines.get(i as usize) {
475                Some(l) => *l,
476                None => break,
477            };
478            if i == range.start.line {
479                let start = utf16_offset_to_byte(line, range.start.character as usize);
480                result.push_str(&line[start..]);
481            } else if i == range.end.line {
482                let end = utf16_offset_to_byte(line, range.end.character as usize);
483                result.push_str(&line[..end]);
484            } else {
485                result.push_str(line);
486            }
487            if i < range.end.line {
488                result.push('\n');
489            }
490        }
491        result
492    }
493}
494
/// Unit tests for the UTF-8/UTF-16 conversion helpers, `word_at`, and the
/// built-in function table.
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building an LSP position.
    fn pos(line: u32, character: u32) -> Position {
        Position { line, character }
    }

    #[test]
    fn byte_to_utf16_ascii() {
        assert_eq!(byte_to_utf16("hello", 3), 3);
    }

    #[test]
    fn byte_to_utf16_multibyte_bmp() {
        // "é" is U+00E9: 2 bytes in UTF-8, 1 code unit in UTF-16.
        let s = "café";
        assert_eq!(byte_to_utf16(s, 0), 0);
        assert_eq!(byte_to_utf16(s, 3), 3); // up to "caf" (all ASCII)
        assert_eq!(byte_to_utf16(s, 5), 4); // full string (é = 2 bytes → 1 UTF-16 unit)
    }

    #[test]
    fn byte_to_utf16_surrogate_pair() {
        // "😀" is U+1F600: 4 bytes in UTF-8, 2 code units in UTF-16 (surrogate pair).
        let s = "a😀b";
        assert_eq!(byte_to_utf16(s, 1), 1); // after "a"
        assert_eq!(byte_to_utf16(s, 5), 3); // after "a😀" (emoji = 4 bytes → 2 UTF-16 units)
        assert_eq!(byte_to_utf16(s, 6), 4); // full string
    }

    #[test]
    fn byte_to_utf16_past_end_clamps() {
        assert_eq!(byte_to_utf16("hi", 100), 2);
    }

    #[test]
    fn utf16_offset_to_byte_ascii() {
        assert_eq!(utf16_offset_to_byte("hello", 3), 3);
    }

    #[test]
    fn utf16_offset_to_byte_surrogate_pair() {
        // "a😀b": UTF-16 offset 1 → byte 1 (start of emoji), offset 3 → byte 5 (after emoji)
        let s = "a😀b";
        assert_eq!(utf16_offset_to_byte(s, 1), 1);
        assert_eq!(utf16_offset_to_byte(s, 3), 5);
    }

    #[test]
    fn byte_to_utf16_and_back_roundtrip() {
        // Every character start must survive a byte → UTF-16 → byte round trip.
        let s = "café 😀 world";
        for (byte_idx, _) in s.char_indices() {
            let utf16 = byte_to_utf16(s, byte_idx) as usize;
            assert_eq!(utf16_offset_to_byte(s, utf16), byte_idx);
        }
    }

    #[test]
    fn word_at_last_line_with_trailing_newline() {
        // Editors save files with a trailing newline. `word_at` must keep
        // working on the lines before it and must treat the empty final line
        // as a valid, word-less line.
        let src = "<?php\necho strlen($x);\n";

        // Column 6 is inside "strlen" on line 1.
        assert_eq!(
            word_at(src, pos(1, 6)).as_deref(),
            Some("strlen"),
            "word_at must work on lines before the trailing newline"
        );

        // Line 2 is the empty line produced by the trailing newline: there is
        // no word there, and looking it up must not panic.
        assert!(
            word_at(src, pos(2, 0)).is_none(),
            "word_at must return None on the empty line after the trailing newline"
        );
    }

    #[test]
    fn word_at_crlf_line_endings() {
        let src = "<?php\r\nfunction foo() {}\r\n";
        // Column 9 is the start of "foo" on line 1.
        assert_eq!(
            word_at(src, pos(1, 9)).as_deref(),
            Some("foo"),
            "word_at must handle CRLF line endings"
        );
    }

    #[test]
    fn is_php_builtin_asin_recognized() {
        // Regression: asin was out of order in BUILTINS, causing binary_search
        // to miss it. The other names sit next to entries that are easy to
        // misorder.
        assert!(
            is_php_builtin("asin"),
            "asin must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("atan"),
            "atan must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("krsort"),
            "krsort must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("strcasecmp"),
            "strcasecmp must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("strncasecmp"),
            "strncasecmp must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("strip_tags"),
            "strip_tags must be recognised as a PHP builtin"
        );
    }
}
php_lsp/util.rs

php_lsp/
util.rs