Skip to main content

php_lsp/
util.rs

1use tower_lsp::lsp_types::{Position, Range};
2
/// Decide whether `query` is an acceptable abbreviation of `candidate`.
///
/// The checks run in this order and the first success wins:
/// 1. An empty `query` matches everything.
/// 2. The lowercased `candidate` starts with the lowercased `query`.
/// 3. Walking `candidate` left to right, every character of `query` is
///    consumed, in order, by the first character of a "word". A word starts
///    at index 0, directly after `_` or `$`, or at an uppercase letter that
///    directly follows a lowercase one. The comparison ignores case.
///
/// Examples:
/// - `"GRF"` matches `"getRecentFiles"` (word initials `g`, `R`, `F`)
/// - `"str_r"` matches `"str_replace"` (plain prefix)
/// - `"sr"` matches `"str_replace"` (word initials `s`, `r`)
/// - `"srp"` does not match `"str_replace"`: `p` does not start a word
pub(crate) fn fuzzy_camel_match(query: &str, candidate: &str) -> bool {
    if query.is_empty() {
        return true;
    }
    let needle = query.to_lowercase();
    // Cheap check first: ordinary case-insensitive prefix.
    if candidate.to_lowercase().starts_with(&needle) {
        return true;
    }

    // Abbreviation check: query characters may only be consumed by
    // word-initial characters of the candidate.
    let mut pending = needle.chars().peekable();
    let mut previous: Option<char> = None;
    for current in candidate.chars() {
        let wanted = match pending.peek() {
            Some(&c) => c,
            None => break,
        };
        let starts_word = match previous {
            None => true,
            Some('_') | Some('$') => true,
            Some(before) => current.is_uppercase() && before.is_lowercase(),
        };
        if starts_word && current.to_lowercase().next() == Some(wanted) {
            pending.next();
        }
        previous = Some(current);
    }
    pending.peek().is_none()
}
46
/// Build the sort key used to rank a completion `label` against `query`.
///
/// The key is the lowercased label with a one-character rank in front:
/// `0` when the lowercased label starts with the lowercased query, `1`
/// otherwise. Keys are meant to be compared as plain strings, so prefix
/// matches order ahead of camel/underscore abbreviation matches.
pub(crate) fn camel_sort_key(query: &str, label: &str) -> String {
    let folded_label = label.to_lowercase();
    let rank = if folded_label.starts_with(&query.to_lowercase()) {
        '0'
    } else {
        '1'
    };
    let mut key = String::with_capacity(folded_label.len() + 1);
    key.push(rank);
    key.push_str(&folded_label);
    key
}
59
/// Report whether `name` appears in the table of known PHP built-ins.
///
/// The lookup is an exact, case-sensitive comparison against the lowercase
/// names listed below, so `"strlen"` is found but `"StrLen"` is not.
/// Used by hover to generate php.net links.
pub(crate) fn is_php_builtin(name: &str) -> bool {
    // Kept in ascending byte order: the lookup below is a binary search.
    const BUILTINS: &[&str] = &[
        "abs",
        "acos",
        "addslashes",
        "array_chunk",
        "array_combine",
        "array_diff",
        "array_fill",
        "array_fill_keys",
        "array_filter",
        "array_flip",
        "array_intersect",
        "array_key_exists",
        "array_keys",
        "array_map",
        "array_merge",
        "array_pad",
        "array_pop",
        "array_push",
        "array_reduce",
        "array_replace",
        "array_reverse",
        "array_search",
        "array_shift",
        "array_slice",
        "array_splice",
        "array_unique",
        "array_unshift",
        "array_values",
        "array_walk",
        "array_walk_recursive",
        "arsort",
        "asin",
        "asort",
        "atan",
        "atan2",
        "base64_decode",
        "base64_encode",
        "basename",
        "boolval",
        "call_user_func",
        "call_user_func_array",
        "ceil",
        "checkdate",
        "class_exists",
        "closedir",
        "compact",
        "constant",
        "copy",
        "cos",
        "date",
        "date_add",
        "date_create",
        "date_diff",
        "date_format",
        "date_sub",
        "define",
        "defined",
        "die",
        "dirname",
        "empty",
        "exit",
        "exp",
        "explode",
        "extract",
        "fclose",
        "feof",
        "fgets",
        "file_exists",
        "file_get_contents",
        "file_put_contents",
        "floatval",
        "floor",
        "fmod",
        "fopen",
        "fputs",
        "fread",
        "fseek",
        "ftell",
        "function_exists",
        "get_class",
        "get_parent_class",
        "gettype",
        "glob",
        "hash",
        "header",
        "headers_sent",
        "htmlentities",
        "htmlspecialchars",
        "http_build_query",
        "implode",
        "in_array",
        "intdiv",
        "interface_exists",
        "intval",
        "is_a",
        "is_array",
        "is_bool",
        "is_callable",
        "is_dir",
        "is_double",
        "is_file",
        "is_finite",
        "is_float",
        "is_infinite",
        "is_int",
        "is_integer",
        "is_long",
        "is_nan",
        "is_null",
        "is_numeric",
        "is_object",
        "is_readable",
        "is_string",
        "is_subclass_of",
        "is_writable",
        "isset",
        "join",
        "json_decode",
        "json_encode",
        "krsort",
        "ksort",
        "lcfirst",
        "list",
        "log",
        "ltrim",
        "max",
        "md5",
        "method_exists",
        "microtime",
        "min",
        "mkdir",
        "mktime",
        "mt_rand",
        "nl2br",
        "number_format",
        "ob_end_clean",
        "ob_get_clean",
        "ob_start",
        "opendir",
        "parse_str",
        "parse_url",
        "pathinfo",
        "pi",
        "pow",
        "preg_match",
        "preg_match_all",
        "preg_quote",
        "preg_replace",
        "preg_split",
        "print_r",
        "printf",
        "property_exists",
        "rand",
        "random_int",
        "rawurldecode",
        "rawurlencode",
        "readdir",
        "realpath",
        "rename",
        "rewind",
        "rmdir",
        "round",
        "rsort",
        "rtrim",
        "scandir",
        "serialize",
        "session_destroy",
        "session_start",
        "setcookie",
        "settype",
        "sha1",
        "sin",
        "sleep",
        "sort",
        "sprintf",
        "sqrt",
        "str_contains",
        "str_ends_with",
        "str_pad",
        "str_repeat",
        "str_replace",
        "str_split",
        "str_starts_with",
        "str_word_count",
        "strcasecmp",
        "strcmp",
        "strip_tags",
        "stripslashes",
        "stristr",
        "strlen",
        "strncasecmp",
        "strncmp",
        "strpos",
        "strrpos",
        "strstr",
        "strtolower",
        "strtotime",
        "strtoupper",
        "strval",
        "substr",
        "substr_count",
        "substr_replace",
        "tan",
        "time",
        "trim",
        "uasort",
        "ucfirst",
        "ucwords",
        "uksort",
        "unlink",
        "unserialize",
        "unset",
        "urldecode",
        "urlencode",
        "usleep",
        "usort",
        "var_dump",
        "var_export",
        "vsprintf",
    ];
    // Debug builds verify the ordering on every call; a misplaced entry would
    // otherwise make the binary search miss it silently.
    debug_assert!(
        BUILTINS
            .iter()
            .zip(BUILTINS.iter().skip(1))
            .all(|(earlier, later)| earlier <= later),
        "BUILTINS must be sorted for binary_search"
    );
    matches!(BUILTINS.binary_search(&name), Ok(_))
}
291
/// Build the php.net documentation URL for a built-in function name.
///
/// The result is `https://www.php.net/function.` followed by `name` with
/// every `_` turned into `-`; no other character is altered.
pub(crate) fn php_doc_url(name: &str) -> String {
    let mut url = String::from("https://www.php.net/function.");
    // php.net spells function slugs with dashes where PHP uses underscores.
    url.extend(name.chars().map(|c| if c == '_' { '-' } else { c }));
    url
}
298
/// Translate a UTF-16 code unit offset within `s` into a UTF-8 byte offset.
///
/// LSP positions count UTF-16 code units while Rust strings are indexed by
/// UTF-8 bytes. The result is the byte index of the first character whose
/// preceding characters add up to at least `utf16_offset` UTF-16 units, so an
/// offset that points into the middle of a surrogate pair resolves to the
/// character after it. Offsets beyond the end of the string yield `s.len()`.
pub(crate) fn utf16_offset_to_byte(s: &str, utf16_offset: usize) -> usize {
    let mut units_before = 0usize;
    s.char_indices()
        .find(|&(_, ch)| {
            // Test against the units that precede this character, then
            // account for the character itself.
            let reached = units_before >= utf16_offset;
            units_before += ch.len_utf16();
            reached
        })
        .map_or(s.len(), |(byte_idx, _)| byte_idx)
}
315
/// Convert a UTF-8 byte offset into a UTF-16 code unit count.
///
/// LSP `Position.character` is measured in UTF-16 code units.  Given a string
/// and a byte offset into it, this returns how many UTF-16 units precede that
/// offset — which is the correct LSP character value.
///
/// The offset is clamped rather than trusted: values past the end of `s` are
/// treated as `s.len()`, and an offset that lands inside a multi-byte
/// character is moved back to the start of that character, so the partially
/// covered character is not counted. The function never panics.
pub(crate) fn byte_to_utf16(s: &str, byte_offset: usize) -> u32 {
    let mut end = byte_offset.min(s.len());
    // Slicing a `str` at a non-boundary index panics, so back up to the
    // nearest boundary. Index 0 is always a boundary, which bounds the loop.
    while !s.is_char_boundary(end) {
        end -= 1;
    }
    s[..end].chars().map(|c| c.len_utf16() as u32).sum()
}
327
/// Split a parameter list string on commas, respecting bracket nesting and
/// quoted string literals.
///
/// A comma only separates parameters when it sits outside every `()`, `[]`,
/// `{}` pair and outside single- or double-quoted strings. This avoids
/// splitting inside default values like `array $x = [1, 2, 3]` or
/// `string $sep = ', '`. Inside a quoted string a backslash escapes the next
/// character, and brackets are not counted towards nesting. Other literal
/// forms (heredoc/nowdoc) are not recognised.
///
/// Each returned slice is trimmed of leading/trailing whitespace. An empty
/// final segment (empty input or a trailing comma) is dropped; empty segments
/// elsewhere are kept.
pub(crate) fn split_params(s: &str) -> Vec<&str> {
    let mut parts = Vec::new();
    let mut depth = 0i32;
    let mut start = 0;
    // `Some(q)` while scanning inside a string literal opened by quote `q`.
    let mut open_quote: Option<char> = None;
    let mut escaped = false;
    for (i, ch) in s.char_indices() {
        if let Some(quote) = open_quote {
            if escaped {
                escaped = false;
            } else if ch == '\\' {
                escaped = true;
            } else if ch == quote {
                open_quote = None;
            }
            // Nothing inside a string literal can separate or nest.
            continue;
        }
        match ch {
            '\'' | '"' => open_quote = Some(ch),
            '(' | '[' | '{' => depth += 1,
            ')' | ']' | '}' => depth -= 1,
            ',' if depth == 0 => {
                parts.push(s[start..i].trim());
                start = i + ch.len_utf8();
            }
            _ => {}
        }
    }
    let last = s[start..].trim();
    if !last.is_empty() {
        parts.push(last);
    }
    parts
}
353
/// Locate the word (identifier) touching the cursor on a single line.
///
/// `char_offset` is the cursor column in UTF-16 code units. The result is a
/// half-open `(start, end)` pair of `char` indices (not bytes, not UTF-16
/// units) into `line`. Word characters are alphanumerics plus `_`, `$` and
/// `\`. Because the scan extends both left and right of the cursor, a cursor
/// placed directly after a word still selects that word.
///
/// Returns `None` when the offset lies beyond the end of the line or when no
/// word character is adjacent to the cursor.
fn char_range_for_word(line: &str, char_offset: usize) -> Option<(usize, usize)> {
    fn is_word_char(c: char) -> bool {
        c.is_alphanumeric() || matches!(c, '_' | '$' | '\\')
    }

    let chars: Vec<char> = line.chars().collect();

    // Advance over characters until at least `char_offset` UTF-16 units have
    // been covered; `cursor` ends up as the matching `char` index.
    let mut units = 0usize;
    let mut cursor = 0usize;
    while cursor < chars.len() && units < char_offset {
        units += chars[cursor].len_utf16();
        cursor += 1;
    }
    // Still short of the target means the whole line was consumed, i.e. the
    // requested column is past the end of the line.
    if units < char_offset {
        return None;
    }

    // Grow outwards to the nearest non-word character on each side.
    let left = chars[..cursor]
        .iter()
        .rposition(|&c| !is_word_char(c))
        .map_or(0, |idx| idx + 1);
    let right = chars[cursor..]
        .iter()
        .position(|&c| !is_word_char(c))
        .map_or(chars.len(), |idx| cursor + idx);

    if left != right {
        Some((left, right))
    } else {
        None
    }
}
385
386pub(crate) fn word_at_position(source: &str, position: Position) -> Option<String> {
387    // Use split('\n') rather than lines() so that a trailing newline produces a
388    // final empty entry — lines() silently drops it, causing word_at_position to return
389    // None for any cursor on the last line of a normally-saved PHP file.
390    let raw = source.split('\n').nth(position.line as usize)?;
391    let line = raw.strip_suffix('\r').unwrap_or(raw);
392    let char_offset = position.character as usize;
393    let chars: Vec<char> = line.chars().collect();
394    let (left, right) = char_range_for_word(line, char_offset)?;
395    let word: String = chars[left..right].iter().collect();
396    if word.is_empty() { None } else { Some(word) }
397}
398
399/// Return the LSP `Range` of the word (identifier) under the cursor.
400/// Uses the same word-boundary rules as `word_at_position`.
401pub(crate) fn word_range_at(source: &str, position: Position) -> Option<Range> {
402    let raw = source.split('\n').nth(position.line as usize)?;
403    let line = raw.strip_suffix('\r').unwrap_or(raw);
404    let char_offset = position.character as usize;
405    let chars: Vec<char> = line.chars().collect();
406    let (left, right) = char_range_for_word(line, char_offset)?;
407    let start_col = chars[..left]
408        .iter()
409        .map(|c| c.len_utf16() as u32)
410        .sum::<u32>();
411    let end_col = chars[..right]
412        .iter()
413        .map(|c| c.len_utf16() as u32)
414        .sum::<u32>();
415    Some(Range {
416        start: Position {
417            line: position.line,
418            character: start_col,
419        },
420        end: Position {
421            line: position.line,
422            character: end_col,
423        },
424    })
425}
426
427/// Extract the source text covered by an LSP `Range`.
428///
429/// `Range` positions use UTF-16 code-unit offsets; this function converts them
430/// correctly before slicing the UTF-8 source string.
431pub(crate) fn selected_text_range(source: &str, range: tower_lsp::lsp_types::Range) -> String {
432    let lines: Vec<&str> = source.lines().collect();
433    if range.start.line == range.end.line {
434        let line = match lines.get(range.start.line as usize) {
435            Some(l) => l,
436            None => return String::new(),
437        };
438        let start = utf16_offset_to_byte(line, range.start.character as usize);
439        let end = utf16_offset_to_byte(line, range.end.character as usize);
440        line[start..end].to_string()
441    } else {
442        let mut result = String::new();
443        for i in range.start.line..=range.end.line {
444            let line = match lines.get(i as usize) {
445                Some(l) => *l,
446                None => break,
447            };
448            if i == range.start.line {
449                let start = utf16_offset_to_byte(line, range.start.character as usize);
450                result.push_str(&line[start..]);
451            } else if i == range.end.line {
452                let end = utf16_offset_to_byte(line, range.end.character as usize);
453                result.push_str(&line[..end]);
454            } else {
455                result.push_str(line);
456            }
457            if i < range.end.line {
458                result.push('\n');
459            }
460        }
461        result
462    }
463}
464
/// Return the length of `s` measured in UTF-16 code units.
///
/// LSP `Position.character` values are UTF-16 offsets, so this is the column
/// just past the end of `s`. Characters outside the Basic Multilingual Plane
/// count as two units.
pub fn utf16_code_units(s: &str) -> u32 {
    s.chars()
        .fold(0u32, |total, c| total + c.len_utf16() as u32)
}
470
/// Remove one leading `$` sigil from a variable name, if present.
///
/// Variables are stored both ways: `$var` in source, `var` in symbol tables.
/// Only the first `$` is removed (`"$$x"` becomes `"$x"`); a name without a
/// sigil is returned unchanged.
pub fn strip_variable_sigil(word: &str) -> &str {
    match word.strip_prefix('$') {
        Some(bare) => bare,
        None => word,
    }
}
476
// Unit tests for the UTF-8/UTF-16 offset helpers, cursor word lookup and the
// built-in function table.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn byte_to_utf16_ascii() {
        assert_eq!(byte_to_utf16("hello", 3), 3);
    }

    #[test]
    fn byte_to_utf16_multibyte_bmp() {
        // "é" is U+00E9: 2 bytes in UTF-8, 1 code unit in UTF-16.
        let s = "café";
        assert_eq!(byte_to_utf16(s, 0), 0);
        assert_eq!(byte_to_utf16(s, 3), 3); // up to "caf" (all ASCII)
        assert_eq!(byte_to_utf16(s, 5), 4); // full string (é = 2 bytes → 1 UTF-16 unit)
    }

    #[test]
    fn byte_to_utf16_surrogate_pair() {
        // "😀" is U+1F600: 4 bytes in UTF-8, 2 code units in UTF-16 (surrogate pair).
        let s = "a😀b";
        assert_eq!(byte_to_utf16(s, 1), 1); // after "a"
        assert_eq!(byte_to_utf16(s, 5), 3); // after "a😀" (emoji = 4 bytes → 2 UTF-16 units)
        assert_eq!(byte_to_utf16(s, 6), 4); // full string
    }

    #[test]
    fn byte_to_utf16_past_end_clamps() {
        assert_eq!(byte_to_utf16("hi", 100), 2);
    }

    #[test]
    fn utf16_offset_to_byte_ascii() {
        assert_eq!(utf16_offset_to_byte("hello", 3), 3);
    }

    #[test]
    fn utf16_offset_to_byte_surrogate_pair() {
        // "a😀b": UTF-16 offset 1 → byte 1 (start of emoji), offset 3 → byte 5 (after emoji)
        let s = "a😀b";
        assert_eq!(utf16_offset_to_byte(s, 1), 1);
        assert_eq!(utf16_offset_to_byte(s, 3), 5);
    }

    #[test]
    fn byte_to_utf16_and_back_roundtrip() {
        // Every char boundary must survive a byte → UTF-16 → byte round trip.
        let s = "café 😀 world";
        for (byte_idx, _) in s.char_indices() {
            let utf16 = byte_to_utf16(s, byte_idx) as usize;
            assert_eq!(utf16_offset_to_byte(s, utf16), byte_idx);
        }
    }

    #[test]
    fn word_at_last_line_with_trailing_newline() {
        // Editors save files with a trailing newline; lines() drops the final
        // empty entry, which is why word_at_position splits on '\n' instead.
        let src = "<?php\necho strlen($x);\n";
        let pos = Position {
            line: 1,
            character: 6,
        }; // "strlen" on line 1
        let w = word_at_position(src, pos);
        assert_eq!(
            w.as_deref(),
            Some("strlen"),
            "word_at_position must work on lines before the trailing newline"
        );
        // Position on the final empty line produced by the trailing newline.
        let last_line = Position {
            line: 2,
            character: 0,
        };
        // The line exists but is empty: no panic, and no word to report.
        assert_eq!(
            word_at_position(src, last_line),
            None,
            "an empty final line must yield no word"
        );
    }

    #[test]
    fn word_at_crlf_line_endings() {
        let src = "<?php\r\nfunction foo() {}\r\n";
        let pos = Position {
            line: 1,
            character: 9,
        }; // "foo"
        let w = word_at_position(src, pos);
        assert_eq!(
            w.as_deref(),
            Some("foo"),
            "word_at_position must handle CRLF line endings"
        );
    }

    #[test]
    fn is_php_builtin_asin_recognized() {
        // asin was out of order in BUILTINS, causing binary_search to miss it.
        assert!(
            is_php_builtin("asin"),
            "asin must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("atan"),
            "atan must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("krsort"),
            "krsort must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("strcasecmp"),
            "strcasecmp must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("strncasecmp"),
            "strncasecmp must be recognised as a PHP builtin"
        );
        assert!(
            is_php_builtin("strip_tags"),
            "strip_tags must be recognised as a PHP builtin"
        );
    }
}