reovim_kernel/mm/
word.rs

1//! Word boundary detection for text navigation.
2//!
3//! This module provides pure functions for classifying characters and
4//! finding word boundaries. It supports both "small words" (w/b motions)
5//! and "big words" (W/B motions) as defined by vim.
6//!
7//! # Design Philosophy
8//!
9//! Following the kernel "mechanism, not policy" principle:
10//! - Pure functions operating on `&[char]` slices
11//! - No buffer or position knowledge
12//! - No movement commands (that's policy in modules)
13//!
14//! # Word Types
15//!
16//! - **Small words** (`w`, `b`, `e`): Sequences of word characters (alphanumeric + underscore)
17//!   or sequences of punctuation, separated by whitespace or character type changes.
18//! - **Big words** (`W`, `B`, `E`): Any non-whitespace sequences, separated only by whitespace.
19//!
20//! # Example
21//!
22//! ```
23//! use reovim_kernel::api::v1::*;
24//!
25//! let text: Vec<char> = "hello_world foo.bar".chars().collect();
26//!
27//! // Find word boundaries for small word at position 0
28//! let (start, end) = word_bounds(&text, 0, WordType::Small);
29//! assert_eq!(start, 0);
30//! assert_eq!(end, 10); // "hello_world"
31//!
32//! // Big word treats foo.bar as one word
33//! let (start, end) = word_bounds(&text, 12, WordType::Big);
34//! assert_eq!(start, 12);
35//! assert_eq!(end, 18); // "foo.bar"
36//! ```
37
/// The class a single character belongs to when scanning for word boundaries.
///
/// Every character falls into exactly one of three classes:
/// - `Word`: alphanumeric characters and the underscore ("keyword" characters)
/// - `Punctuation`: anything that is neither whitespace nor a word character
/// - `Whitespace`: spaces, tabs, newlines, and other whitespace
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum CharKind {
    /// An alphanumeric character or `_`.
    Word,
    /// A non-whitespace character that is not a word character.
    Punctuation,
    /// A whitespace character such as a space, tab, or newline.
    Whitespace,
}
53
/// Selects which notion of "word" the boundary functions use.
///
/// - `Small`: the traditional vim "word" used by the `w`/`b`/`e` motions
/// - `Big`: the traditional vim "WORD" used by the `W`/`B`/`E` motions
///
/// `Small` is the default.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash)]
pub enum WordType {
    /// Small word: a run of word characters; punctuation forms its own runs.
    #[default]
    Small,
    /// Big word: any run of non-whitespace characters.
    Big,
}
67
68/// Classify a character.
69///
70/// # Examples
71///
72/// ```
73/// use reovim_kernel::api::v1::*;
74///
75/// assert_eq!(char_kind('a'), CharKind::Word);
76/// assert_eq!(char_kind('_'), CharKind::Word);
77/// assert_eq!(char_kind('5'), CharKind::Word);
78/// assert_eq!(char_kind('.'), CharKind::Punctuation);
79/// assert_eq!(char_kind(' '), CharKind::Whitespace);
80/// ```
81#[must_use]
82pub fn char_kind(c: char) -> CharKind {
83    if c.is_whitespace() {
84        CharKind::Whitespace
85    } else if c.is_alphanumeric() || c == '_' {
86        CharKind::Word
87    } else {
88        CharKind::Punctuation
89    }
90}
91
92/// Find the start of the word containing the given position.
93///
94/// Searches backward from `pos` to find where the current word begins.
95/// The definition of "word" depends on `word_type`.
96///
97/// # Arguments
98///
99/// * `chars` - The character slice to search
100/// * `pos` - The starting position (0-indexed)
101/// * `word_type` - Whether to use small or big word semantics
102///
103/// # Returns
104///
105/// The index of the first character of the word, or 0 if at start.
106///
107/// # Examples
108///
109/// ```
110/// use reovim_kernel::api::v1::*;
111///
112/// let text: Vec<char> = "hello world".chars().collect();
113/// assert_eq!(word_start(&text, 3, WordType::Small), 0); // 'l' is part of "hello"
114/// assert_eq!(word_start(&text, 8, WordType::Small), 6); // 'r' is part of "world"
115/// ```
116#[must_use]
117pub fn word_start(chars: &[char], pos: usize, word_type: WordType) -> usize {
118    if chars.is_empty() || pos == 0 {
119        return 0;
120    }
121
122    let pos = pos.min(chars.len() - 1);
123    let current_kind = char_kind(chars[pos]);
124
125    // For whitespace, we're not in a word
126    if current_kind == CharKind::Whitespace {
127        return pos;
128    }
129
130    let matches = |c: char| match word_type {
131        WordType::Small => char_kind(c) == current_kind,
132        WordType::Big => !c.is_whitespace(),
133    };
134
135    let mut idx = pos;
136    while idx > 0 && matches(chars[idx - 1]) {
137        idx -= 1;
138    }
139    idx
140}
141
142/// Find the end of the word containing the given position.
143///
144/// Searches forward from `pos` to find where the current word ends.
145/// The definition of "word" depends on `word_type`.
146///
147/// # Arguments
148///
149/// * `chars` - The character slice to search
150/// * `pos` - The starting position (0-indexed)
151/// * `word_type` - Whether to use small or big word semantics
152///
153/// # Returns
154///
155/// The index of the last character of the word.
156///
157/// # Examples
158///
159/// ```
160/// use reovim_kernel::api::v1::*;
161///
162/// let text: Vec<char> = "hello world".chars().collect();
163/// assert_eq!(word_end(&text, 0, WordType::Small), 4); // "hello" ends at 4
164/// assert_eq!(word_end(&text, 6, WordType::Small), 10); // "world" ends at 10
165/// ```
166#[must_use]
167pub fn word_end(chars: &[char], pos: usize, word_type: WordType) -> usize {
168    if chars.is_empty() {
169        return 0;
170    }
171
172    let pos = pos.min(chars.len() - 1);
173    let current_kind = char_kind(chars[pos]);
174
175    // For whitespace, we're not in a word
176    if current_kind == CharKind::Whitespace {
177        return pos;
178    }
179
180    let matches = |c: char| match word_type {
181        WordType::Small => char_kind(c) == current_kind,
182        WordType::Big => !c.is_whitespace(),
183    };
184
185    let mut idx = pos;
186    while idx < chars.len() - 1 && matches(chars[idx + 1]) {
187        idx += 1;
188    }
189    idx
190}
191
192/// Find both word boundaries around a position.
193///
194/// Convenience function that returns both the start and end of the word
195/// containing the given position.
196///
197/// # Arguments
198///
199/// * `chars` - The character slice to search
200/// * `pos` - The position within the word
201/// * `word_type` - Whether to use small or big word semantics
202///
203/// # Returns
204///
205/// A tuple of (`start_index`, `end_index`) for the word.
206///
207/// # Examples
208///
209/// ```
210/// use reovim_kernel::api::v1::*;
211///
212/// let text: Vec<char> = "hello world".chars().collect();
213/// let (start, end) = word_bounds(&text, 2, WordType::Small);
214/// assert_eq!(start, 0);
215/// assert_eq!(end, 4);
216/// ```
217#[must_use]
218pub fn word_bounds(chars: &[char], pos: usize, word_type: WordType) -> (usize, usize) {
219    (word_start(chars, pos, word_type), word_end(chars, pos, word_type))
220}
221
222/// Find the start of the next word.
223///
224/// Searches forward from `pos` to find the beginning of the next word.
225/// Skips over the current word and any whitespace.
226///
227/// # Arguments
228///
229/// * `chars` - The character slice to search
230/// * `pos` - The starting position
231/// * `word_type` - Whether to use small or big word semantics
232///
233/// # Returns
234///
235/// The index of the first character of the next word, or `chars.len()`
236/// if no next word exists.
237#[must_use]
238#[cfg_attr(coverage_nightly, coverage(off))]
239pub fn next_word_start(chars: &[char], pos: usize, word_type: WordType) -> usize {
240    if chars.is_empty() {
241        return 0;
242    }
243
244    let mut idx = pos.min(chars.len() - 1);
245    let current_kind = char_kind(chars[idx]);
246
247    // Skip current word (or whitespace)
248    match word_type {
249        WordType::Small => {
250            // Skip same-kind characters
251            while idx < chars.len() && char_kind(chars[idx]) == current_kind {
252                idx += 1;
253            }
254        }
255        WordType::Big => {
256            // Skip non-whitespace
257            if current_kind != CharKind::Whitespace {
258                while idx < chars.len() && !chars[idx].is_whitespace() {
259                    idx += 1;
260                }
261            }
262        }
263    }
264
265    // Skip whitespace
266    while idx < chars.len() && chars[idx].is_whitespace() {
267        idx += 1;
268    }
269
270    idx
271}
272
273/// Find the end of the next word.
274///
275/// Searches forward from `pos` to find the end of the next word.
276/// If already at a word end, moves to the end of the following word.
277///
278/// # Arguments
279///
280/// * `chars` - The character slice to search
281/// * `pos` - The starting position
282/// * `word_type` - Whether to use small or big word semantics
283///
284/// # Returns
285///
286/// The index of the last character of the next word.
287#[must_use]
288#[cfg_attr(coverage_nightly, coverage(off))]
289pub fn next_word_end(chars: &[char], pos: usize, word_type: WordType) -> usize {
290    if chars.is_empty() {
291        return 0;
292    }
293
294    let mut idx = pos.min(chars.len() - 1);
295
296    // If not at end of current word, go to end of current word
297    if idx < chars.len() - 1 {
298        let current_kind = char_kind(chars[idx]);
299        let next_kind = char_kind(chars[idx + 1]);
300
301        let same_word = match word_type {
302            WordType::Small => current_kind == next_kind && current_kind != CharKind::Whitespace,
303            WordType::Big => {
304                current_kind != CharKind::Whitespace && next_kind != CharKind::Whitespace
305            }
306        };
307
308        if same_word {
309            // Move to end of current word
310            return word_end(chars, idx, word_type);
311        }
312    }
313
314    // Move to start of next word, then find its end
315    idx += 1;
316    while idx < chars.len() && chars[idx].is_whitespace() {
317        idx += 1;
318    }
319
320    if idx >= chars.len() {
321        return chars.len().saturating_sub(1);
322    }
323
324    word_end(chars, idx, word_type)
325}
reovim_kernel/mm/word.rs

reovim_kernel/mm/
word.rs