reovim_kernel/mm/word.rs
1//! Word boundary detection for text navigation.
2//!
3//! This module provides pure functions for classifying characters and
4//! finding word boundaries. It supports both "small words" (w/b motions)
5//! and "big words" (W/B motions) as defined by vim.
6//!
7//! # Design Philosophy
8//!
9//! Following the kernel "mechanism, not policy" principle:
10//! - Pure functions operating on `&[char]` slices
11//! - No buffer or position knowledge
12//! - No movement commands (that's policy in modules)
13//!
14//! # Word Types
15//!
16//! - **Small words** (`w`, `b`, `e`): Sequences of word characters (alphanumeric + underscore)
17//! or sequences of punctuation, separated by whitespace or character type changes.
18//! - **Big words** (`W`, `B`, `E`): Any non-whitespace sequences, separated only by whitespace.
19//!
20//! # Example
21//!
22//! ```
23//! use reovim_kernel::api::v1::*;
24//!
25//! let text: Vec<char> = "hello_world foo.bar".chars().collect();
26//!
27//! // Find word boundaries for small word at position 0
28//! let (start, end) = word_bounds(&text, 0, WordType::Small);
29//! assert_eq!(start, 0);
30//! assert_eq!(end, 10); // "hello_world"
31//!
32//! // Big word treats foo.bar as one word
33//! let (start, end) = word_bounds(&text, 12, WordType::Big);
34//! assert_eq!(start, 12);
35//! assert_eq!(end, 18); // "foo.bar"
36//! ```
37
/// Character classification for word boundary detection.
///
/// Every character is assigned exactly one of three categories:
/// - `Word`: alphanumeric characters and underscore (the "keyword" characters)
/// - `Punctuation`: anything that is neither whitespace nor a word character
/// - `Whitespace`: spaces, tabs, newlines, etc.
///
/// Small-word scanning compares these kinds to decide where one word stops
/// and the next begins; big-word scanning only distinguishes whitespace from
/// everything else.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CharKind {
    /// Word character: alphanumeric or underscore.
    Word,
    /// Punctuation: any character that is neither whitespace nor a word character.
    Punctuation,
    /// Whitespace: space, tab, newline, etc.
    Whitespace,
}
53
/// Word type for boundary detection.
///
/// Selects how word boundaries are calculated:
/// - `Small`: traditional vim "word" (w/b/e motions) — a run of characters
///   that all share the same [`CharKind`]
/// - `Big`: traditional vim "WORD" (W/B/E motions) — a run of non-whitespace
///   characters
///
/// `Small` is the default variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
pub enum WordType {
    /// Small word: word characters only, punctuation is separate.
    #[default]
    Small,
    /// Big word: any non-whitespace sequence.
    Big,
}
67
68/// Classify a character.
69///
70/// # Examples
71///
72/// ```
73/// use reovim_kernel::api::v1::*;
74///
75/// assert_eq!(char_kind('a'), CharKind::Word);
76/// assert_eq!(char_kind('_'), CharKind::Word);
77/// assert_eq!(char_kind('5'), CharKind::Word);
78/// assert_eq!(char_kind('.'), CharKind::Punctuation);
79/// assert_eq!(char_kind(' '), CharKind::Whitespace);
80/// ```
81#[must_use]
82pub fn char_kind(c: char) -> CharKind {
83 if c.is_whitespace() {
84 CharKind::Whitespace
85 } else if c.is_alphanumeric() || c == '_' {
86 CharKind::Word
87 } else {
88 CharKind::Punctuation
89 }
90}
91
92/// Find the start of the word containing the given position.
93///
94/// Searches backward from `pos` to find where the current word begins.
95/// The definition of "word" depends on `word_type`.
96///
97/// # Arguments
98///
99/// * `chars` - The character slice to search
100/// * `pos` - The starting position (0-indexed)
101/// * `word_type` - Whether to use small or big word semantics
102///
103/// # Returns
104///
105/// The index of the first character of the word, or 0 if at start.
106///
107/// # Examples
108///
109/// ```
110/// use reovim_kernel::api::v1::*;
111///
112/// let text: Vec<char> = "hello world".chars().collect();
113/// assert_eq!(word_start(&text, 3, WordType::Small), 0); // 'l' is part of "hello"
114/// assert_eq!(word_start(&text, 8, WordType::Small), 6); // 'r' is part of "world"
115/// ```
116#[must_use]
117pub fn word_start(chars: &[char], pos: usize, word_type: WordType) -> usize {
118 if chars.is_empty() || pos == 0 {
119 return 0;
120 }
121
122 let pos = pos.min(chars.len() - 1);
123 let current_kind = char_kind(chars[pos]);
124
125 // For whitespace, we're not in a word
126 if current_kind == CharKind::Whitespace {
127 return pos;
128 }
129
130 let matches = |c: char| match word_type {
131 WordType::Small => char_kind(c) == current_kind,
132 WordType::Big => !c.is_whitespace(),
133 };
134
135 let mut idx = pos;
136 while idx > 0 && matches(chars[idx - 1]) {
137 idx -= 1;
138 }
139 idx
140}
141
142/// Find the end of the word containing the given position.
143///
144/// Searches forward from `pos` to find where the current word ends.
145/// The definition of "word" depends on `word_type`.
146///
147/// # Arguments
148///
149/// * `chars` - The character slice to search
150/// * `pos` - The starting position (0-indexed)
151/// * `word_type` - Whether to use small or big word semantics
152///
153/// # Returns
154///
155/// The index of the last character of the word.
156///
157/// # Examples
158///
159/// ```
160/// use reovim_kernel::api::v1::*;
161///
162/// let text: Vec<char> = "hello world".chars().collect();
163/// assert_eq!(word_end(&text, 0, WordType::Small), 4); // "hello" ends at 4
164/// assert_eq!(word_end(&text, 6, WordType::Small), 10); // "world" ends at 10
165/// ```
166#[must_use]
167pub fn word_end(chars: &[char], pos: usize, word_type: WordType) -> usize {
168 if chars.is_empty() {
169 return 0;
170 }
171
172 let pos = pos.min(chars.len() - 1);
173 let current_kind = char_kind(chars[pos]);
174
175 // For whitespace, we're not in a word
176 if current_kind == CharKind::Whitespace {
177 return pos;
178 }
179
180 let matches = |c: char| match word_type {
181 WordType::Small => char_kind(c) == current_kind,
182 WordType::Big => !c.is_whitespace(),
183 };
184
185 let mut idx = pos;
186 while idx < chars.len() - 1 && matches(chars[idx + 1]) {
187 idx += 1;
188 }
189 idx
190}
191
192/// Find both word boundaries around a position.
193///
194/// Convenience function that returns both the start and end of the word
195/// containing the given position.
196///
197/// # Arguments
198///
199/// * `chars` - The character slice to search
200/// * `pos` - The position within the word
201/// * `word_type` - Whether to use small or big word semantics
202///
203/// # Returns
204///
205/// A tuple of (`start_index`, `end_index`) for the word.
206///
207/// # Examples
208///
209/// ```
210/// use reovim_kernel::api::v1::*;
211///
212/// let text: Vec<char> = "hello world".chars().collect();
213/// let (start, end) = word_bounds(&text, 2, WordType::Small);
214/// assert_eq!(start, 0);
215/// assert_eq!(end, 4);
216/// ```
217#[must_use]
218pub fn word_bounds(chars: &[char], pos: usize, word_type: WordType) -> (usize, usize) {
219 (word_start(chars, pos, word_type), word_end(chars, pos, word_type))
220}
221
222/// Find the start of the next word.
223///
224/// Searches forward from `pos` to find the beginning of the next word.
225/// Skips over the current word and any whitespace.
226///
227/// # Arguments
228///
229/// * `chars` - The character slice to search
230/// * `pos` - The starting position
231/// * `word_type` - Whether to use small or big word semantics
232///
233/// # Returns
234///
235/// The index of the first character of the next word, or `chars.len()`
236/// if no next word exists.
237#[must_use]
238#[cfg_attr(coverage_nightly, coverage(off))]
239pub fn next_word_start(chars: &[char], pos: usize, word_type: WordType) -> usize {
240 if chars.is_empty() {
241 return 0;
242 }
243
244 let mut idx = pos.min(chars.len() - 1);
245 let current_kind = char_kind(chars[idx]);
246
247 // Skip current word (or whitespace)
248 match word_type {
249 WordType::Small => {
250 // Skip same-kind characters
251 while idx < chars.len() && char_kind(chars[idx]) == current_kind {
252 idx += 1;
253 }
254 }
255 WordType::Big => {
256 // Skip non-whitespace
257 if current_kind != CharKind::Whitespace {
258 while idx < chars.len() && !chars[idx].is_whitespace() {
259 idx += 1;
260 }
261 }
262 }
263 }
264
265 // Skip whitespace
266 while idx < chars.len() && chars[idx].is_whitespace() {
267 idx += 1;
268 }
269
270 idx
271}
272
273/// Find the end of the next word.
274///
275/// Searches forward from `pos` to find the end of the next word.
276/// If already at a word end, moves to the end of the following word.
277///
278/// # Arguments
279///
280/// * `chars` - The character slice to search
281/// * `pos` - The starting position
282/// * `word_type` - Whether to use small or big word semantics
283///
284/// # Returns
285///
286/// The index of the last character of the next word.
287#[must_use]
288#[cfg_attr(coverage_nightly, coverage(off))]
289pub fn next_word_end(chars: &[char], pos: usize, word_type: WordType) -> usize {
290 if chars.is_empty() {
291 return 0;
292 }
293
294 let mut idx = pos.min(chars.len() - 1);
295
296 // If not at end of current word, go to end of current word
297 if idx < chars.len() - 1 {
298 let current_kind = char_kind(chars[idx]);
299 let next_kind = char_kind(chars[idx + 1]);
300
301 let same_word = match word_type {
302 WordType::Small => current_kind == next_kind && current_kind != CharKind::Whitespace,
303 WordType::Big => {
304 current_kind != CharKind::Whitespace && next_kind != CharKind::Whitespace
305 }
306 };
307
308 if same_word {
309 // Move to end of current word
310 return word_end(chars, idx, word_type);
311 }
312 }
313
314 // Move to start of next word, then find its end
315 idx += 1;
316 while idx < chars.len() && chars[idx].is_whitespace() {
317 idx += 1;
318 }
319
320 if idx >= chars.len() {
321 return chars.len().saturating_sub(1);
322 }
323
324 word_end(chars, idx, word_type)
325}