/// Reports whether `pos` lies on a boundary between a word run and a non-word run.
///
/// A position is a boundary when the byte just before it and the byte at it
/// disagree under [`is_word_char`]. Position `0` and every position at or past
/// the end of `text` always count as boundaries, so an empty `text` yields
/// `true` for any `pos`.
#[inline]
pub fn is_word_boundary(text: &[u8], pos: usize) -> bool {
    if pos == 0 {
        // Nothing precedes the first byte, so the start is always a boundary.
        return true;
    }
    match (text.get(pos - 1), text.get(pos)) {
        (Some(&before), Some(&at)) => is_word_char(before) != is_word_char(at),
        // `pos` is at or beyond the end of the slice.
        _ => true,
    }
}
/// Returns `true` for ASCII letters, ASCII digits, and the underscore.
///
/// Every other byte, including all bytes at or above `0x80`, is a non-word byte.
#[inline]
pub const fn is_word_char(c: u8) -> bool {
    c == b'_' || c.is_ascii_alphanumeric()
}
/// Returns `true` for the six ASCII whitespace bytes: space, horizontal tab,
/// line feed, vertical tab, form feed, and carriage return.
///
/// Unlike [`u8::is_ascii_whitespace`], vertical tab (`0x0B`) is accepted here.
#[inline]
pub const fn is_whitespace(c: u8) -> bool {
    // Tab through carriage return are the contiguous bytes 0x09..=0x0D.
    matches!(c, b' ' | b'\t'..=b'\r')
}
/// Returns `true` for ASCII punctuation and symbol bytes, except the underscore.
///
/// The accepted bytes are `!` through `/`, `:` through `@`, `[` through `^`,
/// the backtick, and `{` through `~`. The underscore (`0x5F`) is deliberately
/// left out of the set; [`is_word_char`] classifies it as a word byte.
#[inline]
pub const fn is_punctuation(c: u8) -> bool {
    matches!(
        c,
        b'!'..=b'/' | b':'..=b'@' | b'['..=b'^' | b'`' | b'{'..=b'~'
    )
}
/// Lists every offset at which the word/non-word classification of `text` flips.
///
/// The result always begins with `0`. For non-empty input it then holds, in
/// ascending order, each index `i` where `text[i - 1]` and `text[i]` differ
/// under [`is_word_char`], and it ends with `text.len()`. Empty input yields
/// just `[0]`.
pub fn find_word_boundaries(text: &[u8]) -> Vec<usize> {
    let mut offsets = vec![0];
    offsets.extend(
        text.windows(2)
            .enumerate()
            .filter(|(_, pair)| is_word_char(pair[0]) != is_word_char(pair[1]))
            // Window `i` covers bytes `i` and `i + 1`, so the flip sits at `i + 1`.
            .map(|(i, _)| i + 1),
    );
    if !text.is_empty() {
        offsets.push(text.len());
    }
    offsets
}
/// Iterator over the maximal runs of word bytes in a byte slice.
///
/// Each item is a sub-slice of the original text made up solely of bytes for
/// which [`is_word_char`] returns `true`; the bytes between runs are skipped.
/// Once the iterator has returned `None` it keeps returning `None`.
#[derive(Debug, Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct WordIterator<'a> {
    /// The full input being scanned.
    text: &'a [u8],
    /// Offset of the next byte that has not been examined yet.
    pos: usize,
}

impl<'a> WordIterator<'a> {
    /// Creates an iterator positioned at the start of `text`.
    pub fn new(text: &'a [u8]) -> Self {
        Self { text, pos: 0 }
    }
}

impl<'a> Iterator for WordIterator<'a> {
    type Item = &'a [u8];

    /// Skips any non-word bytes, then yields the next run of word bytes.
    ///
    /// Returns `None` when no word byte remains.
    fn next(&mut self) -> Option<Self::Item> {
        // Copy the reference out so the yielded slice borrows for `'a`
        // rather than for the duration of `&mut self`.
        let text = self.text;
        let rest = text.get(self.pos..)?;

        let start = match rest.iter().position(|&b| is_word_char(b)) {
            Some(skipped) => self.pos + skipped,
            None => {
                // Only separators remain; park at the end so later calls
                // return immediately.
                self.pos = text.len();
                return None;
            }
        };

        let run = text[start..]
            .iter()
            .take_while(|&&b| is_word_char(b))
            .count();
        self.pos = start + run;
        Some(&text[start..self.pos])
    }
}

// After exhaustion `pos` rests at the end of `text`, so `next` can never yield again.
impl std::iter::FusedIterator for WordIterator<'_> {}
pub fn words(text: &[u8]) -> WordIterator<'_> {
WordIterator::new(text)
}
pub fn word_count(text: &[u8]) -> usize {
words(text).count()
}
/// Finds the word run that contains the byte at `pos`.
///
/// Returns the half-open byte range `(start, end)` of the run, so that
/// `&text[start..end]` is the whole word. Returns `None` when `pos` is out of
/// bounds or the byte at `pos` is not a word byte under [`is_word_char`].
pub fn word_at_position(text: &[u8], pos: usize) -> Option<(usize, usize)> {
    match text.get(pos) {
        Some(&here) if is_word_char(here) => {}
        _ => return None,
    }

    // The word begins right after the last separator before `pos`, or at 0.
    let start = text[..pos]
        .iter()
        .rposition(|&b| !is_word_char(b))
        .map_or(0, |sep| sep + 1);

    // The word ends at the first separator at or after `pos`, or at the end.
    let end = text[pos..]
        .iter()
        .position(|&b| !is_word_char(b))
        .map_or(text.len(), |offset| pos + offset);

    Some((start, end))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Letters, digits and `_` are word bytes; separators and non-ASCII bytes are not.
    #[test]
    fn test_is_word_char() {
        assert!(is_word_char(b'a'));
        assert!(is_word_char(b'z'));
        assert!(is_word_char(b'A'));
        assert!(is_word_char(b'Z'));
        assert!(is_word_char(b'0'));
        assert!(is_word_char(b'9'));
        assert!(is_word_char(b'_'));
        assert!(!is_word_char(b' '));
        assert!(!is_word_char(b'-'));
        assert!(!is_word_char(b'.'));
        assert!(!is_word_char(b','));
        assert!(!is_word_char(b'!'));
        // Bytes outside the ASCII range never count as word bytes.
        assert!(!is_word_char(0x80));
        assert!(!is_word_char(0xFF));
    }

    /// All six accepted whitespace bytes are covered, including the two
    /// that are written as raw hex in the implementation.
    #[test]
    fn test_is_whitespace() {
        assert!(is_whitespace(b' '));
        assert!(is_whitespace(b'\t'));
        assert!(is_whitespace(b'\n'));
        assert!(is_whitespace(b'\r'));
        assert!(is_whitespace(0x0B)); // vertical tab
        assert!(is_whitespace(0x0C)); // form feed
        assert!(!is_whitespace(b'a'));
        assert!(!is_whitespace(b'0'));
    }

    /// Punctuation excludes letters, digits, spaces, and the underscore.
    #[test]
    fn test_is_punctuation() {
        assert!(is_punctuation(b'.'));
        assert!(is_punctuation(b','));
        assert!(is_punctuation(b'!'));
        assert!(is_punctuation(b'?'));
        assert!(is_punctuation(b'"'));
        assert!(!is_punctuation(b'a'));
        assert!(!is_punctuation(b'0'));
        assert!(!is_punctuation(b' '));
        // `_` is a word byte, so it is not reported as punctuation.
        assert!(!is_punctuation(b'_'));
    }

    #[test]
    fn test_is_word_boundary() {
        let text = b"hello world";
        // The start and every position at or past the end are boundaries.
        assert!(is_word_boundary(text, 0));
        assert!(is_word_boundary(text, 11));
        assert!(is_word_boundary(text, 20));
        // Both edges of the single space between the two words.
        assert!(is_word_boundary(text, 5));
        assert!(is_word_boundary(text, 6));
        // Positions inside a word are not boundaries.
        assert!(!is_word_boundary(text, 1));
        assert!(!is_word_boundary(text, 7));
    }

    #[test]
    fn test_is_word_boundary_empty() {
        assert!(is_word_boundary(b"", 0));
        assert!(is_word_boundary(b"", 5));
    }

    #[test]
    fn test_find_word_boundaries() {
        let text = b"hello world";
        let boundaries = find_word_boundaries(text);
        assert_eq!(boundaries, vec![0, 5, 6, 11]);
    }

    #[test]
    fn test_find_word_boundaries_multiple() {
        let text = b"a-b-c";
        let boundaries = find_word_boundaries(text);
        assert_eq!(boundaries, vec![0, 1, 2, 3, 4, 5]);
    }

    #[test]
    fn test_find_word_boundaries_empty() {
        let boundaries = find_word_boundaries(b"");
        assert_eq!(boundaries, vec![0]);
    }

    /// Inputs with no internal transition report only the start and the end.
    #[test]
    fn test_find_word_boundaries_no_transition() {
        assert_eq!(find_word_boundaries(b"a"), vec![0, 1]);
        assert_eq!(find_word_boundaries(b"--"), vec![0, 2]);
    }

    #[test]
    fn test_words_iterator() {
        let text = b"hello, world! test_123";
        let word_list: Vec<_> = words(text).collect();
        assert_eq!(word_list.len(), 3);
        assert_eq!(word_list[0], b"hello");
        assert_eq!(word_list[1], b"world");
        assert_eq!(word_list[2], b"test_123");
    }

    #[test]
    fn test_words_empty() {
        let word_list: Vec<_> = words(b"").collect();
        assert!(word_list.is_empty());
    }

    #[test]
    fn test_words_only_delimiters() {
        let word_list: Vec<_> = words(b" ,,, ").collect();
        assert!(word_list.is_empty());
    }

    /// Calling `next` again after exhaustion keeps returning `None`.
    #[test]
    fn test_words_stays_exhausted() {
        let mut iter = words(b"ab ");
        assert!(iter.next().is_some());
        assert!(iter.next().is_none());
        assert!(iter.next().is_none());
    }

    #[test]
    fn test_word_count() {
        assert_eq!(word_count(b"hello world"), 2);
        assert_eq!(word_count(b"one-two-three"), 3);
        assert_eq!(word_count(b" spaced out "), 2);
        assert_eq!(word_count(b""), 0);
        assert_eq!(word_count(b"single"), 1);
    }

    #[test]
    fn test_word_at_position() {
        let text = b"hello world";
        assert_eq!(word_at_position(text, 0), Some((0, 5)));
        assert_eq!(word_at_position(text, 2), Some((0, 5)));
        assert_eq!(word_at_position(text, 4), Some((0, 5)));
        assert_eq!(word_at_position(text, 6), Some((6, 11)));
        assert_eq!(word_at_position(text, 8), Some((6, 11)));
        assert_eq!(word_at_position(text, 10), Some((6, 11)));
        // On the separator and past the end there is no word.
        assert_eq!(word_at_position(text, 5), None);
        assert_eq!(word_at_position(text, 20), None);
    }

    #[test]
    fn test_word_at_position_underscore() {
        let text = b"test_word";
        assert_eq!(word_at_position(text, 4), Some((0, 9)));
    }

    #[test]
    fn test_word_at_position_empty() {
        assert_eq!(word_at_position(b"", 0), None);
    }
}