/// A token together with the half-open range `start..end` it was taken from.
///
/// For spans produced by [`token_spans`], `start` and `end` are byte offsets
/// into the tokenized text, so `&text[start..end]` equals `token`.
///
/// `Hash` is derived alongside `Eq` so spans can be stored in hash-based
/// collections such as `HashSet` or used as `HashMap` keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct TokenSpan {
    /// The token text, owned by the span.
    pub token: String,
    /// Offset of the first element of the token (inclusive).
    pub start: usize,
    /// Offset one past the last element of the token (exclusive).
    pub end: usize,
}
impl TokenSpan {
    /// Builds a span from a token and its `start..end` range.
    ///
    /// `token` accepts anything convertible into a `String` (for example a
    /// `&str`). No validation is performed: the range is stored as given.
    pub fn new(token: impl Into<String>, start: usize, end: usize) -> Self {
        Self {
            token: token.into(),
            start,
            end,
        }
    }

    /// Returns the width of the range, i.e. `end - start`.
    ///
    /// For spans produced by `token_spans` this is the token's length in
    /// bytes. The fields are public and `new` does not validate them, so a
    /// span with `end < start` can exist; such a span reports a width of `0`
    /// instead of overflowing (which would panic in debug builds and wrap
    /// around in release builds).
    pub fn len(&self) -> usize {
        self.end.saturating_sub(self.start)
    }

    /// Returns `true` when the span covers nothing (its width is zero).
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
/// Splits `text` into tokens separated by ASCII whitespace and records where
/// each token sits in the input.
///
/// A token is a maximal run of bytes for which `u8::is_ascii_whitespace` is
/// false, so punctuation stays attached to its word and non-ASCII whitespace
/// does not separate tokens. Every returned [`TokenSpan`] carries byte offsets
/// such that `&text[span.start..span.end] == span.token`. Empty or
/// whitespace-only input yields an empty vector.
pub fn token_spans(text: &str) -> Vec<TokenSpan> {
    let mut found = Vec::new();
    // Byte offset where the token currently being scanned began, if any.
    let mut open_at: Option<usize> = None;

    for (pos, byte) in text.bytes().enumerate() {
        match (byte.is_ascii_whitespace(), open_at) {
            // Whitespace right after a token: the token ends here (exclusive).
            (true, Some(begin)) => {
                found.push(TokenSpan::new(&text[begin..pos], begin, pos));
                open_at = None;
            }
            // First non-whitespace byte after a gap: a new token starts.
            (false, None) => open_at = Some(pos),
            // Still inside a token, or still inside a whitespace gap.
            _ => {}
        }
    }

    // A token that runs up to the end of the input has no closing whitespace.
    if let Some(begin) = open_at {
        found.push(TokenSpan::new(&text[begin..], begin, text.len()));
    }

    found
}
/// Returns the number of ASCII-whitespace-separated tokens in `text`.
///
/// The result equals `token_spans(text).len()`, but the tokens are counted
/// directly instead of first materializing a `Vec<TokenSpan>` (one `String`
/// allocation per token). `str::split_ascii_whitespace` splits on the same
/// bytes that `token_spans` treats as separators (`u8::is_ascii_whitespace`)
/// and never yields empty pieces, so leading, trailing and repeated
/// whitespace contribute nothing to the count.
pub fn count_tokens(text: &str) -> usize {
    text.split_ascii_whitespace().count()
}
/// Unit tests for `token_spans`, `count_tokens` and `TokenSpan`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_string_returns_no_spans() {
        assert!(token_spans("").is_empty());
    }

    #[test]
    fn whitespace_only_returns_no_spans() {
        assert!(token_spans(" \t\n ").is_empty());
    }

    #[test]
    fn single_word_produces_one_span() {
        let spans = token_spans("hello");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].token, "hello");
        assert_eq!(spans[0].start, 0);
        assert_eq!(spans[0].end, 5);
    }

    #[test]
    fn two_words_produce_two_spans() {
        let spans = token_spans("hello world");
        assert_eq!(spans.len(), 2);
        assert_eq!(spans[0].token, "hello");
        assert_eq!((spans[0].start, spans[0].end), (0, 5));
        assert_eq!(spans[1].token, "world");
        assert_eq!((spans[1].start, spans[1].end), (6, 11));
    }

    #[test]
    fn leading_whitespace_is_skipped() {
        // Two leading spaces, so the token must start at byte offset 2.
        let spans = token_spans("  word");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].token, "word");
        assert_eq!(spans[0].start, 2);
        assert_eq!(spans[0].end, 6);
    }

    #[test]
    fn trailing_whitespace_is_ignored() {
        let spans = token_spans("word ");
        assert_eq!(spans.len(), 1);
        assert_eq!(spans[0].end, 4);
    }

    #[test]
    fn multiple_spaces_between_words() {
        // A run of several spaces must act as a single separator.
        let spans = token_spans("a   b");
        assert_eq!(spans.len(), 2);
        assert_eq!(spans[0].token, "a");
        assert_eq!(spans[1].token, "b");
        assert_eq!(spans[1].start, 4);
    }

    #[test]
    fn span_byte_offsets_reconstruct_token() {
        let text = "foo bar baz";
        let spans = token_spans(text);
        assert_eq!(spans.len(), 3);
        for span in &spans {
            assert_eq!(&text[span.start..span.end], span.token);
        }
    }

    #[test]
    fn multibyte_tokens_use_byte_offsets() {
        // Each accented letter occupies two bytes in UTF-8.
        let text = "h\u{e9}llo w\u{f6}rld";
        let spans = token_spans(text);
        assert_eq!(spans.len(), 2);
        assert_eq!((spans[0].start, spans[0].end), (0, 6));
        assert_eq!((spans[1].start, spans[1].end), (7, 13));
        for span in &spans {
            assert_eq!(&text[span.start..span.end], span.token);
        }
    }

    #[test]
    fn punctuation_is_treated_as_part_of_token() {
        let spans = token_spans("hello, world!");
        assert_eq!(spans.len(), 2);
        assert_eq!(spans[0].token, "hello,");
        assert_eq!(spans[1].token, "world!");
    }

    #[test]
    fn newlines_and_tabs_split_tokens() {
        let spans = token_spans("a\tb\nc");
        assert_eq!(spans.len(), 3);
    }

    #[test]
    fn count_tokens_matches_span_count() {
        let text = "the quick brown fox";
        assert_eq!(count_tokens(text), token_spans(text).len());
        assert_eq!(count_tokens(text), 4);
    }

    #[test]
    fn count_tokens_empty_is_zero() {
        assert_eq!(count_tokens(""), 0);
    }

    #[test]
    fn span_len_equals_token_byte_length() {
        let spans = token_spans("foo");
        assert_eq!(spans[0].len(), 3);
        assert!(!spans[0].is_empty());
    }
}