extract_words/
lib.rs

1//! # extract-words
2//!
3//! Extracts words from text without allocation
4//!
5//! ## Examples
6//!
7//! Iteration through words, discarding punctuation
8//! ```
9//! # use extract_words::extract_words;
10//! let mut words = extract_words("¿Cómo estás?");
11//! assert_eq!(words.next().unwrap(), "Cómo");
12//! assert_eq!(words.next().unwrap(), "estás");
13//! assert!(words.next().is_none());
14//! ```
15//!
16//! Iteration through all entries
17//! ```
18//! # use extract_words::{Entries, Entry};
19//! let mut entries = Entries::new("Bien :)");
20//! assert_eq!(entries.next().unwrap(), Entry::Word("Bien"));
21//! assert_eq!(entries.next().unwrap(), Entry::Other(" :)"));
22//! assert!(entries.next().is_none());
23//! ```
24
25#![warn(clippy::all, missing_docs, nonstandard_style, future_incompatible)]
26
27/// Extracts words from the text discarding punctuation
28pub fn extract_words(text: &str) -> impl Iterator<Item = &str> {
29    Entries::new(text).filter_map(|e| match e {
30        Entry::Word(s) => Some(s),
31        Entry::Other(_) => None,
32    })
33}
34
/// An iterator over text entries
///
/// Yields the text as a sequence of [`Entry`] values: runs of alphanumeric
/// characters become [`Entry::Word`], every other run becomes
/// [`Entry::Other`]. All entries are slices borrowed from the original text.
#[derive(Debug, Clone)]
pub struct Entries<'a> {
    /// The full text being split; yielded entries are slices of it.
    text: &'a str,
    /// Cursor over the characters of `text` together with their byte offsets.
    char_indices: std::str::CharIndices<'a>,
    /// The run that has been started but not yet yielded.
    cur_entry: CurEntry,
}

/// Text entry
///
/// The entry only wraps a borrowed string slice, so it is cheap to copy and
/// can be compared and hashed.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Entry<'a> {
    /// Punctuation, spaces, etc
    Other(&'a str),
    /// Word
    Word(&'a str),
}

/// Scanning state of [`Entries`]: the kind of the run currently in progress
/// and the byte offset in the text at which it started.
#[derive(Debug, Clone, Copy)]
enum CurEntry {
    /// No run is in progress (before the first character, or after the end).
    None,
    /// A non-word run that started at the given byte offset.
    Other(usize),
    /// A word that started at the given byte offset.
    Word(usize),
}
56
57impl<'a> Entries<'a> {
58    /// Creates an iterator over the text entries
59    pub fn new(text: &'a str) -> Self {
60        Entries {
61            text,
62            char_indices: text.char_indices(),
63            cur_entry: CurEntry::None,
64        }
65    }
66}
67
68impl<'a> Iterator for Entries<'a> {
69    type Item = Entry<'a>;
70
71    fn next(&mut self) -> Option<Self::Item> {
72        for (i, c) in self.char_indices.by_ref() {
73            if c.is_alphanumeric() {
74                match self.cur_entry {
75                    CurEntry::None => self.cur_entry = CurEntry::Word(i),
76                    CurEntry::Other(start) => {
77                        self.cur_entry = CurEntry::Word(i);
78                        return Some(Entry::Other(&self.text[start..i]));
79                    }
80                    CurEntry::Word(_) => (),
81                }
82            } else {
83                match self.cur_entry {
84                    CurEntry::None => self.cur_entry = CurEntry::Other(i),
85                    CurEntry::Other(_) => (),
86                    CurEntry::Word(start) => {
87                        self.cur_entry = CurEntry::Other(i);
88                        return Some(Entry::Word(&self.text[start..i]));
89                    }
90                }
91            }
92        }
93
94        match self.cur_entry {
95            CurEntry::None => None,
96            CurEntry::Other(start) => {
97                self.cur_entry = CurEntry::None;
98                if start < self.text.len() {
99                    Some(Entry::Other(&self.text[start..]))
100                } else {
101                    None
102                }
103            }
104            CurEntry::Word(start) => {
105                self.cur_entry = CurEntry::None;
106                if start < self.text.len() {
107                    Some(Entry::Word(&self.text[start..]))
108                } else {
109                    None
110                }
111            }
112        }
113    }
114}
115
116impl<'a> AsRef<str> for Entry<'a> {
117    fn as_ref(&self) -> &str {
118        match self {
119            Entry::Other(s) => s,
120            Entry::Word(s) => s,
121        }
122    }
123}
124
#[cfg(test)]
mod tests {
    use super::extract_words;

    /// Gathers every word of `text` into a vector, in order of appearance.
    fn extract_vec(text: &str) -> Vec<&str> {
        let mut words = Vec::new();
        words.extend(extract_words(text));
        words
    }

    /// An empty text contains no words.
    #[test]
    fn test_empty_string() {
        assert_eq!(extract_vec("").len(), 0);
    }

    /// A text made only of punctuation contains no words.
    #[test]
    fn test_punctuation_only() {
        assert_eq!(extract_vec(".,!?-"), Vec::<&str>::new());
    }

    /// Words are found regardless of which delimiter separates them.
    #[test]
    fn test_mixed_input() {
        let expected = vec!["Hola", "mundo", "Cómo", "estás"];
        assert_eq!(extract_vec("Hola,mundo! ¿Cómo estás?"), expected);
    }

    /// Long delimiter runs are skipped and a trailing word is still found.
    #[test]
    fn test_multiple_delimiters() {
        let expected = vec!["Hola", "mundo", "á"];
        assert_eq!(extract_vec("Hola, mundo!¿ .. !¿á"), expected);
    }
}