regex_chunker/
lib.rs

1#![cfg_attr(docsrs, feature(doc_cfg))]
2
3/*!
4The centerpiece of this crate is the [`ByteChunker`], which takes a regular
5expression and wraps a [`Read`](std::io::Read) type, becoming an iterator
6over the bytes read from the wrapped type, yielding chunks delimited by
7the supplied regular expression.
8
9The example program below uses a `ByteChunker` to do a crude word
10tally on text coming in on the standard input.
11
12```rust
13use std::{collections::BTreeMap, error::Error};
14use regex_chunker::ByteChunker;
15  
16fn main() -> Result<(), Box<dyn Error>> {
17    let mut counts: BTreeMap<String, usize> = BTreeMap::new();
18    let stdin = std::io::stdin();
19    
20    // The regex is a stab at something matching strings of
21    // "between-word" characters in general English text.
22    let chunker = ByteChunker::new(stdin, r#"[ "\r\n.,!?:;/]+"#)?;
23    for chunk in chunker {
24        let word = String::from_utf8_lossy(&chunk?).to_lowercase();
25        *counts.entry(word).or_default() += 1;
26    }
27
28    println!("{:#?}", &counts);
29    Ok(())
30}
31```
32
33Enabling the `async` feature also exposes the [`stream`] module, which
34features an async version of the `ByteChunker`, wrapping an
35[`AsyncRead`](https://docs.rs/tokio/latest/tokio/io/trait.AsyncRead.html)
36and implementing
37[`Stream`](https://docs.rs/futures-core/0.3.28/futures_core/stream/trait.Stream.html).
38
39(This also pulls in several crates of
40[`tokio`](https://docs.rs/tokio/latest/tokio/index.html) machinery, which is why
41it's behind a feature flag.)
42*/
43
44pub(crate) mod adapter;
45pub use adapter::*;
46mod base;
47pub use base::*;
48pub(crate) mod ctrl;
49pub use ctrl::*;
50mod custom;
51pub use custom::*;
52mod err;
53pub use err::RcErr;
54#[cfg(any(feature = "async", docsrs))]
55#[cfg_attr(docsrs, doc(cfg(feature = "async")))]
56pub mod stream;
57
58#[cfg(test)]
59pub(crate) mod tests {
60    use super::*;
61
62    use std::{
63        fmt::Debug,
64        fs::File,
65        io::{Cursor, Read, Write},
66    };
67
68    use regex::bytes::Regex;
69
    // Fixture locations and delimiter patterns shared by the tests below.
    // NOTE(review): these are `pub` inside a `pub(crate)` module, presumably
    // so tests in other modules of the crate can reuse them — confirm.

    /// Path of the text fixture read by `basic_bytes` and `basic_string`.
    pub static TEST_PATH: &str = "test/cessen_issue.txt";
    /// Delimiter pattern paired with `TEST_PATH`: a single character in `A`-`Z`.
    pub static TEST_PATT: &str = r#"[A-Z]"#;
    /// Path of the text fixture read by `bytes_append_prepend`.
    pub static PASSWD_PATH: &str = "test/passwd.txt";
    /// Delimiter pattern paired with `PASSWD_PATH`: one or more of `:`, CR, LF.
    pub static PASSWD_PATT: &str = r#"[:\r\n]+"#;
    /// URL fetched by `bytes_http_request`.
    pub static HTTP_URL: &str = "https://www.zombo.com";
    /// Delimiter pattern paired with `HTTP_URL`: a `>` followed by any run of
    /// characters other than `<`.
    pub static HTTP_PATT: &str = r#">[^<]*"#;
76
77    pub fn chunk_vec<'a>(re: &Regex, v: &'a [u8], mode: MatchDisposition) -> Vec<&'a [u8]> {
78        let mut u: Vec<&[u8]> = Vec::new();
79        let mut offs: usize = 0;
80        let mut prev_offs: usize = 0;
81        while let Some(m) = re.find_at(v, offs) {
82            let (start, end) = match mode {
83                MatchDisposition::Drop => {
84                    let start = offs;
85                    offs = m.end();
86                    (start, m.start())
87                }
88                MatchDisposition::Append => {
89                    let start = offs;
90                    offs = m.end();
91                    (start, m.end())
92                }
93                MatchDisposition::Prepend => {
94                    let start = prev_offs;
95                    offs = m.end();
96                    prev_offs = m.start();
97                    (start, m.start())
98                }
99            };
100
101            u.push(&v[start..end]);
102        }
103
104        match mode {
105            MatchDisposition::Drop | MatchDisposition::Append => {
106                u.push(&v[offs..]);
107            }
108            MatchDisposition::Prepend => {
109                u.push(&v[prev_offs..]);
110            }
111        }
112
113        u
114    }
115
116    pub fn ref_slice_cmp<R, S>(a: &[R], b: &[S])
117    where
118        R: AsRef<[u8]> + Debug,
119        S: AsRef<[u8]> + Debug,
120    {
121        for (aref, bref) in a.iter().zip(b.iter()) {
122            assert_eq!(
123                aref.as_ref(),
124                bref.as_ref(),
125                "left: {:?}\nright: {:?}\n",
126                &String::from_utf8_lossy(aref.as_ref()),
127                &String::from_utf8_lossy(bref.as_ref())
128            );
129        }
130    }
131
132    #[test]
133    fn basic_bytes() {
134        let byte_vec = std::fs::read(TEST_PATH).unwrap();
135        let re = Regex::new(TEST_PATT).unwrap();
136        let slice_vec = chunk_vec(&re, &byte_vec, MatchDisposition::Drop);
137
138        let f = File::open(TEST_PATH).unwrap();
139        let chunker = ByteChunker::new(f, TEST_PATT).unwrap();
140        let vec_vec: Vec<Vec<u8>> = chunker.map(|res| res.unwrap()).collect();
141
142        ref_slice_cmp(&vec_vec, &slice_vec);
143    }
144
145    #[test]
146    fn bytes_append_prepend() {
147        let byte_vec = std::fs::read(PASSWD_PATH).unwrap();
148        let re = Regex::new(PASSWD_PATT).unwrap();
149        let slice_vec = chunk_vec(&re, &byte_vec, MatchDisposition::Append);
150
151        let vec_vec: Vec<Vec<u8>> = ByteChunker::new(File::open(PASSWD_PATH).unwrap(), PASSWD_PATT)
152            .unwrap()
153            .with_match(MatchDisposition::Append)
154            .map(|res| res.unwrap())
155            .collect();
156
157        ref_slice_cmp(&vec_vec, &slice_vec);
158
159        let slice_vec = chunk_vec(&re, &byte_vec, MatchDisposition::Prepend);
160
161        let vec_vec: Vec<Vec<u8>> = ByteChunker::new(File::open(PASSWD_PATH).unwrap(), PASSWD_PATT)
162            .unwrap()
163            .with_match(MatchDisposition::Prepend)
164            .map(|res| res.unwrap())
165            .collect();
166
167        ref_slice_cmp(&vec_vec, &slice_vec);
168    }
169
170    #[test]
171    fn bytes_http_request() {
172        use reqwest::blocking::Client;
173
174        let re = Regex::new(HTTP_PATT).unwrap();
175        let client = Client::new();
176        let re_response = client.get(HTTP_URL).send().unwrap().bytes().unwrap();
177        let slice_vec = chunk_vec(&re, &re_response, MatchDisposition::Drop);
178
179        let ch_response = client.get(HTTP_URL).send().unwrap();
180        let chunker = ByteChunker::new(ch_response, HTTP_PATT).unwrap();
181        let vec_vec: Vec<Vec<u8>> = chunker.map(|res| res.unwrap()).collect();
182
183        ref_slice_cmp(&vec_vec, &slice_vec);
184    }
185
186    #[cfg(unix)]
187    #[test]
188    fn random_bytes() {
189        let re_text = r#"[0-9]"#;
190        let source_path = "/dev/urandom";
191        const N_BYTES: usize = 1024 * 1024;
192        let file_path = "test/random.dat";
193
194        let byte_vec = {
195            let mut source = File::open(source_path).unwrap();
196            let mut buff: Vec<u8> = vec![0; N_BYTES];
197            source.read_exact(&mut buff).unwrap();
198            let mut dest = File::create(file_path).unwrap();
199            dest.write_all(&buff).unwrap();
200            dest.flush().unwrap();
201            buff
202        };
203
204        let re = Regex::new(re_text).unwrap();
205        let slice_vec = chunk_vec(&re, &byte_vec, MatchDisposition::Drop);
206
207        let f = File::open(file_path).unwrap();
208        let chunker = ByteChunker::new(f, re_text).unwrap();
209        let vec_vec: Vec<Vec<u8>> = chunker.map(|res| res.unwrap()).collect();
210
211        ref_slice_cmp(&vec_vec, &slice_vec);
212    }
213
214    #[test]
215    fn basic_string() {
216        let byte_vec = std::fs::read(TEST_PATH).unwrap();
217        let re = Regex::new(TEST_PATT).unwrap();
218        let slice_vec = chunk_vec(&re, &byte_vec, MatchDisposition::Drop);
219
220        let f = File::open(TEST_PATH).unwrap();
221        let chunker = ByteChunker::new(f, TEST_PATT)
222            .unwrap()
223            .with_adapter(StringAdapter::default());
224        let vec_vec: Vec<String> = chunker.map(|res| res.unwrap()).collect();
225
226        ref_slice_cmp(&vec_vec, &slice_vec);
227    }
228
229    #[test]
230    fn string_utf8_error() {
231        let bytes: &[u8] = &[130, 15];
232        let mut chunker = ByteChunker::new(Cursor::new(bytes), TEST_PATT)
233            .unwrap()
234            .with_adapter(StringAdapter::default());
235        assert!(matches!(chunker.next(), Some(Err(RcErr::Utf8(_)))));
236
237        let bytes = b"test one two";
238        let mut chunker = ByteChunker::new(Cursor::new(bytes), TEST_PATT)
239            .unwrap()
240            .with_adapter(StringAdapter::default());
241        assert!(matches!(chunker.next(), Some(Ok(_))));
242    }
243}