regex_chunker/adapter.rs
/*!
The traits used for types that transform the output of a chunker, plus
[`StringAdapter`], an example implementation.
*/
4use crate::{
5 ctrl::Utf8FailureMode,
6 RcErr,
7};
8
9/**
10Trait used to implement a [`CustomChunker`](crate::CustomChunker) by
11transforming the output of a [`ByteChunker`](crate::ByteChunker).
12
13This is more powerful than simply calling
14[`.map()`](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map),
15[`.map_while()`](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map_while),
16or [`.filter_map()`](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.filter_map)
17on a `ByteChunker` because the type implementing `Adapter` can be _stateful_.
18
19The example below shows a struct implementing `Adapter` to count the number of
20chunks returned so far.
21
22```rust
23use regex_chunker::{Adapter, ByteChunker, RcErr};
24use std::io::Cursor;
25
26struct ChunkCounter {
27 lines: usize,
28}
29
30impl Adapter for ChunkCounter {
31 type Item = Result<Vec<u8>, RcErr>;
32
33 fn adapt(&mut self, v: Option<Result<Vec<u8>, RcErr>>) -> Option<Self::Item> {
34 match v {
35 Some(Ok(v)) => {
36 self.lines += 1;
37 Some(Ok(v))
38 },
39 x => x,
40 }
41 }
42}
43
44let text =
45br#"What's he that wishes so?
46My cousin Westmoreland? No, my fair cousin:
47If we are mark'd to die, we are enow
48To do our country loss; and if to live,
49The fewer men, the greater share of honour."#;
50
51let c = Cursor::new(text);
52
53let mut chunker = ByteChunker::new(c, r#"\r?\n"#)?
54 .with_adapter(ChunkCounter { lines: 0 });
55
56let _: Vec<String> = (&mut chunker).map(|res| {
57 let v: Vec<u8> = res.unwrap();
58 String::from_utf8(v).unwrap()
59}).collect();
60
61// Prints "5".
62println!("{}", &chunker.get_adapter().lines);
63# Ok::<(), RcErr>(())
64```
65
66*/
67pub trait Adapter {
68 /// The type into which it transforms the values returned by the
69 /// [`ByteChunker`](crate::ByteChunker)'s `Iterator` implementation.
70 type Item;
71
72 /// Convert the `ByteChunker`'s output.
73 fn adapt(&mut self, v: Option<Result<Vec<u8>, RcErr>>) -> Option<Self::Item>;
74}
75
/**
Simpler, less flexible version of the [`Adapter`] trait.

Can be used in situations where it suffices to just pass `None` and `Err()`
values through and only operate when the inner
[`ByteChunker`](crate::ByteChunker)'s `.next()` returns `Some(Ok(vec))`.

This is less powerful than just using
[`.map()`](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map),
_et al._, but simpler because there's no error handling required by
the custom type.

The [`StringAdapter`] type tracks error status, but we can implement a
simpler type that just performs lossy UTF-8 conversion.

```rust
# use regex_chunker::RcErr;
use regex_chunker::{ByteChunker, SimpleAdapter};
use std::io::Cursor;

struct LossyStringAdapter {}

impl SimpleAdapter for LossyStringAdapter {
    type Item = String;

    fn adapt(&mut self, v: Vec<u8>) -> Self::Item {
        String::from_utf8_lossy(&v).into()
    }
}

let text = b"One, two, three four. Can I have a little more?";
let c = Cursor::new(text);

let chunks: Vec<_> = ByteChunker::new(c, "[ .,?]+")?
    .with_simple_adapter(LossyStringAdapter{})
    .map(|res| res.unwrap())
    .collect();

assert_eq!(
    &chunks,
    &["One", "two", "three", "four", "Can", "I", "have", "a", "little", "more"].clone()
);
# Ok::<(), RcErr>(())
```
*/
pub trait SimpleAdapter {
    /// The type into which it converts the `Vec<u8>`s successfully produced
    /// by the underlying [`ByteChunker`](crate::ByteChunker)'s `Iterator`
    /// implementation.
    type Item;

    /// Convert the `ByteChunker`'s output when _successful_.
    ///
    /// `v` is the chunk's bytes, passed by value; the return value is the
    /// converted item. The method takes `&mut self`, so an implementation
    /// may update its own state on each call.
    fn adapt(&mut self, v: Vec<u8>) -> Self::Item;
}
131
/// Internal decoding state of a [`StringAdapter`].
///
/// It records both the configured reaction to invalid UTF-8 and, for the
/// strict mode, whether a failure has already been reported.
// `Eq` is derived together with `PartialEq` instead of being implemented by
// hand; the enum has only unit variants, so the derive is always valid.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
enum Utf8ErrorStatus {
    /// Strict decoding with no failure so far. This is the default state.
    #[default]
    Ok,
    /// Strict decoding has failed once; the adapter yields nothing further.
    Errored,
    /// Invalid sequences are converted lossily instead of being reported.
    Lossy,
    /// Decoding failures are reported for the offending chunk only.
    Continue,
}
141
142/**
143An example [`Adapter`] type for producing a chunker that yields `String`s.
144
145```rust
146# use std::error::Error;
147# fn main() -> Result<(), Box<dyn Error>> {
148 use regex_chunker::{ByteChunker, StringAdapter};
149 use std::io::Cursor;
150
151 let text = b"One, two, three four. Can I have a little more?";
152 let c = Cursor::new(text);
153
154 let chunks: Vec<_> = ByteChunker::new(c, "[ .,?]+")?
155 .with_adapter(StringAdapter::default())
156 .map(|res| res.unwrap())
157 .collect();
158
159 assert_eq!(
160 &chunks,
161 &[
162 "One", "two", "three", "four",
163 "Can", "I", "have", "a", "little", "more"
164 ].clone()
165 );
166# Ok(()) }
167```
168
169*/
170#[derive(Debug, Default)]
171pub struct StringAdapter {
172 status: Utf8ErrorStatus,
173}
174
175impl StringAdapter {
176 pub fn new(mode: Utf8FailureMode) -> Self {
177 let status = match mode {
178 Utf8FailureMode::Fatal => Utf8ErrorStatus::Ok,
179 Utf8FailureMode::Lossy => Utf8ErrorStatus::Lossy,
180 Utf8FailureMode::Continue => Utf8ErrorStatus::Continue,
181 };
182
183 Self { status }
184 }
185}
186
187impl Adapter for StringAdapter {
188 type Item = Result<String, RcErr>;
189
190 fn adapt(&mut self, v: Option<Result<Vec<u8>, RcErr>>) -> Option<Self::Item> {
191 match (self.status, v) {
192 (Utf8ErrorStatus::Errored, _) => None,
193 (_, None) => None,
194 (_, Some(Err(e))) => Some(Err(e)),
195 (Utf8ErrorStatus::Lossy, Some(Ok(v))) =>
196 Some(Ok(String::from_utf8_lossy(&v).into())),
197 (Utf8ErrorStatus::Ok, Some(Ok(v))) => match String::from_utf8(v) {
198 Ok(s) => Some(Ok(s)),
199 Err(e) => {
200 self.status = Utf8ErrorStatus::Errored;
201 Some(Err(e.into()))
202 },
203 },
204 (Utf8ErrorStatus::Continue, Some(Ok(v))) => match String::from_utf8(v) {
205 Ok(s) => Some(Ok(s)),
206 Err(e) => Some(Err(e.into())),
207 }
208 }
209 }
210}