regex_chunker/
adapter.rs

1/*!
2The trait used for types that transform the output of a Chunker.
3*/
4use crate::{
5    ctrl::Utf8FailureMode,
6    RcErr,
7};
8
9/**
10Trait used to implement a [`CustomChunker`](crate::CustomChunker) by
11transforming the output of a [`ByteChunker`](crate::ByteChunker).
12
13This is more powerful than simply calling 
14[`.map()`](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map),
15[`.map_while()`](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map_while),
16or [`.filter_map()`](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.filter_map)
17on a `ByteChunker` because the type implementing `Adapter` can be _stateful_.
18
19The example below shows a struct implementing `Adapter` to count the number of
20chunks returned so far.
21
22```rust
23use regex_chunker::{Adapter, ByteChunker, RcErr};
24use std::io::Cursor;
25
26struct ChunkCounter {
27    lines: usize,
28}
29
30impl Adapter for ChunkCounter {
31    type Item = Result<Vec<u8>, RcErr>;
32
33    fn adapt(&mut self, v: Option<Result<Vec<u8>, RcErr>>) -> Option<Self::Item> {
34        match v {
35            Some(Ok(v)) => {
36                self.lines += 1;
37                Some(Ok(v))
38            },
39            x => x,
40        }
41    }
42}
43
44let text =
45br#"What's he that wishes so?
46My cousin Westmoreland? No, my fair cousin:
47If we are mark'd to die, we are enow
48To do our country loss; and if to live,
49The fewer men, the greater share of honour."#;
50
51let c = Cursor::new(text);
52
53let mut chunker = ByteChunker::new(c, r#"\r?\n"#)?
54    .with_adapter(ChunkCounter { lines: 0 });
55
56let _: Vec<String> = (&mut chunker).map(|res| {
57    let v: Vec<u8> = res.unwrap();
58    String::from_utf8(v).unwrap()
59}).collect();
60
61// Prints "5".
62println!("{}", &chunker.get_adapter().lines);
63# Ok::<(), RcErr>(())
64```
65
66*/
67pub trait Adapter {
68    /// The type into which it transforms the values returned by the
69    /// [`ByteChunker`](crate::ByteChunker)'s `Iterator` implementation.
70    type Item;
71
72    /// Convert the `ByteChunker`'s output.
73    fn adapt(&mut self, v: Option<Result<Vec<u8>, RcErr>>) -> Option<Self::Item>;
74}
75
76/**
77Simpler, less flexible, version of the [`Adapter`] trait.
78
79Can be used in situations where it suffices to just pass `None` and `Err()`
80values through and only operate when the inner
81[`ByteChunker`](crate::ByteChunker)'s `.next()` returns `Some(Ok(vec))`.
82
83This is less powerful than just using
84[`.map()`](https://doc.rust-lang.org/std/iter/trait.Iterator.html#method.map),
_et al._, but simpler because there's no error handling required by
86the custom type.
87
88The [`StringAdapter`] type tracks error status, but we can implement a
89simpler type that just performs lossy UTF-8 conversion.
90
91```rust
92# use regex_chunker::RcErr;
93use regex_chunker::{ByteChunker, SimpleAdapter};
94use std::io::Cursor;
95
96struct LossyStringAdapter {}
97
98impl SimpleAdapter for LossyStringAdapter {
99    type Item = String;
100
101    fn adapt(&mut self, v: Vec<u8>) -> Self::Item {
102        String::from_utf8_lossy(&v).into()
103    }
104}
105
106let text = b"One, two, three four. Can I have a little more?";
107let c = Cursor::new(text);
108
109let chunks: Vec<_> = ByteChunker::new(c, "[ .,?]+")?
110    .with_simple_adapter(LossyStringAdapter{})
111    .map(|res| res.unwrap())
112    .collect();
113
114assert_eq!(
115    &chunks,
116    &["One", "two", "three", "four", "Can", "I", "have", "a", "little", "more"].clone()
117);
118# Ok::<(), RcErr>(())
```
121*/
pub trait SimpleAdapter {
    /// Result of converting one `Vec<u8>` that the underlying
    /// [`ByteChunker`](crate::ByteChunker)'s `Iterator` implementation
    /// produced successfully.
    type Item;

    /// Convert one _successfully produced_ chunk of bytes.
    ///
    /// Takes ownership of the chunk `v`; `&mut self` lets the implementor
    /// keep state between calls.
    fn adapt(&mut self, v: Vec<u8>) -> Self::Item;
}
131
/// Internal state of a `StringAdapter`: which UTF-8 failure policy it
/// follows and, under the fatal policy, whether a failure has already
/// been reported.
///
/// `Eq` is derived alongside `PartialEq` rather than implemented by hand.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
enum Utf8ErrorStatus {
    /// Fatal policy; no invalid UTF-8 has been encountered so far.
    #[default]
    Ok,
    /// Fatal policy after an invalid chunk was reported; the adapter
    /// yields nothing further.
    Errored,
    /// Chunks are converted lossily instead of reporting invalid UTF-8.
    Lossy,
    /// Each invalid chunk is reported as an error and adaptation goes on.
    Continue,
}
141
142/**
143An example [`Adapter`] type for producing a chunker that yields `String`s.
144
145```rust
146# use std::error::Error;
147# fn main() -> Result<(), Box<dyn Error>> {
148    use regex_chunker::{ByteChunker, StringAdapter};
149    use std::io::Cursor;
150
151    let text = b"One, two, three four. Can I have a little more?";
152    let c = Cursor::new(text);
153
154    let chunks: Vec<_> = ByteChunker::new(c, "[ .,?]+")?
155        .with_adapter(StringAdapter::default())
156        .map(|res| res.unwrap())
157        .collect();
158
159    assert_eq!(
160        &chunks,
161        &[
162            "One", "two", "three", "four",
163            "Can", "I", "have", "a", "little", "more"
164        ].clone()
165    );
166#   Ok(()) }
167```
168
169*/
170#[derive(Debug, Default)]
171pub struct StringAdapter {
172    status: Utf8ErrorStatus,
173}
174
175impl StringAdapter {
176    pub fn new(mode: Utf8FailureMode) -> Self {
177        let status = match mode {
178            Utf8FailureMode::Fatal => Utf8ErrorStatus::Ok,
179            Utf8FailureMode::Lossy => Utf8ErrorStatus::Lossy,
180            Utf8FailureMode::Continue => Utf8ErrorStatus::Continue,
181        };
182
183        Self { status }
184    }
185}
186
187impl Adapter for StringAdapter {
188    type Item = Result<String, RcErr>;
189
190    fn adapt(&mut self, v: Option<Result<Vec<u8>, RcErr>>) -> Option<Self::Item> {
191        match (self.status, v) {
192            (Utf8ErrorStatus::Errored, _) => None,
193            (_, None) => None,
194            (_, Some(Err(e))) => Some(Err(e)),
195            (Utf8ErrorStatus::Lossy, Some(Ok(v))) =>
196                Some(Ok(String::from_utf8_lossy(&v).into())),
197            (Utf8ErrorStatus::Ok, Some(Ok(v))) => match String::from_utf8(v) {
198                Ok(s) => Some(Ok(s)),
199                Err(e) => {
200                    self.status = Utf8ErrorStatus::Errored;
201                    Some(Err(e.into()))
202                },
203            },
204            (Utf8ErrorStatus::Continue, Some(Ok(v))) => match String::from_utf8(v) {
205                Ok(s) => Some(Ok(s)),
206                Err(e) => Some(Err(e.into())),
207            }
208        }
209    }
210}