yaml_split/
lib.rs

1use std::io::{BufRead, BufReader, Read};
2use thiserror::Error;
3
4#[derive(Error, Debug)]
5pub enum YamlSplitError {
6    #[error(transparent)]
7    IOError(#[from] std::io::Error),
8}
9
/// An iterator that splits a YAML file or stream into its individual documents.
///
/// A single stream may hold several YAML documents. For example, this input contains two:
/// ```yaml
/// hello: world
/// ---
/// hello: rust
/// ```
///
/// The first item produced by the iterator is:
/// ```yaml
/// hello: world
/// ```
///
/// and the second one is (the directives end marker `---` stays with the document it opens):
/// ```yaml
/// ---
/// hello: rust
/// ```
///
/// Every item is a `Result`. `Ok` carries the text of one document, ready to be handed to
/// `serde-yaml`, `yaml-rust` or a similar parser. `Err` carries a failure to read from the
/// underlying reader, so errors can be handled item by item when processing lots of files.
///
/// # Examples
///
/// Splitting the contents of a file:
///
/// ```
/// use std::fs::File;
/// # use std::fs::remove_file;
/// use yaml_split::DocumentIterator;
/// # use std::io::Write;
/// File::options()
///     .create(true)
///     .write(true)
///     .open("test.yaml")
///     .unwrap()
///     .write_all(b"hello: world")
///     .unwrap();
///
/// let read_file = File::open("test.yaml").unwrap();
/// let doc_iter = DocumentIterator::new(read_file);
///
/// for doc in doc_iter {
///     println!("{}", doc.unwrap());
/// }
///
/// # remove_file("test.yaml").unwrap();
/// ```
///
/// Less common parts of the YAML spec are handled as well: directives, comments and
/// document end markers (`...`, which are not included in the output).
///
/// ```
/// use yaml_split::DocumentIterator;
///
/// let input = r#"
///
/// ## a header comment
/// %YAML 1.2
/// ---
/// hello: world
/// ...
/// ---
/// hello: rust
/// ---
/// ## a body comment
/// hello: everyone
/// "#;
///
/// let mut doc_iter = DocumentIterator::new(input.as_bytes());
///
/// assert_eq!(r#"
///
/// ## a header comment
/// %YAML 1.2
/// ---
/// hello: world
/// "#, doc_iter.next().unwrap().unwrap());
/// assert_eq!(r#"---
/// hello: rust
/// "#, doc_iter.next().unwrap().unwrap());
/// assert_eq!(r#"---
/// ## a body comment
/// hello: everyone
/// "#, doc_iter.next().unwrap().unwrap());
/// ```
pub struct DocumentIterator<R: Read> {
    /// Buffered wrapper around the caller's reader; the stream is consumed line by line.
    reader: BufReader<R>,
    /// `true` once a non-blank, non-comment line has shown whether the lines being read
    /// belong to a directive header or to a bare document.
    disambiguated: bool,
    /// `true` while the lines being read are treated as a document header, i.e.
    /// directives that precede a `---` marker.
    in_header: bool,
    /// A `---` line that ended the previous document. It opens the next document, so it
    /// is kept here and put at the front of the next item.
    prepend_next: Option<String>,
}
102
103impl<'a, R: Read + 'a> DocumentIterator<R> {
104    /// `new()` creates a new DocumentIterator over a given `reader`'s contents.
105    ///
106    /// This reader can be a reader for a file:
107    /// ```
108    /// use std::fs::File;
109    /// # use std::fs::remove_file;
110    /// use yaml_split::DocumentIterator;
111    /// # use std::io::Write;
112    /// let mut file = File::options()
113    ///     .create(true)
114    ///     .write(true)
115    ///     .open("test.yml")
116    ///     .unwrap()
117    ///     .write(b"hello: world");
118    ///
119    /// let read_file = File::open("test.yml").unwrap();
120    /// let doc_iter = DocumentIterator::new(read_file);
121    ///
122    /// for doc in doc_iter {
123    ///     println!("{}", doc.unwrap());
124    /// }
125    /// # remove_file("test.yml").unwrap();
126    /// ```
127    ///
128    /// Or the reader can be a simple byte array (useful for strings):
129    /// ```
130    /// use yaml_split::DocumentIterator;
131    /// let yaml = r#"
132    /// hello: world
133    /// ---
134    /// hello: rust
135    /// "#;
136    ///
137    /// let mut doc_iter = DocumentIterator::new(yaml.as_bytes());
138    ///
139    /// assert_eq!(r#"
140    /// hello: world
141    /// "#, doc_iter.next().unwrap().unwrap());
142    /// assert_eq!(r#"---
143    /// hello: rust
144    /// "#, doc_iter.next().unwrap().unwrap());
145    /// assert_eq!(true, doc_iter.next().is_none());
146    ///
147    /// // or in a loop:
148    /// for doc in doc_iter {
149    ///     println!("{}", doc.unwrap());
150    /// }
151    /// ```
152    pub fn new(reader: R) -> DocumentIterator<R> {
153        let br = BufReader::new(reader);
154
155        DocumentIterator {
156            reader: br,
157            disambiguated: false,
158            in_header: false,
159            prepend_next: None,
160        }
161    }
162}
163
164impl<R: Read> Iterator for DocumentIterator<R> {
165    type Item = Result<String, YamlSplitError>;
166
167    fn next(&mut self) -> Option<Self::Item> {
168        let mut buf = String::new();
169        let mut current_file = match &self.prepend_next {
170            Some(next) => next.clone(),
171            None => String::new(),
172        };
173        self.prepend_next = None;
174
175        // First, we must disambiguate between a bare document and a directive at the top of the
176        // file (before any directive end "---" markers). To do this, we must look for a #, % or
177        // other non-whitespace character as the first character on a line:
178        //
179        // - # indicates a comment, the line will be ignored
180        // - % indicates a directive, we should assume the rest of the header is also a directive as
181        //    % is not a valid character at the start of a line, before a --- is seen.
182        // - anything else indicates we must currently be looking at a bare document's content
183        //
184        // XXX: This loop also builds up buffers that are shared with the next loop. The reader
185        // is also shared and so the next loop will start off where this one ends and assume the
186        // buffers have the correct content.
187        loop {
188            if self.disambiguated {
189                break;
190            }
191
192            // Empty the buffer. read_line appends, and we don't want that.
193            buf.clear();
194
195            match self.reader.read_line(&mut buf) {
196                Ok(l) => {
197                    if l == 0 {
198                        // We hit EOF already, and it's still not clear
199                        // this file must have only whitespace, comments or be completely empty.
200                        return None;
201                    }
202
203                    for c in buf.chars() {
204                        match c {
205                            // Spaces, tabs and carriage returns don't tell us anything,
206                            // keep searching the line.
207                            ' ' | '\t' | '\r' => continue,
208                            // # means this line is a comment, nothing to do.
209                            // \n is a newline, also nothing to do, this line didn't
210                            // tell us anything.
211                            '#' | '\n' => break,
212                            // % means this line is a directive, we must be in a header
213                            '%' => {
214                                self.disambiguated = true;
215                                self.in_header = true;
216                                break;
217                            }
218                            // anything else must mean we are in a bare document
219                            _ => {
220                                self.disambiguated = true;
221                                self.in_header = false;
222                                break;
223                            }
224                        };
225                    }
226
227                    // Append the current line to the document
228                    current_file = current_file + &buf;
229                }
230                Err(e) => {
231                    return Some(Err(e.into()));
232                }
233            }
234        }
235
236        // Now that we know whether we are starting off in a directive or a document, we can
237        // parse the rest of the YAML. In this loop we will look for the start and end of documents
238        // as our YAML parser does not support parsing multiple documents at once.
239        loop {
240            buf.clear();
241
242            match self.reader.read_line(&mut buf) {
243                Ok(l) => {
244                    let hit_eof = l == 0;
245                    let cf_len = current_file.len();
246
247                    // If there is absolutely nothing to do (i.e. the current file data is empty, and
248                    // we're at EOF), just exit the loop.
249                    if hit_eof && cf_len == 0 {
250                        return None;
251                    }
252
253                    let end_of_doc = buf.starts_with("...");
254                    let directives_end = buf.starts_with("---");
255
256                    if !self.in_header && directives_end {
257                        // a new document has started already.
258                        self.in_header = false;
259                        // to not lose the current line, including any directives that might
260                        // be on the line (after the "---"), we need to prepend it
261                        // the next time someone calls next()
262                        self.prepend_next = Some(buf);
263                        return Some(Ok(current_file));
264                    } else if end_of_doc {
265                        // this document has ended, but we don't need this line.
266                        // the next line must be a header, or "---"
267                        self.in_header = true;
268                        return Some(Ok(current_file));
269                    } else if hit_eof {
270                        // this document has ended, and nothing will follow.
271                        return Some(Ok(current_file));
272                    } else if self.in_header && directives_end {
273                        self.in_header = false;
274                    }
275
276                    current_file = current_file + &buf;
277                }
278                Err(e) => {
279                    return Some(Err(e.into()));
280                }
281            };
282        }
283    }
284}
285
#[cfg(test)]
mod tests {
    use super::DocumentIterator;

    /// Splits `input` and collects every document, panicking on a read error.
    ///
    /// Collecting stops at the iterator's first `None`, so comparing the result with the
    /// expected list also checks that nothing follows the last expected document.
    fn split(input: &str) -> Vec<String> {
        DocumentIterator::new(input.as_bytes())
            .map(|doc| doc.expect("reading valid UTF-8 from memory should not fail"))
            .collect()
    }

    #[test]
    fn bare_document() {
        assert_eq!(split("abc: def"), ["abc: def"]);
    }

    #[test]
    fn document_with_header() {
        let input = "\n---\nabc: def\n";

        assert_eq!(split(input), [input]);
    }

    #[test]
    fn document_with_header_and_directive() {
        let input = "\n%YAML 1.2\n---\nabc: def\n";

        assert_eq!(split(input), [input]);
    }

    #[test]
    fn two_documents() {
        let input = concat!(
            "abc: def\n",
            "---\n",
            "aaa: bbb\n",
        );

        assert_eq!(split(input), ["abc: def\n", "---\naaa: bbb\n"]);
    }

    #[test]
    fn two_documents_with_headers() {
        let input = concat!(
            "%YAML 1.2\n",
            "---\n",
            "abc: def\n",
            "...\n",
            "\n",
            "%YAML 1.2\n",
            "---\n",
            "aaa: bbb\n",
        );

        assert_eq!(
            split(input),
            [
                "%YAML 1.2\n---\nabc: def\n",
                // The blank line after "..." belongs to the second document.
                "\n%YAML 1.2\n---\naaa: bbb\n",
            ]
        );
    }

    #[test]
    fn document_medley() {
        let input = concat!(
            "%YAML 1.2\n",
            "---\n",
            "abc: def\n",
            "---\n",
            "%YAML: \"not a real directive\"\n",
            "---\n",
            "aaa: bbb\n",
            "...\n",
            "---\n",
            "...\n",
            "---\n",
            "final: \"document\"\n",
        );

        assert_eq!(
            split(input),
            [
                "%YAML 1.2\n---\nabc: def\n",
                "---\n%YAML: \"not a real directive\"\n",
                "---\naaa: bbb\n",
                "---\n",
                "---\nfinal: \"document\"\n",
            ]
        );
    }
}
481}