yaml_split/lib.rs
1use std::io::{BufRead, BufReader, Read};
2use thiserror::Error;
3
4#[derive(Error, Debug)]
5pub enum YamlSplitError {
6 #[error(transparent)]
7 IOError(#[from] std::io::Error),
8}
9
10/// `DocumentIterator` is an iterator over individual YAML documents in a file or stream.
11///
12/// For example, the following YAML file contains two separate documents:
13/// ```yaml
14/// hello: world
15/// ---
16/// hello: rust
17/// ```
18///
19/// The first item in this iterator will be:
20/// ```yaml
21/// hello: world
22/// ```
23///
24/// The second item will be (the "header" / directives end marker "---" is considered part of the document):
25/// ```yaml
26/// ---
27/// hello: rust
28/// ```
29///
30/// Each item's output will be suitable for passing to `serde-yaml`, `yaml-rust` or
31/// similar libraries for parsing. Each item can also be an error, letting you opt for
32/// safe handling of errors when dealing with lots of files.
33///
34/// ```
35/// use std::fs::File;
36/// # use std::fs::remove_file;
37/// use yaml_split::DocumentIterator;
38/// # use std::io::Write;
39/// let mut file = File::options()
40/// .create(true)
41/// .write(true)
42/// .open("test.yaml")
43/// .unwrap()
44/// .write(b"hello: world");
45///
46/// let read_file = File::open("test.yaml").unwrap();
47/// let doc_iter = DocumentIterator::new(read_file);
48///
49/// for doc in doc_iter {
50/// println!("{}", doc.unwrap());
51/// }
52///
53/// # remove_file("test.yaml").unwrap();
54/// ```
55///
56/// This also correctly handles less common areas of the YAML spec including
57/// directives, comments and document end markers.
58///
59/// ```
60/// use yaml_split::DocumentIterator;
61///
62/// let input = r#"
63///
64/// ## a header comment
65/// %YAML 1.2
66/// ---
67/// hello: world
68/// ...
69/// ---
70/// hello: rust
71/// ---
72/// ## a body comment
73/// hello: everyone
74/// "#;
75///
76/// let mut doc_iter = DocumentIterator::new(input.as_bytes());
77///
78/// assert_eq!(r#"
79///
80/// ## a header comment
81/// %YAML 1.2
82/// ---
83/// hello: world
84/// "#, doc_iter.next().unwrap().unwrap());
85/// assert_eq!(r#"---
86/// hello: rust
87/// "#, doc_iter.next().unwrap().unwrap());
88/// assert_eq!(r#"---
89/// ## a body comment
90/// hello: everyone
91/// "#, doc_iter.next().unwrap().unwrap());
92/// ```
pub struct DocumentIterator<R: Read> {
    /// Buffered wrapper around the caller's reader; the input is consumed
    /// from it one line at a time.
    reader: BufReader<R>,
    /// Whether the start of the stream has already been classified as either
    /// a directive section or a bare document.
    disambiguated: bool,
    /// `true` while positioned in a directive section, i.e. before the `---`
    /// line that opens the document body.
    in_header: bool,
    /// A `---` line that terminated the previously returned document and has
    /// to become the first line of the next one.
    prepend_next: Option<String>,
}
102
103impl<'a, R: Read + 'a> DocumentIterator<R> {
104 /// `new()` creates a new DocumentIterator over a given `reader`'s contents.
105 ///
106 /// This reader can be a reader for a file:
107 /// ```
108 /// use std::fs::File;
109 /// # use std::fs::remove_file;
110 /// use yaml_split::DocumentIterator;
111 /// # use std::io::Write;
112 /// let mut file = File::options()
113 /// .create(true)
114 /// .write(true)
115 /// .open("test.yml")
116 /// .unwrap()
117 /// .write(b"hello: world");
118 ///
119 /// let read_file = File::open("test.yml").unwrap();
120 /// let doc_iter = DocumentIterator::new(read_file);
121 ///
122 /// for doc in doc_iter {
123 /// println!("{}", doc.unwrap());
124 /// }
125 /// # remove_file("test.yml").unwrap();
126 /// ```
127 ///
128 /// Or the reader can be a simple byte array (useful for strings):
129 /// ```
130 /// use yaml_split::DocumentIterator;
131 /// let yaml = r#"
132 /// hello: world
133 /// ---
134 /// hello: rust
135 /// "#;
136 ///
137 /// let mut doc_iter = DocumentIterator::new(yaml.as_bytes());
138 ///
139 /// assert_eq!(r#"
140 /// hello: world
141 /// "#, doc_iter.next().unwrap().unwrap());
142 /// assert_eq!(r#"---
143 /// hello: rust
144 /// "#, doc_iter.next().unwrap().unwrap());
145 /// assert_eq!(true, doc_iter.next().is_none());
146 ///
147 /// // or in a loop:
148 /// for doc in doc_iter {
149 /// println!("{}", doc.unwrap());
150 /// }
151 /// ```
152 pub fn new(reader: R) -> DocumentIterator<R> {
153 let br = BufReader::new(reader);
154
155 DocumentIterator {
156 reader: br,
157 disambiguated: false,
158 in_header: false,
159 prepend_next: None,
160 }
161 }
162}
163
164impl<R: Read> Iterator for DocumentIterator<R> {
165 type Item = Result<String, YamlSplitError>;
166
167 fn next(&mut self) -> Option<Self::Item> {
168 let mut buf = String::new();
169 let mut current_file = match &self.prepend_next {
170 Some(next) => next.clone(),
171 None => String::new(),
172 };
173 self.prepend_next = None;
174
175 // First, we must disambiguate between a bare document and a directive at the top of the
176 // file (before any directive end "---" markers). To do this, we must look for a #, % or
177 // other non-whitespace character as the first character on a line:
178 //
179 // - # indicates a comment, the line will be ignored
180 // - % indicates a directive, we should assume the rest of the header is also a directive as
181 // % is not a valid character at the start of a line, before a --- is seen.
182 // - anything else indicates we must currently be looking at a bare document's content
183 //
184 // XXX: This loop also builds up buffers that are shared with the next loop. The reader
185 // is also shared and so the next loop will start off where this one ends and assume the
186 // buffers have the correct content.
187 loop {
188 if self.disambiguated {
189 break;
190 }
191
192 // Empty the buffer. read_line appends, and we don't want that.
193 buf.clear();
194
195 match self.reader.read_line(&mut buf) {
196 Ok(l) => {
197 if l == 0 {
198 // We hit EOF already, and it's still not clear
199 // this file must have only whitespace, comments or be completely empty.
200 return None;
201 }
202
203 for c in buf.chars() {
204 match c {
205 // Spaces, tabs and carriage returns don't tell us anything,
206 // keep searching the line.
207 ' ' | '\t' | '\r' => continue,
208 // # means this line is a comment, nothing to do.
209 // \n is a newline, also nothing to do, this line didn't
210 // tell us anything.
211 '#' | '\n' => break,
212 // % means this line is a directive, we must be in a header
213 '%' => {
214 self.disambiguated = true;
215 self.in_header = true;
216 break;
217 }
218 // anything else must mean we are in a bare document
219 _ => {
220 self.disambiguated = true;
221 self.in_header = false;
222 break;
223 }
224 };
225 }
226
227 // Append the current line to the document
228 current_file = current_file + &buf;
229 }
230 Err(e) => {
231 return Some(Err(e.into()));
232 }
233 }
234 }
235
236 // Now that we know whether we are starting off in a directive or a document, we can
237 // parse the rest of the YAML. In this loop we will look for the start and end of documents
238 // as our YAML parser does not support parsing multiple documents at once.
239 loop {
240 buf.clear();
241
242 match self.reader.read_line(&mut buf) {
243 Ok(l) => {
244 let hit_eof = l == 0;
245 let cf_len = current_file.len();
246
247 // If there is absolutely nothing to do (i.e. the current file data is empty, and
248 // we're at EOF), just exit the loop.
249 if hit_eof && cf_len == 0 {
250 return None;
251 }
252
253 let end_of_doc = buf.starts_with("...");
254 let directives_end = buf.starts_with("---");
255
256 if !self.in_header && directives_end {
257 // a new document has started already.
258 self.in_header = false;
259 // to not lose the current line, including any directives that might
260 // be on the line (after the "---"), we need to prepend it
261 // the next time someone calls next()
262 self.prepend_next = Some(buf);
263 return Some(Ok(current_file));
264 } else if end_of_doc {
265 // this document has ended, but we don't need this line.
266 // the next line must be a header, or "---"
267 self.in_header = true;
268 return Some(Ok(current_file));
269 } else if hit_eof {
270 // this document has ended, and nothing will follow.
271 return Some(Ok(current_file));
272 } else if self.in_header && directives_end {
273 self.in_header = false;
274 }
275
276 current_file = current_file + &buf;
277 }
278 Err(e) => {
279 return Some(Err(e.into()));
280 }
281 };
282 }
283 }
284}
285
#[cfg(test)]
mod tests {
    use crate::DocumentIterator;
    use std::io::BufReader;

    /// Runs the iterator over `input` to exhaustion and returns every
    /// document it produced, panicking on the first error item.
    fn split_all(input: &str) -> Vec<String> {
        DocumentIterator::new(BufReader::new(input.as_bytes()))
            .map(|doc| doc.unwrap())
            .collect()
    }

    /// A single document with no markers and no trailing newline.
    #[test]
    fn bare_document() {
        assert_eq!(split_all("abc: def"), vec!["abc: def"]);
    }

    /// A leading blank line and the `---` marker stay part of the document.
    #[test]
    fn document_with_header() {
        let input = "\n---\nabc: def\n";

        assert_eq!(split_all(input), vec![input]);
    }

    /// A directive before `---` stays part of the document.
    #[test]
    fn document_with_header_and_directive() {
        let input = "\n%YAML 1.2\n---\nabc: def\n";

        assert_eq!(split_all(input), vec![input]);
    }

    /// A bare document followed by a `---` document.
    #[test]
    fn two_documents() {
        let input = concat!("abc: def\n", "---\n", "aaa: bbb\n");

        assert_eq!(split_all(input), vec!["abc: def\n", "---\naaa: bbb\n"]);
    }

    /// Two documents with directives, separated by a `...` marker.
    #[test]
    fn two_documents_with_headers() {
        let input = concat!(
            "%YAML 1.2\n",
            "---\n",
            "abc: def\n",
            "...\n",
            "\n",
            "%YAML 1.2\n",
            "---\n",
            "aaa: bbb\n",
        );

        assert_eq!(
            split_all(input),
            vec![
                "%YAML 1.2\n---\nabc: def\n",
                "\n%YAML 1.2\n---\naaa: bbb\n",
            ]
        );
    }

    /// A mix of directives, `---` and `...` markers, an empty document and a
    /// `%`-prefixed line that is content rather than a directive.
    #[test]
    fn document_medley() {
        let input = concat!(
            "%YAML 1.2\n",
            "---\n",
            "abc: def\n",
            "---\n",
            "%YAML: \"not a real directive\"\n",
            "---\n",
            "aaa: bbb\n",
            "...\n",
            "---\n",
            "...\n",
            "---\n",
            "final: \"document\"\n",
        );

        assert_eq!(
            split_all(input),
            vec![
                "%YAML 1.2\n---\nabc: def\n",
                "---\n%YAML: \"not a real directive\"\n",
                "---\naaa: bbb\n",
                "---\n",
                "---\nfinal: \"document\"\n",
            ]
        );
    }
}
481}