unicode_reader/graphemes.rs
1// Copyright (c) 2016-2019 William R. Fraser
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9use unicode_segmentation::UnicodeSegmentation;
10use std::io;
11use std::mem;
12
/// Adapter over a `char`-yielding reader that produces the data one Unicode grapheme cluster at
/// a time.
///
/// The wrapped iterator must yield `io::Result<char>` items.
pub struct Graphemes<R>
where
    R: Iterator<Item = io::Result<char>>,
{
    /// The underlying source of decoded characters.
    input: R,
    /// Characters read so far that have not yet been handed out as a grapheme cluster.
    buffer: String,
    /// An error from `input` that is being held back until the next call to `next()`.
    pending_error: Option<io::Error>,
}
19
20impl<R: Iterator<Item = io::Result<char>>> Iterator for Graphemes<R> {
21 /// The type of the elements being iterated over: a `io::Result` with one Unicode grapheme
22 /// cluster, or any I/O error raised by the underlying reader.
23 type Item = io::Result<String>;
24
25 /// Get the next grapheme cluster from the stream. Note that because grapheme clusters are of
26 /// indeterminate length, this has to read the underlying reader until the *next* cluster
27 /// starts before it can return a grapheme.
28 fn next(&mut self) -> Option<Self::Item> {
29 if let Some(err) = self.pending_error.take() {
30 return Some(Err(err));
31 }
32 loop {
33 match self.input.next() {
34 Some(Ok(codepoint)) => {
35 self.buffer.push(codepoint);
36 }
37 None => {
38 if self.buffer.is_empty() {
39 return None;
40 } else {
41 return Some(Ok(mem::replace(&mut self.buffer, String::new())));
42 }
43 }
44 Some(Err(e)) => {
45 if self.buffer.is_empty() {
46 return Some(Err(e));
47 } else {
48 // If the buffer is non-empty, consider the grapheme done and return it,
49 // but save the error and raise it next time around.
50 self.pending_error = Some(e);
51 return Some(Ok(mem::replace(&mut self.buffer, String::new())));
52 }
53 }
54 }
55
56 let mut gi = self.buffer.grapheme_indices(true).fuse();
57 if let (Some((_, first_grapheme)), Some((second_pos, _))) = (gi.next(), gi.next()) {
58 let grapheme = first_grapheme.to_owned();
59 self.buffer = unsafe { self.buffer.get_unchecked(second_pos ..) }.to_owned();
60 return Some(Ok(grapheme));
61 }
62 // Otherwise, keep reading. We need at least the start of a second grapheme in the
63 // buffer before we know where the first one ends, because otherwise there could be
64 // additional combining marks ahead.
65 }
66 }
67}
68
69impl<R: Iterator<Item = io::Result<char>>> From<R> for Graphemes<R> {
70 fn from(input: R) -> Graphemes<R> {
71 Graphemes {
72 input,
73 buffer: String::new(),
74 pending_error: None,
75 }
76 }
77}