unicode_reader/
graphemes.rs

1// Copyright (c) 2016-2019 William R. Fraser
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8
9use unicode_segmentation::UnicodeSegmentation;
10use std::io;
11use std::mem;
12
/// Adapter over a `char`-producing iterator that regroups its output into Unicode grapheme
/// clusters and hands them back one at a time.
pub struct Graphemes<R>
where
    R: Iterator<Item = io::Result<char>>,
{
    /// Underlying source of characters (or I/O errors).
    input: R,
    /// Characters already pulled from `input` that have not yet been returned to the caller.
    buffer: String,
    /// An error received from `input` whose delivery has been postponed until the next call.
    pending_error: Option<io::Error>,
}
19
20impl<R: Iterator<Item = io::Result<char>>> Iterator for Graphemes<R> {
21    /// The type of the elements being iterated over: a `io::Result` with one Unicode grapheme
22    /// cluster, or any I/O error raised by the underlying reader.
23    type Item = io::Result<String>;
24
25    /// Get the next grapheme cluster from the stream. Note that because grapheme clusters are of
26    /// indeterminate length, this has to read the underlying reader until the *next* cluster
27    /// starts before it can return a grapheme.
28    fn next(&mut self) -> Option<Self::Item> {
29        if let Some(err) = self.pending_error.take() {
30            return Some(Err(err));
31        }
32        loop {
33            match self.input.next() {
34                Some(Ok(codepoint)) => {
35                    self.buffer.push(codepoint);
36                }
37                None => {
38                    if self.buffer.is_empty() {
39                        return None;
40                    } else {
41                        return Some(Ok(mem::replace(&mut self.buffer, String::new())));
42                    }
43                }
44                Some(Err(e)) => {
45                    if self.buffer.is_empty() {
46                        return Some(Err(e));
47                    } else {
48                        // If the buffer is non-empty, consider the grapheme done and return it,
49                        // but save the error and raise it next time around.
50                        self.pending_error = Some(e);
51                        return Some(Ok(mem::replace(&mut self.buffer, String::new())));
52                    }
53                }
54            }
55
56            let mut gi = self.buffer.grapheme_indices(true).fuse();
57            if let (Some((_, first_grapheme)), Some((second_pos, _))) = (gi.next(), gi.next()) {
58                let grapheme = first_grapheme.to_owned();
59                self.buffer = unsafe { self.buffer.get_unchecked(second_pos ..) }.to_owned();
60                return Some(Ok(grapheme));
61            }
62            // Otherwise, keep reading. We need at least the start of a second grapheme in the
63            // buffer before we know where the first one ends, because otherwise there could be
64            // additional combining marks ahead.
65        }
66    }
67}
68
69impl<R: Iterator<Item = io::Result<char>>> From<R> for Graphemes<R> {
70    fn from(input: R) -> Graphemes<R> {
71        Graphemes {
72            input,
73            buffer: String::new(),
74            pending_error: None,
75        }
76    }
77}