1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
extern crate aho_corasick;
pub use self::aho_corasick::AcAutomaton; 

use self::aho_corasick::{Automaton, StreamMatches};
use std::io::{Result as IoResult, Read};
use std::clone::Clone;

use std::cell::RefCell;
use std::rc::Rc;

mod grow;
use self::grow::Grow;

/// Newtype around the shared, reference-counted `Grow` handle.
///
/// It exists so that one clone of the handle can be passed to the matcher as a
/// `Read` value while `SplitByIter` keeps another clone to inspect and drain
/// the same buffer.
#[derive(Debug)]
struct GrowWrap<R>(Rc<RefCell<Grow<R>>>);

impl<R: Read> Read for GrowWrap<R> {
    /// Forwards the read to the shared `Grow`, mutably borrowing the cell for
    /// the duration of the call only.
    fn read(&mut self, buf: &mut [u8]) -> IoResult<usize> {
        let mut inner = self.0.borrow_mut();
        inner.read(buf)
    }
}

/// An iterator that splits its input by an arbitrary number of byte sequences.
///
/// Each item is an `io::Result<Vec<u8>>` holding the bytes found between two
/// consecutive matches of the automaton.
#[derive(Debug)]
pub struct SplitByIter<'a, R, A: Automaton<&'a [u8]> + 'a > {
    // Shared handle to the buffer that wraps the reader; `matches` reads
    // through a `GrowWrap` holding a clone of this same `Rc`.
    g: Rc<RefCell<Grow<R>>>,
    // Total number of bytes already drained from the front of `g`; subtracted
    // from match offsets to turn them into offsets into the current buffer.
    pos: usize,
    // Stream of matches produced by the automaton over the wrapped reader.
    matches: StreamMatches<'a, GrowWrap<R>, &'a [u8], A>,
}

impl<'a, R: Read, A: Automaton<&'a [u8]> > SplitByIter<'a, R, A> {

    /// Produces the next raw chunk, which may be empty.
    ///
    /// Yields the bytes that precede the next match, then — once the matcher
    /// is exhausted — whatever is still buffered, or the I/O error reported by
    /// the matcher. `None` means nothing is left.
    fn _next(&mut self) -> Option<IoResult<Vec<u8> > > {
        // Do not hold a `borrow_mut()` guard on `self.g` across this call: the
        // matcher reads through `GrowWrap`, which mutably borrows the same
        // `RefCell`, and a second simultaneous mutable borrow panics.
        let hit = match self.matches.next() {
            Some(Ok(hit)) => hit,
            Some(Err(err)) => return Some(Err(err)),
            None => {
                // No more matches: hand out everything that is still buffered.
                let mut buf = self.g.borrow_mut();
                let tail: Vec<u8> = buf.drain_all().collect();
                return if tail.is_empty() { None } else { Some(Ok(tail)) };
            }
        };

        let mut buf = self.g.borrow_mut();
        // Match offsets are relative to the start of the stream, while the
        // buffer starts at `self.pos`.
        let base = self.pos;
        let chunk: Vec<u8> = buf.iter().take(hit.start - base).cloned().collect();

        // Discard the chunk together with the separator that terminated it.
        let consumed = hit.end - base;
        buf.drain(consumed);
        self.pos += consumed;

        Some(Ok(chunk))
    }
}

impl<'a, R: Read, A: Automaton<&'a [u8]> > Iterator for SplitByIter<'a, R, A> {
    type Item = IoResult<Vec<u8>>;

    /// Returns the next non-empty chunk or the next error; empty chunks
    /// produced by `_next` are skipped.
    fn next(&mut self) -> Option<Self::Item> {
        while let Some(item) = self._next() {
            match item {
                // Leading, trailing or back-to-back separators give empty chunks.
                Ok(ref piece) if piece.is_empty() => {}
                other => return Some(other),
            }
        }
        None
    }
}

/// Allows splitting any `Read` stream by an arbitrary number of byte sequences.
///
/// # Examples
///
/// ```
/// extern crate split_by;
/// 
/// use split_by::{SplitBy, AcAutomaton};
/// 
/// # fn main() {
/// let ac = AcAutomaton::new(vec!["--------".as_bytes(), "********".as_bytes(), "########".as_bytes()]);
/// let mut splits = br#"first
/// --------
/// second
/// ********########
/// third
/// #################
/// last"#.split_by(&ac);
///
/// assert!(splits.next().unwrap().unwrap().as_slice() == b"first\n");
/// assert!(splits.next().unwrap().unwrap().as_slice() == b"\nsecond\n");
/// assert!(splits.next().unwrap().unwrap().as_slice() == b"\nthird\n");
/// assert!(splits.next().unwrap().unwrap().as_slice() == b"\nlast");
/// assert!(splits.next().is_none());
/// # }
/// ```
///
/// The iterator never produces an empty `Vec`, even if the input begins or ends
/// with a splitter or if consecutive splitters are present.
pub trait SplitBy<'a, R: Read> {
    /// Consumes the reader and returns an iterator over the chunks of input
    /// found between matches of `searcher`.
    ///
    /// Each item is an `io::Result<Vec<u8>>`.
    fn split_by<A: Automaton<&'a [u8]> >(self, searcher: &'a A) -> SplitByIter<'a, R, A> where Self: Read;
}

impl<'a, R: Read> SplitBy<'a, R> for R {

    /// Wraps the reader in a shared `Grow` buffer and starts a match stream
    /// over it; the returned iterator keeps a second handle to the same buffer.
    fn split_by<A: Automaton<&'a [u8]> >(self, searcher: &'a A) -> SplitByIter<'a, R, A> where Self: Read {
        let shared = Rc::new(RefCell::new(Grow::new(self)));
        // The matcher gets its own clone of the handle, wrapped so it is `Read`.
        let reader = GrowWrap(shared.clone());

        SplitByIter {
            g: shared,
            pos: 0,
            matches: searcher.stream_find(reader),
        }
    }
}


#[cfg(test)]
mod tests {
    use super::{AcAutomaton, SplitBy};

    /// Splits `input` on the two-byte separator `==` and returns every chunk,
    /// panicking if the iterator yields an error.
    fn chunks(input: &str) -> Vec<Vec<u8>> {
        let ac = AcAutomaton::new(vec!["==".as_bytes()]);
        // Collect into a local first so the iterator borrowing `ac` is dropped
        // before `ac` itself.
        let out: Vec<Vec<u8>> = input.as_bytes().split_by(&ac).map(|c| c.unwrap()).collect();
        out
    }

    /// The expected chunks `1` through `8`, one byte each.
    fn digits() -> Vec<Vec<u8>> {
        (b'1'..b'9').map(|d| vec![d]).collect()
    }

    #[test]
    fn leading() {
        assert_eq!(chunks("==1==2==3==4==5==6==7==8"), digits());
    }

    #[test]
    fn trailing() {
        assert_eq!(chunks("1==2==3==4==5==6==7==8=="), digits());
    }

    // Separator at both the start and the end of the input.
    #[test]
    fn both() {
        assert_eq!(chunks("==1==2==3==4==5==6==7==8=="), digits());
    }

    #[test]
    fn consecutive() {
        assert_eq!(chunks("1====2==3==4==5==6==7==8"), digits());
    }

    #[test]
    fn empty() {
        assert!(chunks("").is_empty());
    }

    // Input that consists of nothing but a separator yields no chunks.
    #[test]
    fn plain() {
        assert!(chunks("==").is_empty());
    }

    // Without any separator the whole input comes back as a single chunk.
    #[test]
    fn not_present() {
        assert_eq!(chunks("12345678"), vec!["12345678".as_bytes().to_vec()]);
    }
}