csv_guillotine/
lib.rs

1extern crate csv;
2
3use std::io::Read;
4use csv::ReaderBuilder;
5
6#[cfg(test)]
7use std::io::BufReader;
8
/// Test-only in-memory byte source that serves the bytes of `src` in order.
#[cfg(test)]
struct FakeCsvReader {
    /// Text whose bytes are handed out by `read`.
    src: String,
    /// Offset into `src` of the next byte to hand out.
    pos: usize,
    /// Upper bound on the bytes returned by one `read` call; `None` means no
    /// bound beyond the caller's buffer size.
    max_read: Option<usize>,
}


#[cfg(test)]
impl FakeCsvReader {
    /// Builds a reader over `strng` that returns at most `size` bytes per read.
    pub fn new_by_size(strng: String, size: usize) -> FakeCsvReader {
        FakeCsvReader {
            max_read: Some(size),
            ..FakeCsvReader::new(strng)
        }
    }

    /// Builds a reader over `strng` with no per-read limit.
    pub fn new(strng: String) -> FakeCsvReader {
        FakeCsvReader {
            src: strng,
            pos: 0,
            max_read: None,
        }
    }
}
34
35
#[cfg(test)]
impl Read for FakeCsvReader {
    /// Copies the next bytes of `src` into `buf` and advances `pos`.
    ///
    /// The number of bytes copied is the smallest of: the bytes still unread,
    /// `buf.len()`, and `max_read` when it is set. Returns `Ok(0)` once
    /// nothing can be copied.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
        let remaining = &self.src.as_bytes()[self.pos..];

        let mut amount = std::cmp::min(remaining.len(), buf.len());
        if let Some(limit) = self.max_read {
            amount = std::cmp::min(amount, limit);
        }

        buf[..amount].copy_from_slice(&remaining[..amount]);
        self.pos += amount;

        Ok(amount)
    }
}
66
67
/// Reading a `FakeCsvReader` to the end yields exactly the source text.
#[test]
fn fake_reader_works() {
    let mut reader = BufReader::new(FakeCsvReader::new("hi there".to_string()));
    let mut text = String::new();

    match reader.read_to_string(&mut text) {
        Ok(count) => println!("RESULT: {}: {}", count, text),
        Err(err) => println!("ERROR: {}", err),
    }

    assert_eq!(text, "hi there".to_string());
}
84
85
/// Fold accumulator used by `Blade::prepare` while scanning candidate lines.
struct BufferAcc {
    /// Index of the next line the fold will look at.
    current_line: usize,
    /// Index of the line that produced `count`.
    max_line: usize,
    /// Largest per-line count seen so far.
    count: usize,
}

impl std::fmt::Debug for BufferAcc {
    /// Formats as `BufferAcc(count,max_line,current_line)`.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let BufferAcc { current_line, max_line, count } = *self;
        write!(f, "BufferAcc({},{},{})", count, max_line, current_line)
    }
}
103
/// Returns `true` when `c` is a carriage return (13) or a line feed (10).
fn is_nl(c: u8) -> bool {
    c == b'\r' || c == b'\n'
}
110
111
112fn has_nl(input: &[u8]) -> bool {
113    let mut last: u8 = 0;
114    for inp in input {
115        if is_nl(last) {
116            return true;
117        }
118        last = *inp;
119    }
120    false
121}
122
123
/// Test helper: returns the UTF-8 bytes of `s` as an owned vector.
#[cfg(test)]
fn str_to_vec(s: String) -> Vec<u8> {
    s.into_bytes()
}
133
134
/// `has_nl` only reports a terminator that has at least one byte after it.
#[test]
fn test_has_nl() {
    let cases = [
        ("hi there\r\n", true),
        ("hi there\nbob", true),
        ("hi there\r", false),
        ("hi there\n", false),
        ("hi there", false),
    ];

    for &(text, expected) in cases.iter() {
        assert_eq!(has_nl(&str_to_vec(text.to_string())), expected);
    }
}
143
144
145/// Writes bytes from `rdr` to `v`, however it will keep doing this until at least
146/// one new line is found.
147fn fill<R>(mut rdr: R, v: &mut Vec<u8>) -> Result<(), std::io::Error> where R: Read {
148    let mut number_of_bytes_read = 999;
149    while (number_of_bytes_read > 0) && (!has_nl(&v)) {
150        #[cfg(test)]
151        let mut bytes = [0; 8];
152        #[cfg(not(test))]
153        let mut bytes = [0; 8192];
154        number_of_bytes_read = rdr.read(&mut bytes)?;
155        v.append(&mut bytes[0..number_of_bytes_read].to_vec());
156    }
157    Result::Ok(())
158}
159
160
/// `fill` keeps reading until a terminator with trailing bytes is buffered and
/// leaves the remainder of the source unread.
#[test]
fn test_fill() {
    let source = ["full of trash", "but before the real data", "a"].join("\n");
    let mut fr = FakeCsvReader::new(source);

    let mut unprocessed = b"This is a header ".to_vec();
    fill(&mut fr, &mut unprocessed).unwrap();

    let mut rest = String::new();
    fr.read_to_string(&mut rest).unwrap();

    assert_eq!(rest, "t before the real data\na");
    assert_eq!(unprocessed, b"This is a header full of trash\nbu".to_vec());
}
179
180
181fn get_line(unprocessed: &mut Vec<u8>) -> Vec<u8> {
182
183    let get_byte_count = || {
184        let i = 0;
185        for i in 1..unprocessed.len() {
186            match (is_nl(unprocessed[i - 1]), is_nl(unprocessed[i])) {
187                (true, true) => return i + 1,
188                (true, false) => return i,
189                (false, _) => {}
190            }
191        }
192        i
193    };
194
195    let byte_count = get_byte_count();
196
197    let r = unprocessed[..byte_count].to_vec();
198    unprocessed.drain(..byte_count);
199    r
200
201}
202
/// `get_line` returns the first complete line and leaves the rest behind.
#[test]
fn test_get_line() {
    // (input, expected line, expected remainder)
    let cases = [
        ("hi there\r\n", "hi there\r\n", ""),
        ("hi there\nhow are you bob?", "hi there\n", "how are you bob?"),
        ("\rhi there", "\r", "hi there"),
        ("hi there", "", "hi there"),
        ("", "", ""),
    ];

    for &(input, line, rest) in cases.iter() {
        let mut unprocessed = str_to_vec(input.to_string());
        assert_eq!(get_line(&mut unprocessed), str_to_vec(line.to_string()));
        assert_eq!(unprocessed, str_to_vec(rest.to_string()));
    }
}
230
231
232fn count_seperators(field_seperator: u8, line: &[u8]) -> usize {
233
234    let mut rdr = ReaderBuilder::new()
235        .delimiter(field_seperator)
236        .has_headers(false)
237        .from_reader(line);
238
239    match rdr.byte_records().next() {
240        Some(rec) => {
241            rec.unwrap_or_default().len()
242        }
243        None => 0
244    }
245
246}
247
248
/// A comma-delimited line with four fields is counted as 4.
#[test]
fn test_count_seperators() {
    let line = str_to_vec("This,has,4,fields".to_string());
    let counted = count_seperators(b',', &line);
    assert_eq!(counted, 4);
}
256
257
/// An ordered list of byte chunks; each inner vector is a line or line fragment.
type Buffer = Vec<Vec<u8>>;

/// A `Read` adaptor that drops the lines preceding the line with the most
/// fields among the first `consider_lines` lines of the wrapped reader.
pub struct Blade<R>
where
    R: Read,
{
    /// The wrapped source.
    rdr: R,
    /// Delimiter byte used when counting fields per line.
    field_seperator: u8,
    /// Kept lines waiting to be handed to the caller.
    buffer: Buffer,
    /// Bytes already pulled from `rdr` that were not split into lines.
    unprocessed: Buffer,
    /// Whether the initial scan has run.
    prepared: bool,
    /// Maximum number of leading lines examined by the scan.
    consider_lines: usize,
}
268
269
270/// Takes either a line (sub vector) or part of a line (if `return_buf` is too
271/// small) from `src_buffer` and moves it into `return_buf`.
272fn read_from_buffer(src_buffer: &mut Buffer, return_buf: &mut [u8]) -> Result<usize, std::io::Error> {
273
274    if src_buffer.is_empty() {
275        return Result::Ok(0);
276    }
277
278    let mut count = src_buffer[0].len();
279    let mut shift = true;
280    let as_bytes = src_buffer.remove(0);
281
282    if return_buf.len() < as_bytes.len() {
283        count = return_buf.len();
284        shift = false;
285    }
286
287    return_buf[..count].clone_from_slice(&as_bytes[..count]);
288
289    if !shift {
290        src_buffer.insert(0, as_bytes[count..].to_vec());
291    }
292
293    Result::Ok(count)
294}
295
296
/// A source holding only an empty chunk yields zero bytes and ends up empty.
#[test]
fn test_read_from_buffer_empty() {
    let mut src_buffer: Buffer = vec![vec![]];
    let mut return_buffer = [0; 4];

    let copied = read_from_buffer(&mut src_buffer, &mut return_buffer).unwrap_or_default();

    let expected: Buffer = vec![];
    assert_eq!(copied, 0);
    assert_eq!(return_buffer, [0; 4]);
    assert_eq!(src_buffer, expected);
}
309
310
/// A chunk that fits the destination exactly is copied and removed.
#[test]
fn test_read_from_buffer_full_line() {
    let mut src_buffer = vec![vec![1, 2, 3, 4], vec![5, 6, 7, 8], vec![9]];
    let mut return_buffer = [0; 4];

    let copied = read_from_buffer(&mut src_buffer, &mut return_buffer).unwrap_or_default();

    assert_eq!(copied, 4);
    assert_eq!(return_buffer, [1, 2, 3, 4]);
    assert_eq!(src_buffer, vec![vec![5, 6, 7, 8], vec![9]]);
}
322
323
/// A chunk longer than the destination is split; the tail stays in the source.
#[test]
fn test_read_from_buffer_partial_line() {
    let mut src_buffer = vec![vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]];
    let mut return_buffer = [0; 8];

    let copied = read_from_buffer(&mut src_buffer, &mut return_buffer).unwrap_or_default();

    assert_eq!(copied, 8);
    assert_eq!(return_buffer, [1, 2, 3, 4, 5, 6, 7, 8]);
    assert_eq!(src_buffer, vec![vec![9, 10, 11, 12, 13, 14]]);
}
335
336
337/// Reads `consider_lines` from `rdr` putting them into `process_buffer`. Any
338/// left over lines will be put in `unprocessed`.
339fn prepare_fill<R>(consider_lines: usize, mut rdr: R, process_buffer: &mut Buffer, unprocessed: &mut Buffer) -> Result<(), std::io::Error> where R: Read {
340    let mut did_read = true;
341
342    let mut read_buffer = vec![];
343
344    while (process_buffer.len() < consider_lines) && did_read {
345        fill(&mut rdr, &mut read_buffer)?;
346        let mut added_length = 9;
347        did_read = false;
348        while (process_buffer.len() < consider_lines) && added_length > 0 {
349            let line = get_line(&mut read_buffer);
350            added_length = line.len();
351            if added_length > 0 {
352                process_buffer.push(line);
353                did_read = true;
354            }
355        }
356    }
357
358    let mut unproc = vec![];
359    for r in read_buffer {
360        unproc.push(r);
361    }
362
363    unprocessed.push(unproc);
364
365    Result::Ok(())
366
367}
368
369
/// With reads capped at 7 bytes, collecting two lines takes several reads and
/// the surplus bytes end up in `unprocessed`.
#[test]
fn test_prepare_fill_needs_multiple_reads() {
    let source = ["01234", "56789", "abcde", "defgh"].join("\n");
    let fr = FakeCsvReader::new_by_size(source, 7);

    let mut lines: Buffer = vec![];
    let mut unprocessed: Buffer = vec![];
    let outcome = prepare_fill(2, fr, &mut lines, &mut unprocessed).unwrap();
    assert_eq!(outcome, ());

    let expected: Buffer = vec![b"01234\n".to_vec(), b"56789\n".to_vec()];
    assert_eq!(lines, expected);
    assert_eq!(unprocessed, vec![b"ab".to_vec()]);
}
394
395
396impl<R> Blade<R> where R: Read {
397
398    fn prepare(&mut self) -> Result<usize, std::io::Error> {
399
400        let mut process_buffer = vec![];
401        let mut unprocessed = vec![];
402
403        prepare_fill(self.consider_lines, &mut self.rdr, &mut process_buffer, &mut unprocessed)?;
404
405        self.unprocessed = unprocessed;
406
407        let max = (&process_buffer).iter().fold(
408            BufferAcc { count: 0, current_line: 0, max_line: 0 },
409            |acc, line| {
410                let c = count_seperators(
411                    self.field_seperator,
412                    line
413                );
414                if c <= acc.count {
415                    let r = BufferAcc { current_line: acc.current_line + 1, ..acc };
416                    return r;
417                }
418
419                BufferAcc {
420                    count: c,
421                    current_line: acc.current_line + 1,
422                    max_line: acc.current_line
423                }
424            }
425        );
426
427        while process_buffer.len() > max.max_line {
428            self.buffer.push(process_buffer.remove(max.max_line).to_vec());
429        }
430
431        Result::Ok(self.buffer.len())
432
433    }
434
435
436    pub fn new(reader: R, field_seperator: u8, consider_lines: usize) -> Blade<R> {
437        Blade {
438            rdr: reader,
439            field_seperator,
440            unprocessed: vec![],
441            buffer: vec![],
442            prepared: false,
443            consider_lines
444        }
445    }
446
447    fn read_rest(&mut self, return_buf: &mut [u8]) -> Result<usize, std::io::Error> {
448        let length = self.unprocessed.len();
449        if length > 0 {
450            return read_from_buffer(&mut self.unprocessed, return_buf);
451        }
452        self.rdr.read(return_buf)
453    }
454}
455
456
457impl<R> Read for Blade<R> where R: Read {
458
459    fn read(&mut self, return_buf: &mut [u8]) -> Result<usize, std::io::Error> {
460
461        if !self.prepared {
462            self.prepare()?;
463            self.prepared = true;
464        }
465
466        if self.buffer.is_empty() {
467            return self.read_rest(return_buf);
468        }
469
470        read_from_buffer(&mut self.buffer, return_buf)
471
472    }
473
474}
475
476
/// With every line considered, output starts at the first line that has the
/// most comma-separated fields.
#[test]
fn it_skips_header() {
    let csv = vec![
        "This is a header".to_string(),
        "Full of nonsense, rubbish and problems".to_string(),
        "but before the real data".to_string(),
        "name,age,gender".to_string(),
        "bob,22,M".to_string(),
        "jane,21,F".to_string(),
        "freddy,19,M".to_string()
    ];

    let blade = Blade::new(FakeCsvReader::new(csv.join("\n")), b',', 20);
    let mut reader = BufReader::new(blade);
    let mut output = String::new();

    match reader.read_to_string(&mut output) {
        Ok(count) => println!("RESULT: {}: {}", count, output),
        Err(err) => println!("ERROR: {}", err),
    }

    assert_eq!(output, csv[3..].join("\n"));
}
503
504
/// With only three lines considered, the widest of those three wins even
/// though a later line would have been the real header.
#[test]
fn it_only_considers_upto_considers() {
    let csv = vec![
        "This is a header".to_string(),
        "Full of nonsense, rubbish and problems".to_string(),
        "but before the real data".to_string(),
        "name,age,gender".to_string(),
        "bob,22,M".to_string(),
        "jane,21,F".to_string(),
        "freddy,19,M".to_string()
    ];

    let blade = Blade::new(FakeCsvReader::new(csv.join("\n")), b',', 3);
    let mut reader = BufReader::new(blade);
    let mut output = String::new();

    match reader.read_to_string(&mut output) {
        Ok(count) => println!("RESULT: {}: {}", count, output),
        Err(err) => println!("ERROR: {}", err),
    }

    assert_eq!(output, csv[1..].join("\n"));
}
530
531