csv 0.8.0

CSV parsing with automatic type-based decoding and encoding.
use std::error::FromError;
use std::io;
use std::str;

use serialize::Encodable;

use {ByteString, CsvResult, Encoded, Error, RecordTerminator};

/// The quoting style to use when writing CSV data.
///
/// A writer consults this style for every field it writes in order to decide
/// whether the field is wrapped in quote characters.
#[deriving(Copy)]
pub enum QuoteStyle {
    /// This puts quotes around every field. Always.
    Always,
    /// This puts quotes around fields only when necessary.
    ///
    /// They are necessary when a field contains a quote, a delimiter, the
    /// record terminator, or a `\r` or `\n` byte.
    ///
    /// Empty fields are *not* quoted in general. The one exception is a
    /// record that consists of a single empty field, which is written as
    /// `""` so that the record does not look like a blank line.
    ///
    /// This is the default.
    Necessary,
    /// This *never* puts quotes around a field.
    ///
    /// If a field requires quotes, then the writer will report an error.
    ///
    /// Note that a record consisting of a single empty field is still
    /// written as a pair of quote characters, even with this style.
    Never,
}

/// A CSV writer.
///
/// This writer provides a convenient interface for encoding CSV data. While
/// creating CSV data is much easier than parsing it, having a writer can
/// be convenient because it can handle quoting for you automatically.
/// Moreover, this particular writer supports `Encodable` types, which makes
/// it easy to write your custom types as CSV records.
///
/// All CSV data produced by this writer, with default options, conforms with
/// [RFC 4180](http://tools.ietf.org/html/rfc4180). (If certain options like
/// flexible record lengths are enabled, then compliance with RFC 4180 cannot
/// be guaranteed.)
///
/// One slight deviation is that records with a single empty field are always
/// encoded as `""`. This ensures that the record is not skipped, since some
/// CSV parsers (like the one in this crate) ignore consecutive record
/// terminators.
///
/// ### Example
///
/// Here's an example that encodes word pairs and their edit distances:
///
/// ```rust
/// let records = vec![
///     ("sticker", "mortals", 7u),
///     ("bribed", "personae", 7u),
///     ("wobbling", "poncing", 4u),
///     ("interposed", "emmett", 9u),
///     ("chocolate", "refile", 7u),
/// ];
///
/// let mut wtr = csv::Writer::from_memory();
/// for record in records.into_iter() {
///     let result = wtr.encode(record);
///     assert!(result.is_ok());
/// }
/// ```
pub struct Writer<W> {
    // Buffered sink that all CSV output is written to.
    buf: io::BufferedWriter<W>,
    // Byte written between two consecutive fields of a record.
    delimiter: u8,
    // Terminator written after every record.
    record_terminator: RecordTerminator,
    // When `false`, every record must have the same number of fields as the
    // first record written (tracked in `first_len`).
    flexible: bool,
    // Byte that opens and closes a quoted field.
    quote: u8,
    // Byte written in front of a quote inside a quoted field when
    // `double_quote` is `false`.
    escape: u8,
    // When `true`, a quote inside a quoted field is escaped by doubling it.
    double_quote: bool,
    // Policy deciding which fields get quoted.
    quote_style: QuoteStyle,
    // Number of fields in the first record written while `flexible` is
    // `false`. A value of `0` means no length has been recorded yet.
    first_len: uint,
}

impl Writer<io::IoResult<io::File>> {
    /// Builds a `Writer` whose CSV output goes to the file at `path`.
    ///
    /// A missing file is created; an existing file is truncated.
    ///
    /// The result of creating the file is handed to the writer as-is rather
    /// than being inspected here, so an error from creating the file is not
    /// reported by this constructor.
    pub fn from_file(path: &Path) -> Writer<io::IoResult<io::File>> {
        let file = io::File::create(path);
        Writer::from_writer(file)
    }
}


impl<W: io::Writer> Writer<W> {
    /// Builds a CSV writer on top of the given `io::Writer`.
    ///
    /// The writer is wrapped in an `io::BufferedWriter` for you, so there is
    /// no need to add buffering yourself.
    pub fn from_writer(w: W) -> Writer<W> {
        let buffered = io::BufferedWriter::new(w);
        Writer::from_buffer(buffered)
    }

    /// Builds a CSV writer on top of an existing buffered writer.
    ///
    /// Use this when you want control over the buffering (e.g., a different
    /// capacity). Every other constructor wraps its writer in a buffer with
    /// the default capacity.
    ///
    /// The returned writer starts out with the default configuration: `,` as
    /// the delimiter, `\n` as the record terminator, `"` as the quote, `\` as
    /// the escape, quote doubling enabled, `QuoteStyle::Necessary` and fixed
    /// (non-flexible) record lengths.
    pub fn from_buffer(buf: io::BufferedWriter<W>) -> Writer<W> {
        Writer {
            // Output sink.
            buf: buf,

            // Field and record separators.
            delimiter: b',',
            record_terminator: RecordTerminator::Any(b'\n'),

            // Quoting and escaping.
            quote: b'"',
            escape: b'\\',
            double_quote: true,
            quote_style: QuoteStyle::Necessary,

            // Record length tracking; `0` means "no record seen yet".
            flexible: false,
            first_len: 0,
        }
    }
}

impl Writer<Vec<u8>> {
    /// Builds a CSV writer that accumulates its output in memory.
    ///
    /// The CSV data written so far can be retrieved at any time with
    /// `as_string` or `as_bytes`.
    pub fn from_memory() -> Writer<Vec<u8>> {
        let sink = Vec::with_capacity(1024 * 64);
        Writer::from_writer(sink)
    }

    /// Flushes the writer and returns the CSV data written so far as a
    /// string.
    ///
    /// ### Panics
    ///
    /// Panics if the written data is not valid UTF-8. If you write byte
    /// strings that may not be valid UTF-8, use `as_bytes` instead.
    pub fn as_string<'r>(&'r mut self) -> &'r str {
        // `as_bytes` takes care of flushing the buffer first.
        let bytes = self.as_bytes();
        // This seems suspicious. If the client only writes `String` values,
        // then this can never fail. If the client is writing byte strings,
        // then they should be calling `as_bytes` instead.
        str::from_utf8(bytes).unwrap()
    }

    /// Flushes the writer and returns the CSV data written so far as raw
    /// bytes.
    pub fn as_bytes<'r>(&'r mut self) -> &'r [u8] {
        match self.buf.flush() {
            Ok(()) => {}
            // shouldn't panic with Vec<u8>
            Err(err) => panic!("Error flushing to Vec<u8>: {}", err),
        }
        self.buf.get_ref()[]
    }
}

impl<W: io::Writer> Writer<W> {
    /// Writes a record by encoding any `Encodable` value.
    ///
    /// This is the most convenient way to write CSV data. Most Rust types
    /// map to CSV data in a straight forward way. A vector is just a sequence
    /// of fields. Similarly for a struct. Enumerations of zero or one
    /// arguments are supported too. (Enums with zero arguments encode to their
    /// name, while enums of one argument encode to their constituent value.)
    /// Option types are also supported (`None` encodes to an empty field).
    ///
    /// An error is returned if the value cannot be encoded or if writing the
    /// resulting record fails.
    ///
    /// ### Example
    ///
    /// This example encodes word pairs that may or may not have their
    /// edit distances computed.
    ///
    /// ```rust
    /// extern crate serialize;
    /// # extern crate csv;
    /// # fn main() {
    ///
    /// #[deriving(Encodable)]
    /// struct Distance {
    ///     name1: &'static str,
    ///     name2: &'static str,
    ///     dist: Option<uint>,
    /// }
    ///
    /// let records = vec![
    ///     Distance { name1: "sticker", name2: "mortals", dist: None },
    ///     Distance { name1: "bribed", name2: "personae", dist: Some(7) },
    /// ];
    ///
    /// let mut wtr = csv::Writer::from_memory();
    /// for record in records.into_iter() {
    ///     let result = wtr.encode(record);
    ///     assert!(result.is_ok());
    /// }
    /// assert_eq!(wtr.as_string(),
    ///            "sticker,mortals,\nbribed,personae,7\n");
    /// # }
    /// ```
    pub fn encode<E: Encodable<Encoded, Error>>
                 (&mut self, e: E) -> CsvResult<()> {
        let mut erecord = Encoded::new();
        try!(e.encode(&mut erecord));
        self.write_bytes(erecord.unwrap().into_iter())
    }

    /// Writes a record of Unicode strings.
    ///
    /// This is meant to be the standard method provided by most CSV writers.
    /// That is, it writes a record of strings---no more and no less.
    ///
    /// This method accepts an iterator of *fields* for a single record. Each
    /// field must satisfy `Str`, which allows the caller to control
    /// allocation.
    ///
    /// ### Example
    ///
    /// This shows how to write string records.
    ///
    /// ```rust
    /// let records = vec![
    ///     vec!["sticker", "mortals", "7"],
    ///     vec!["bribed", "personae", "7"],
    ///     vec!["wobbling", "poncing", "4"],
    ///     vec!["interposed", "emmett", "9"],
    ///     vec!["chocolate", "refile", "7"],
    /// ];
    ///
    /// let mut wtr = csv::Writer::from_memory();
    /// for record in records.into_iter() {
    ///     let result = wtr.write(record.into_iter());
    ///     assert!(result.is_ok());
    /// }
    /// ```
    pub fn write<'a, Sized? S: 'a + Str, I: Iterator<&'a S>>
                (&mut self, r: I) -> CsvResult<()> {
        self.write_iter(r, |f| Ok(f.as_slice().as_bytes()))
    }

    /// Writes a record of *byte strings*.
    ///
    /// This is useful when you need to create CSV data that is not UTF-8
    /// encoded, or more likely, if you are transforming CSV data that you
    /// do not control with an unknown or malformed encoding.
    ///
    /// Note that this writes a *single* record. It accepts an iterator of
    /// *fields* for that record. Each field must satisfy the `AsSlice<u8>`
    /// trait. For example, your iterator can produce `Vec<u8>` or `&[u8]`,
    /// which allows you to avoid allocation if possible.
    ///
    /// ### Example
    ///
    /// This shows how to write records that do not correspond to a valid UTF-8
    /// encoding. (Note the use of Rust's byte string syntax!)
    ///
    /// ```rust
    /// let mut wtr = csv::Writer::from_memory();
    /// let result = wtr.write_bytes(vec![b"\xff", b"\x00"].into_iter());
    /// assert!(result.is_ok());
    ///
    /// assert_eq!(wtr.as_bytes(), b"\xff,\x00\n");
    /// ```
    pub fn write_bytes<S: AsSlice<u8>, I: Iterator<S>>
                      (&mut self, r: I) -> CsvResult<()> {
        self.write_iter(r, |f| Ok(f))
    }

    /// Writes a record of results. If any of the results resolve to an error,
    /// then writing stops and that error is returned.
    #[doc(hidden)]
    pub fn write_results<S: AsSlice<u8>, I: Iterator<CsvResult<S>>>
                        (&mut self, r: I) -> CsvResult<()> {
        self.write_iter(r, |f| f)
    }

    /// Writes a single record made up of the items produced by `r`.
    ///
    /// Each item is first converted with `as_sliceable`; the first conversion
    /// that fails stops the write and its error is returned. Fields are
    /// separated by the delimiter, quoted according to the quote style, and
    /// the record is finished with the record terminator.
    ///
    /// An error is returned for an empty record (nothing is written in that
    /// case) and, unless the writer is flexible, for a record whose number of
    /// fields differs from the first record written. In the latter case the
    /// record has already been written by the time the error is reported,
    /// since its length is only known once the iterator is exhausted.
    fn write_iter<T, R: AsSlice<u8>, I: Iterator<T>>
                 (&mut self, mut r: I, as_sliceable: |T| -> CsvResult<R>)
                 -> CsvResult<()> {
        let delim = self.delimiter;
        let mut count = 0;
        let mut last_len = 0;
        for field in r {
            if count > 0 {
                try!(self.w_bytes(&[delim]));
            }
            count += 1;
            let field = try!(as_sliceable(field));
            last_len = field.as_slice().len();
            try!(self.w_user_bytes(field.as_slice()));
        }
        // An empty record is an error. Report it before emitting a record
        // terminator so that a rejected record leaves no stray blank line in
        // the output.
        if count == 0 {
            return self.set_first_len(count);
        }
        // This tomfoolery makes sure that a record with a single empty field
        // is encoded as `""`. Otherwise, you end up with a run of consecutive
        // record terminators, which are ignored by some CSV parsers (such
        // as the one in this library).
        if count == 1 && last_len == 0 {
            match self.quote_style {
                // The field was already written as `""` above. Adding a
                // second pair would produce `""""`, which decodes to a field
                // containing a literal quote.
                QuoteStyle::Always => {}
                // NOTE(review): with `QuoteStyle::Never` this still writes
                // quotes; kept as-is to preserve existing output.
                QuoteStyle::Necessary | QuoteStyle::Never => {
                    let q = self.quote;
                    try!(self.w_bytes(&[q, q]));
                }
            }
        }
        try!(self.w_lineterm());
        self.set_first_len(count)
    }

    /// Flushes the underlying buffer.
    ///
    /// Any I/O error reported by the buffer is converted into a CSV error.
    pub fn flush(&mut self) -> CsvResult<()> {
        self.buf.flush().map_err(FromError::from_error)
    }
}

impl<W: io::Writer> Writer<W> {
    /// The delimiter to use when writing CSV data.
    ///
    /// Since the CSV writer is meant to be mostly encoding agnostic, you must
    /// specify the delimiter as a single ASCII byte. For example, to write
    /// tab-delimited data, you would use `b'\t'`.
    ///
    /// The default value is `b','`.
    pub fn delimiter(mut self, delimiter: u8) -> Writer<W> {
        self.delimiter = delimiter;
        self
    }

    /// Whether to allow flexible length records when writing CSV data.
    ///
    /// When this is set to `true`, records in the CSV data can have different
    /// lengths. By default, this is disabled, which will cause the CSV writer
    /// to return an error if it tries to write a record that has a different
    /// length than other records it has already written.
    pub fn flexible(mut self, yes: bool) -> Writer<W> {
        self.flexible = yes;
        self
    }

    /// Sets the record terminator to use when writing CSV data.
    ///
    /// By default, this is `RecordTerminator::Any(b'\n')`. If you want to
    /// use CRLF (`\r\n`) line endings, then use `RecordTerminator:CRLF`.
    pub fn record_terminator(mut self, term: RecordTerminator) -> Writer<W> {
        self.record_terminator = term;
        self
    }

    /// Set the quoting style to use when writing CSV data.
    ///
    /// By default, this is set to `QuoteStyle::Necessary`, which will only
    /// use quotes when they are necessary to preserve the integrity of data.
    pub fn quote_style(mut self, style: QuoteStyle) -> Writer<W> {
        self.quote_style = style;
        self
    }

    /// Set the quote character to use when writing CSV data.
    ///
    /// Since the CSV parser is meant to be mostly encoding agnostic, you must
    /// specify the quote as a single ASCII byte. For example, to write
    /// single quoted data, you would use `b'\''`.
    ///
    /// The default value is `b'"'`.
    pub fn quote(mut self, quote: u8) -> Writer<W> {
        self.quote = quote;
        self
    }

    /// Set the escape character to use when writing CSV data.
    ///
    /// This is only used when `double_quote` is set to `false`.
    ///
    /// Since the CSV parser is meant to be mostly encoding agnostic, you must
    /// specify the escape as a single ASCII byte.
    ///
    /// The default value is `b'\\'`.
    pub fn escape(mut self, escape: u8) -> Writer<W> {
        self.escape = escape;
        self
    }

    /// Set the quoting escape mechanism.
    ///
    /// When enabled (which is the default), quotes are escaped by doubling
    /// them. e.g., `"` escapes to `""`.
    ///
    /// When disabled, quotes are escaped with the escape character (which
    /// is `\\` by default).
    pub fn double_quote(mut self, yes: bool) -> Writer<W> {
        self.double_quote = yes;
        self
    }
}

impl<W: io::Writer> Writer<W> {
    fn err<S: StrAllocating, T>(&self, msg: S) -> CsvResult<T> {
        Err(Error::Encode(msg.into_string()))
    }

    fn w_bytes(&mut self, s: &[u8]) -> CsvResult<()> {
        self.buf.write(s).map_err(Error::Io)
    }

    fn w_user_bytes(&mut self, s: &[u8]) -> CsvResult<()> {
        if try!(self.should_quote(s)) {
            let quoted = self.quote_field(s);
            self.w_bytes(quoted.as_slice())
        } else {
            self.w_bytes(s)
        }
    }

    fn w_lineterm(&mut self) -> CsvResult<()> {
        match self.record_terminator {
            RecordTerminator::CRLF => self.w_bytes(b"\r\n"),
            RecordTerminator::Any(b) => self.w_bytes(&[b]),
        }
    }

    fn set_first_len(&mut self, cur_len: uint) -> CsvResult<()> {
        if cur_len == 0 {
            return self.err("Records must have length greater than 0.")
        }
        if !self.flexible {
            if self.first_len == 0 {
                self.first_len = cur_len;
            } else if self.first_len != cur_len {
                return self.err(format!(
                    "Record has length {} but other records have length {}",
                    cur_len, self.first_len))
            }
        }
        Ok(())
    }

    fn should_quote(&self, field: &[u8]) -> CsvResult<bool> {
        let needs = || field.iter().any(|&b| self.byte_needs_quotes(b));
        match self.quote_style {
            QuoteStyle::Always => Ok(true),
            QuoteStyle::Necessary => Ok(needs()),
            QuoteStyle::Never => {
                if !needs() {
                    Ok(false)
                } else {
                    self.err(format!(
                        "Field requires quotes, but quote style \
                         is 'Never': '{}'",
                        String::from_utf8_lossy(field)))
                }
            }
        }
    }

    fn byte_needs_quotes(&self, b: u8) -> bool {
        b == self.delimiter
        || self.record_terminator == b
        || b == self.quote
        // This is a bit hokey. By default, the record terminator is
        // '\n', but we still need to quote '\r' because the reader
        // interprets '\r' as a record terminator by default.
        || b == b'\r' || b == b'\n'
    }

    fn quote_field(&self, mut s: &[u8]) -> ByteString {
        let mut buf = Vec::with_capacity(s.len() + 2);

        buf.push(self.quote);
        loop {
            match s.position_elem(&self.quote) {
                None => {
                    buf.push_all(s);
                    break
                }
                Some(next_quote) => {
                    buf.push_all(s.slice_to(next_quote));
                    if self.double_quote {
                        buf.push(self.quote);
                        buf.push(self.quote);
                    } else {
                        buf.push(self.escape);
                        buf.push(self.quote);
                    }
                    s = s.slice_from(next_quote + 1);
                }
            }
        }
        buf.push(self.quote);
        ByteString::from_bytes(buf)
    }
}