---- -*- Mode: rpl; -*-
----
---- csv.rpl RPL patterns for CSV files
----
---- © Copyright IBM Corporation 2017.
---- LICENSE: MIT License (https://opensource.org/licenses/mit-license.html)
---- AUTHOR: Jamie A. Jennings
package csv
-- Patterns for unquoted fields, i.e. match everything until the next delimiter.
-- There is one pattern per delimiter handled in this file (comma, semicolon, pipe).
-- Each is a zero-or-more repetition, so an empty field also matches.
local uqf_comma = [^,]*
local uqf_semicolon = [^;]*
local uqf_pipe = [^|]*
-- Next, we build up some definitions needed for parsing quoted fields.
-- Note: Some CSV files may use a pair of quote marks in a row to embed a quote mark in a string.
-- So we define an escaped quote as either the escape character (\) followed by a quote, or
-- two quotes in a row.
-- Two escaped quotes in a row (\"\" or \'\') need no alternative of their own. The choice
-- operator (/) is ordered and the single escaped quote is a prefix of the doubled one, so a
-- separate doubled alternative listed after it could never be selected. The repetition in the
-- field-contents patterns consumes the second escaped quote on its next iteration.
local alias escaped_dquote = "\\\"" / {["]["]} -- \" or ""
local alias escaped_squote = "\\'" / "''" -- \' or ''
-- The character set that contains one double quote character is written ["] in the patterns
-- below. This can interfere with syntax highlighting in some editors; if it does, [\"] is an
-- equivalent spelling, because in rpl any character can be escaped. (Some characters, like 'n',
-- have special meanings when escaped: \n means newline.)
-- Quoted fields, one pair of patterns per quote style. The contents are zero or more items,
-- each of which is either an escaped quote or any single character other than the enclosing
-- quote mark. Because the escaped-quote alternative is tried first, a backslash only acts as an
-- escape when a quote follows it; otherwise it is matched as an ordinary character.
-- The field itself is the contents placed between an opening and a closing quote mark.
local dquoted_field_contents = {escaped_dquote / [^"]}* -- contents of a " quoted field
local dquoted_field = ["] dquoted_field_contents ["] -- the actual " quoted field
local squoted_field_contents = {escaped_squote / [^']}* -- contents of a ' quoted field
local squoted_field = ['] squoted_field_contents ['] -- the actual ' quoted field
-- A quoted field is either a double quoted or a single quoted field; the double quoted form is
-- listed first in the choice, so it is the one tried first.
-- No need to see 'quoted_field' in the json output, so make it an alias:
local alias quoted_field = dquoted_field / squoted_field
-- Patterns to match a single field, one per delimiter.
-- The quoted form is tried first; if it does not match, the field is matched as unquoted text
-- running up to the next delimiter (possibly empty).
alias field_comma = quoted_field / uqf_comma
alias field_semicolon = quoted_field / uqf_semicolon
alias field_pipe = quoted_field / uqf_pipe
-----------------------------------------------------------------------------
-- Top level patterns
-----------------------------------------------------------------------------
-- Each top level pattern matches one record: a field, followed by zero or more
-- delimiter-and-field pairs, followed by the end of the input ($).
-- Choose the pattern named after the delimiter the data uses: comma, semicolon, or pipe.
comma = field_comma {[,] field_comma}* $
semicolon = field_semicolon {[;] field_semicolon}* $
pipe = field_pipe {[|] field_pipe}* $