1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
use std::collections::{BTreeMap, BTreeSet};

/// A single EDN value.
///
/// Scalars carry their payload directly; collection variants own their child
/// elements.
// NOTE(review): inserting into or looking up in `BTreeMap`/`BTreeSet` needs an
// `Ord` key, but `Element` only derives `Clone`, `Debug` and `PartialEq` here
// (and `f64` is not `Ord`). Confirm how `Map`/`Set` values are meant to be built.
#[derive(Clone, Debug, PartialEq)]
pub enum Element {
    /// The `nil` value.
    Nil,
    /// A string, stored as an owned `String`.
    String(String),
    /// A boolean (`true` / `false`).
    Boolean(bool),
    /// A single character.
    Character(char),
    /// A symbol, stored by its textual name.
    Symbol(String),
    /// A keyword, stored by its textual name.
    Keyword(String),
    /// A signed 64-bit integer.
    Integer(i64),
    /// A 64-bit floating point number.
    Float(f64),
    /// A list of elements.
    List(Vec<Element>),
    /// A vector of elements.
    Vector(Vec<Element>),
    /// A map from key elements to value elements.
    Map(BTreeMap<Element, Element>),
    /// A set of elements.
    Set(BTreeSet<Element>),
}

// Tagged Elements
// # followed immediately by a symbol starting with an alphabetic character indicates that that symbol is a tag.
// A tag indicates the semantic interpretation of the following element.
// It is envisioned that a reader implementation will allow clients to register handlers for specific tags.
// Upon encountering a tag, the reader will first read the next element (which may itself be or comprise other tagged elements),
// then pass the result to the corresponding handler for further interpretation, and the result of the handler will be the data value
// yielded by the tag + tagged element, i.e. reading a tag and tagged element yields one value.
// This value is the value to be returned to the program and is not further interpreted as edn data by the reader.
/// A tag name paired with the element it is attached to.
#[derive(Clone, Debug, PartialEq)]
pub struct Tagged {
    /// Name of the tag.
    // NOTE(review): whether the leading `#` is stored is not visible here — confirm.
    name: String,
    /// The element the tag applies to, boxed.
    element: Box<Element>,
}

pub mod parsers {
    use super::{Element, Tagged};

    use nom;
    use nom::IResult;

    mod helpers {
        use nom;
        use nom::bytes::complete::take_while1;
        use nom::IResult;

        /// Returns `true` when `c` may appear verbatim (without a backslash escape)
        /// inside a double-quoted string.
        ///
        /// Only the closing delimiter `"` and the escape introducer `\` are rejected.
        /// Control characters such as `\n` are accepted because strings may span
        /// multiple lines.
        fn is_nonescaped_string_char(c: char) -> bool {
            c != '"' && c != '\\'
        }

        fn nonescaped_string(input: &str) -> IResult<&str, &str> {
            take_while1(is_nonescaped_string_char)(input)
        }
    }
    // We use the full paths for the nom functions in eden because I'd like to replace
    // `complete` with `streaming` wherever applicable. EDN is suitable as a streaming
    // format (there is no top-level structure that wraps everything), so streaming parsers make sense.

    // reference:
    // https://github.com/edn-format/edn

    //// basic types
    // nil represents nil, null or nothing. It should be read as an object with similar meaning on the target platform.
    /// Parses the literal `nil` at the start of `input` into `Element::Nil`.
    ///
    /// On success, returns the unconsumed remainder together with the element.
    fn nil<'a>(input: &'a str) -> IResult<&'a str, Element> {
        let nil_literal = nom::bytes::complete::tag("nil");
        nom::combinator::map(nil_literal, |_: &'a str| Element::Nil)(input)
    }

    // true and false should be mapped to booleans.
    // If a platform has canonic values for true and false, it is a further semantic of booleans that all instances of true yield that (identical) value, and similarly for false.
    /// Parses the literal `true` or `false` at the start of `input` into
    /// `Element::Boolean`.
    ///
    /// On success, returns the unconsumed remainder together with the element.
    fn boolean<'a>(input: &'a str) -> IResult<&'a str, Element> {
        // The two literals share no prefix, so the order of the alternatives
        // does not affect which one matches.
        let literal = nom::branch::alt((
            nom::bytes::complete::tag("true"),
            nom::bytes::complete::tag("false"),
        ));
        nom::combinator::map(literal, |word: &'a str| Element::Boolean(word == "true"))(input)
    }

    // Unicode characters are represented with \uNNNN as in Java. Backslash cannot be followed by whitespace.
    // Characters are preceded by a backslash: \c, \newline, \return, \space and \tab yield the corresponding characters.
    /// Parses an EDN character literal into `Element::Character`.
    ///
    /// A character literal is a backslash followed by one of, tried in this order:
    ///
    /// 1. a named character: `newline`, `return`, `space` or `tab`;
    /// 2. `u` and exactly four hexadecimal digits (`\uNNNN`);
    /// 3. any single non-whitespace character (`\c`).
    ///
    /// The parser fails when the input does not start with a backslash or when the
    /// backslash is followed by whitespace or by nothing. A `\u` that is not followed
    /// by four hex digits forming a valid `char` is read by rule 3 as the letter `u`.
    fn character(input: &str) -> IResult<&str, Element> {
        // Named characters come first so that `\newline` is not read as `\n`
        // followed by the stray text `ewline`.
        let named = nom::branch::alt((
            nom::combinator::value('\n', nom::bytes::complete::tag("newline")),
            nom::combinator::value('\r', nom::bytes::complete::tag("return")),
            nom::combinator::value(' ', nom::bytes::complete::tag("space")),
            nom::combinator::value('\t', nom::bytes::complete::tag("tab")),
        ));

        // `uNNNN`: exactly four hex digits, converted to a code point and then to
        // a `char` (which rejects surrogate code points).
        let hex_digits =
            nom::bytes::complete::take_while_m_n(4, 4, |c: char| c.is_ascii_hexdigit());
        let code_point = nom::combinator::map_res(
            nom::sequence::preceded(nom::character::complete::char('u'), hex_digits),
            |hex: &str| u32::from_str_radix(hex, 16),
        );
        let unicode = nom::combinator::map_opt(code_point, std::char::from_u32);

        // A backslash cannot be followed by whitespace.
        let literal = nom::combinator::verify(nom::character::complete::anychar, |c: &char| {
            !c.is_whitespace()
        });

        let body = nom::branch::alt((named, unicode, literal));
        nom::combinator::map(
            nom::sequence::preceded(nom::character::complete::char('\\'), body),
            Element::Character,
        )(input)
    }

    // Strings are enclosed in "double quotes". May span multiple lines. Standard C/Java escape characters \t, \r, \n, \\ and \" are supported.
    /// Parses a double-quoted string literal into `Element::String`.
    ///
    /// Between the quotes, any character other than `"` and `\` is taken verbatim
    /// (including newlines, so strings may span multiple lines). The escapes `\t`,
    /// `\r`, `\n`, `\\` and `\"` are replaced by the character they denote.
    ///
    /// The parser fails on a missing opening or closing quote and on any other
    /// backslash escape.
    fn string(input: &str) -> IResult<&str, Element> {
        let escaped = nom::sequence::preceded(
            nom::character::complete::char('\\'),
            nom::branch::alt((
                nom::combinator::value('\t', nom::character::complete::char('t')),
                nom::combinator::value('\r', nom::character::complete::char('r')),
                nom::combinator::value('\n', nom::character::complete::char('n')),
                nom::combinator::value('\\', nom::character::complete::char('\\')),
                nom::combinator::value('"', nom::character::complete::char('"')),
            )),
        );
        let unescaped = nom::character::complete::none_of("\"\\");

        // Both alternatives consume at least one character, so `many0` terminates
        // at the closing quote (or at an unsupported escape).
        let contents = nom::multi::many0(nom::branch::alt((unescaped, escaped)));
        let quoted = nom::sequence::delimited(
            nom::character::complete::char('"'),
            contents,
            nom::character::complete::char('"'),
        );

        nom::combinator::map(quoted, |chars: Vec<char>| {
            Element::String(chars.into_iter().collect())
        })(input)
    }

    // Symbols begin with a non-numeric character and can contain alphanumeric characters and . * + ! - _ ? $ % & = < >.
    // If -, + or . are the first character, the second character (if any) must be non-numeric.
    // Additionally, : # are allowed as constituent characters in symbols other than as the first character.

    // / has special meaning in symbols. It can be used once only in the middle of a symbol to separate the prefix (often a namespace) from the name,
    // e.g. my-namespace/foo. / by itself is a legal symbol, but otherwise neither the prefix nor the name part can be empty when the symbol contains /

    // If a symbol has a prefix and /, the following name component should follow the first-character restrictions for symbols as a whole.
    // This is to avoid ambiguity in reading contexts where prefixes might be presumed as implicitly included namespaces and elided thereafter.
    /// Stub: recognises only the literal `nil` and yields `Element::Nil`.
    ///
    /// No symbol syntax is parsed and `Element::Symbol` is never produced.
    // TODO(review): replace with a real symbol parser.
    fn symbol(input: &str) -> IResult<&str, Element> {
        let nil_literal = nom::bytes::complete::tag("nil");
        nom::combinator::map(nil_literal, |_: &str| Element::Nil)(input)
    }

    // Symbols are used to represent identifiers, and should map to something other than strings, if possible.

    // Keywords are identifiers that typically designate themselves. They are semantically akin to enumeration values.
    // Keywords follow the rules of symbols, except they can (and must) begin with :, e.g. :fred or :my/fred.
    // If the target platform does not have a keyword type distinct from a symbol type, the same type can be used without conflict,
    // since the mandatory leading : of keywords is disallowed for symbols.
    // Per the symbol rules above, :/ and :/anything are not legal keywords. A keyword cannot begin with ::

    // If the target platform supports some notion of interning, it is a further semantic of keywords that all instances of the same keyword yield the identical object.

    /// Stub: recognises only the literal `nil` and yields `Element::Nil`.
    ///
    /// No keyword syntax is parsed and `Element::Keyword` is never produced.
    // TODO(review): replace with a real keyword parser.
    fn keyword(input: &str) -> IResult<&str, Element> {
        let nil_literal = nom::bytes::complete::tag("nil");
        nom::combinator::map(nil_literal, |_: &str| Element::Nil)(input)
    }
    // Integers consist of the digits 0 - 9, optionally prefixed by - to indicate a negative number, or (redundantly) by +. No integer other than 0 may begin with 0. 64-bit (signed integer) precision is expected. An integer can have the suffix N to indicate that arbitrary precision is desired. -0 is a valid integer not distinct from 0.
    /*
    integer
      int
      int N
    digit
      0-9
    int
      digit
      1-9 digits
      + digit
      + 1-9 digits
      - digit
      - 1-9 digits
    */

    /// Parses an EDN integer literal into `Element::Integer`.
    ///
    /// Accepted form: an optional `+` or `-`, then either a lone `0` or a run of
    /// digits that does not start with `0`, then an optional `N` suffix.
    ///
    /// The value is stored as `i64`. The `N` (arbitrary precision) suffix is
    /// consumed but does not widen the type, so a literal that does not fit in
    /// `i64` makes the parser fail. `-0` parses to `0`.
    fn integer(input: &str) -> IResult<&str, Element> {
        let sign = nom::combinator::opt(nom::character::complete::one_of("+-"));

        // `digit1` is greedy, so a literal such as `012` is rejected as a whole
        // instead of being read as `0` followed by `12`.
        let magnitude =
            nom::combinator::verify(nom::character::complete::digit1, |digits: &str| {
                digits == "0" || !digits.starts_with('0')
            });

        let literal = nom::combinator::recognize(nom::sequence::pair(sign, magnitude));
        // `i64::from_str` accepts a leading `+` or `-` and reports overflow as an error.
        let number = nom::combinator::map_res(literal, |text: &str| text.parse::<i64>());
        let with_suffix = nom::sequence::terminated(
            number,
            nom::combinator::opt(nom::character::complete::char('N')),
        );

        nom::combinator::map(with_suffix, Element::Integer)(input)
    }

    // 64-bit (double) precision is expected.
    // floating-point-number
    /*
      int M
      int frac
      int exp
      int frac exp
    digit
      0-9
    int
      digit
      1-9 digits
      + digit
      + 1-9 digits
      - digit
      - 1-9 digits
    frac
      . digits
    exp
      ex digits
    digits
      digit
      digit digits
    ex
      e
      e+
      e-
      E
      E+
      E-
    */
    /// Parses an EDN floating point literal into `Element::Float`.
    ///
    /// Accepted forms, where `int` is an optionally signed integer without a
    /// redundant leading zero:
    ///
    /// * `int frac`, `int exp`, `int frac exp` — e.g. `1.5`, `-2e10`, `+3.25E-4`;
    /// * `int M` — e.g. `12M`.
    ///
    /// The value is stored as `f64`. The `M` (exact precision) suffix is consumed
    /// but the number is still converted to `f64`. A bare integer such as `42` is
    /// not accepted.
    fn floating(input: &str) -> IResult<&str, Element> {
        // `int`: optional sign, then either a lone `0` or digits not starting with `0`.
        fn int(input: &str) -> IResult<&str, &str> {
            nom::combinator::recognize(nom::sequence::pair(
                nom::combinator::opt(nom::character::complete::one_of("+-")),
                nom::combinator::verify(nom::character::complete::digit1, |digits: &str| {
                    digits == "0" || !digits.starts_with('0')
                }),
            ))(input)
        }

        // `frac`: a dot followed by at least one digit.
        fn frac(input: &str) -> IResult<&str, &str> {
            nom::combinator::recognize(nom::sequence::pair(
                nom::character::complete::char('.'),
                nom::character::complete::digit1,
            ))(input)
        }

        // `exp`: `e` or `E`, an optional sign, then at least one digit.
        fn exp(input: &str) -> IResult<&str, &str> {
            nom::combinator::recognize(nom::sequence::tuple((
                nom::character::complete::one_of("eE"),
                nom::combinator::opt(nom::character::complete::one_of("+-")),
                nom::character::complete::digit1,
            )))(input)
        }

        // `int frac`, `int frac exp` or `int exp`, captured as one slice of the input.
        let decimal = nom::combinator::recognize(nom::sequence::pair(
            int,
            nom::branch::alt((
                nom::combinator::recognize(nom::sequence::pair(frac, nom::combinator::opt(exp))),
                exp,
            )),
        ));
        // `int M`: only the `int` text is kept for conversion.
        let exact = nom::sequence::terminated(int, nom::character::complete::char('M'));

        let number = nom::combinator::map_res(nom::branch::alt((decimal, exact)), |text: &str| {
            text.parse::<f64>()
        });
        nom::combinator::map(number, Element::Float)(input)
    }

    //// collections
    // A list is a sequence of values. Lists are represented by zero or more elements enclosed in parentheses (). Note that lists can be heterogeneous.
    /// Stub: recognises only the literal `nil` and yields `Element::Nil`.
    ///
    /// No parenthesised list is parsed and `Element::List` is never produced.
    // TODO(review): replace with a real list parser.
    fn list(input: &str) -> IResult<&str, Element> {
        let nil_literal = nom::bytes::complete::tag("nil");
        nom::combinator::map(nil_literal, |_: &str| Element::Nil)(input)
    }

    // A vector is a sequence of values that supports random access.
    // Vectors are represented by zero or more elements enclosed in square brackets [].
    // Note that vectors can be heterogeneous.
    /// Stub: recognises only the literal `nil` and yields `Element::Nil`.
    ///
    /// No bracketed vector is parsed and `Element::Vector` is never produced.
    // TODO(review): replace with a real vector parser.
    fn vector(input: &str) -> IResult<&str, Element> {
        let nil_literal = nom::bytes::complete::tag("nil");
        nom::combinator::map(nil_literal, |_: &str| Element::Nil)(input)
    }

    // A map is a collection of associations between keys and values.
    // Maps are represented by zero or more key and value pairs enclosed in curly braces {}.
    // Each key should appear at most once. No semantics should be associated with the order in which the pairs appear.
    /// Stub: recognises only the literal `nil` and yields `Element::Nil`.
    ///
    /// No braced map is parsed and `Element::Map` is never produced.
    // TODO(review): replace with a real map parser.
    fn map(input: &str) -> IResult<&str, Element> {
        let nil_literal = nom::bytes::complete::tag("nil");
        nom::combinator::map(nil_literal, |_: &str| Element::Nil)(input)
    }

    // A set is a collection of unique values. Sets are represented by zero or more elements enclosed in curly braces preceded by #, i.e. #{}.
    // No semantics should be associated with the order in which the elements appear. Note that sets can be heterogeneous.
    /// Stub: recognises only the literal `nil` and yields `Element::Nil`.
    ///
    /// No `#{}` set is parsed and `Element::Set` is never produced.
    // TODO(review): replace with a real set parser.
    fn set(input: &str) -> IResult<&str, Element> {
        let nil_literal = nom::bytes::complete::tag("nil");
        nom::combinator::map(nil_literal, |_: &str| Element::Nil)(input)
    }

    /// Stub: recognises only the literal `nil` and yields `Element::Nil`.
    ///
    /// It does not dispatch to any of the other parsers in this module.
    // TODO(review): presumably meant to try every element parser in turn — confirm.
    fn element(input: &str) -> IResult<&str, Element> {
        let nil_literal = nom::bytes::complete::tag("nil");
        nom::combinator::map(nil_literal, |_: &str| Element::Nil)(input)
    }

    /// Stub: recognises only the literal `nil` and yields `Element::Nil`.
    ///
    /// No tag syntax is parsed.
    // TODO(review): presumably meant to parse tagged elements (`#name element`) — confirm.
    fn tag(input: &str) -> IResult<&str, Element> {
        let nil_literal = nom::bytes::complete::tag("nil");
        nom::combinator::map(nil_literal, |_: &str| Element::Nil)(input)
    }
}