gedcom_rs/types/
line.rs

1// use std::str::FromStr;
2
3use winnow::ascii::{alphanumeric1, digit1, line_ending, not_line_ending, space0};
4use winnow::combinator::{opt, preceded, separated_pair};
5use winnow::error::StrContext;
6use winnow::prelude::*;
7use winnow::stream::Stream;
8use winnow::token::{tag, take_till};
9
/// A single GEDCOM line.
///
/// Grammar:
/// `level + delim (space) + [optional_xref_ID] + tag + [optional_line_value] + terminator`
///
/// All string fields borrow from the text the line was parsed from. Optional
/// parts that are absent are represented by an empty string, so
/// `Line::default()` is a level-0 line with every string field empty.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Line<'a> {
    /// Nesting level of the line.
    pub level: u8,
    /// Cross-reference identifier including its surrounding `@` characters
    /// (e.g. `@U1@`); empty when the line has none.
    pub xref: &'a str,
    /// The tag naming what the line describes (e.g. `HEAD`, `SOUR`).
    pub tag: &'a str,
    /// The line value, without its terminator; empty when the line has none.
    pub value: &'a str,
}
19
20// impl std::str::FromStr for Line {
21//     // The error must be owned
22//     type Err = String;
23
24//     fn from_str(s: &str) -> Result<Self, Self::Err> {
25//         hex_color.parse(s).map_err(|e| e.to_string())
26//     }
27// }
28
29impl<'b> Line<'b> {
30    pub fn parse(input: &mut &'b str) -> PResult<Line<'b>> {
31        let mut line = Line {
32            level: 0,
33            xref: "",
34            tag: "",
35            value: "",
36        };
37        // println!("Parsing line...");
38        // println!("Starting input: '{}'", input);
39        if !input.is_empty() {
40            // We could rewrite this into a sequence of parsers, something like this:
41            // let (level, _, xref, _, tag, delim, value) = (
42            //     Self::level,
43            //     Self::delim,
44            //     Self::xref,
45            //     Self::delim,
46            //     Self::tag,
47            //     Self::delim,
48            //     Self::value,
49            // )
50            //     .parse_next(input).unwrap();
51
52            let level = Self::level(input);
53            match level {
54                Ok(lvl) => {
55                    line.level = lvl;
56                    let _ = Self::delim(input);
57                    // println!("Input after level: '{}'", input);
58                    match Self::xref(input) {
59                        Ok(xref) => {
60                            line.xref = xref;
61                        }
62                        Err(_e) => {
63                            todo!();
64                        }
65                    }
66                    if !line.xref.is_empty() {
67                        let _ = Self::delim(input);
68                    }
69                    line.tag = Self::tag(input)?;
70                    let _ = Self::delim(input);
71
72                    // println!("Input: '{}'", input);
73                    let is_eol = Self::peek_eol(input)?;
74                    // println!("EOL: {}", is_eol);
75                    if is_eol {
76                        // println!("Eating eol");
77                        Self::eol(input).unwrap();
78                        // let _ = Self::delim(input);
79                    } else {
80                        // println!("eating delim");
81                        // Self::eol(input).unwrap();
82                        Self::delim(input).unwrap();
83                        line.value = Self::value(input)?;
84                        // println!("Input after value: '{}'", input);
85
86                        let is_eol = Self::peek_eol(input)?;
87                        if is_eol {
88                            Self::eol(input).unwrap();
89                        }
90                    }
91                }
92                Err(e) => {
93                    println!("Err: {}", e);
94                    println!("Error parsing line: '{}'", input);
95                    Self::eol(input).unwrap();
96                    /*
97                    There's a case where a line is simply the extension of the
98                    previous line because of an embedded newline. This is common
99                    in Ancestry source data, IME. Technically, it's incorrect
100                    according to spec; the data should use a CONC/CONT to indicate
101                    a break on a new line.
102
103                    What we can attempt to do is parse the line as the value, as
104                    if it were a CONCatonation. We don't have a line level, nor
105                    do we know what the previous line is, so we'll set it to
106                    u8::MAX, I guess, and add a special use-case for that.
107                     */
108
109                    // line.level = u8::MAX;
110                    // line.tag = "CONC";
111                    // line.value = Self::value(input)?;
112                    // println!("New value: '{:?}'", line);
113
114                    // there's a case where the value of a line contains a newline,
115                    // breaking it into its own line. I think it's techically
116                    // invalid, according to spec; it should use CONC/CONT.
117                    // It's common in Ancestry source data so may as well work
118                    // to handle it.
119                }
120            }
121        }
122        // println!("ending input: '{}'", input);
123        // println!("done. {:?}", line);
124        Ok(line)
125    }
126
127    /// Peek ahead at the next line without consuming it.
128    pub fn peek(input: &mut &'b str) -> PResult<Line<'b>> {
129        let start = input.checkpoint();
130        let line = Line::parse(input).unwrap();
131
132        input.reset(start);
133        Ok(line)
134    }
135
136    /// Parse a number from the string, but return it as an actual Rust number, not a string.
137    fn level(input: &mut &str) -> PResult<u8> {
138        // parse_to works because it uses FromStr, which is effectively
139        // a convienence function around try_map
140        // digit1.try_map(str::parse).parse_next(input)
141        digit1
142            .context(StrContext::Label("level"))
143            .parse_to()
144            .parse_next(input)
145    }
146
147    /// Parse a number from the string, but return it as an actual Rust number, not a string.
148    // fn peek_level<'s>(input: &mut &'s str) -> PResult<u8> {
149    //     let start = input.checkpoint();
150
151    //     let level = Self::level(input).unwrap();
152    //     input.reset(start);
153    //     Ok(level)
154    // }
155
156    /// Parse the delimiter
157    fn delim(input: &mut &'b str) -> PResult<&'b str> {
158        space0.context(StrContext::Label("delim")).parse_next(input)
159    }
160
161    fn eol(input: &mut &'b str) -> PResult<&'b str> {
162        // multispace0.context(StrContext::Label("eol2")).parse_next(input)
163        line_ending
164            .context(StrContext::Label("eol"))
165            .parse_next(input)
166
167        // println!("EOL start input: '{}'", input);
168        // let res = line_ending.context(StrContext::Label("eol")).parse_next(input);
169        // println!("EOL end input: '{}'", input);
170
171        // res
172    }
173
174    /// Peek at the next character to see if it's a newline
175    fn peek_eol(input: &mut &'b str) -> PResult<bool> {
176        if input.starts_with('\n') || input.starts_with("\r\n") {
177            return Ok(true);
178        }
179
180        // let start = input.checkpoint();
181        // let res = Self::eol(input);
182        // input.reset(start);
183
184        // if !res.is_err() {
185        //     let is_eol = res.unwrap();
186        //     return Ok(!is_eol.is_empty());
187        // }
188        Ok(false)
189        // let is_eol = Self::eol(input).unwrap();
190
191        // input.reset(start);
192        // Ok(!is_eol.is_empty())
193    }
194
195    fn tag(input: &mut &'b str) -> PResult<&'b str> {
196        // one of: a-zA-Z_
197        let parser = preceded(opt(tag("_")), alphanumeric1)
198            .recognize()
199            .verify(|o: &str| o.len() <= 31);
200
201        parser.context(StrContext::Label("tag")).parse_next(input)
202    }
203
204    fn value(input: &mut &'b str) -> PResult<&'b str> {
205        not_line_ending
206            .context(StrContext::Label("value"))
207            .parse_next(input)
208    }
209
210    /// Parse the xref, if present
211    ///
212    /// TODO: Return the leading/trailing @ portion of the xref
213    fn xref(input: &mut &'b str) -> PResult<&'b str> {
214        if input.starts_with('@') {
215            let mut parser =
216                separated_pair(tag("@"), take_till(0.., |c| c == '@'), tag("@")).recognize();
217            return parser.parse_next(input);
218
219            // println!("Parsing xref: '{}'", input);
220            // let mut parser = delimited(tag("@"), take_till(0.., |c| c == '@'), tag("@"));
221            // let res = parser.context(StrContext::Label("xref")).parse_next(input);
222
223            // if !res.is_err() {
224            //     let mut xref = res.unwrap();
225            //     xref += "@";
226            //     return Ok("@1@");
227            // }
228            // take_till(1.., |c| c == '@').parse_next(input)
229            // let mut parser = delimited(
230            //     tag("@"),
231            //     is_not("@"),
232            //     tag("@"),
233            // );
234
235            // parser(input)
236        }
237        Ok("")
238    }
239}
240
#[cfg(test)]
mod tests {
    use super::*;

    /// Build the expected value for a parsed line.
    fn line<'a>(level: u8, xref: &'a str, tag: &'a str, value: &'a str) -> Line<'a> {
        Line {
            level,
            xref,
            tag,
            value,
        }
    }

    /// Each sample is parsed on its own and compared field-for-field.
    #[test]
    fn parse_lines() {
        let cases = [
            ("0 HEAD", line(0, "", "HEAD", "")),
            ("1 CHAR UTF-8", line(1, "", "CHAR", "UTF-8")),
            (
                "1 SOUR Ancestry.com Family Trees",
                line(1, "", "SOUR", "Ancestry.com Family Trees"),
            ),
            (
                "2 DATA Name of source data",
                line(2, "", "DATA", "Name of source data"),
            ),
            ("3 DATE 1 JAN 1998", line(3, "", "DATE", "1 JAN 1998")),
            (
                "3 COPR Copyright of source data",
                line(3, "", "COPR", "Copyright of source data"),
            ),
            // An `@…@` pointer in value position is a value, not an xref.
            ("1 SUBM @U1@", line(1, "", "SUBM", "@U1@")),
            // The xref keeps its surrounding `@` characters.
            ("0 @U1@ SUBM", line(0, "@U1@", "SUBM", "")),
        ];

        for (raw, expected) in cases.iter() {
            let mut input = *raw;
            let parsed = Line::parse(&mut input).unwrap();
            assert_eq!(parsed, *expected, "parsing {:?}", raw);
            assert!(input.is_empty(), "unconsumed input after {:?}", raw);
        }
    }

    /// Consecutive lines are consumed one at a time, terminators included.
    #[test]
    fn parse_consumes_line_endings() {
        let mut input = "0 HEAD\n1 CHAR UTF-8\r\n";

        let first = Line::parse(&mut input).unwrap();
        assert_eq!(first, line(0, "", "HEAD", ""));
        assert_eq!(input, "1 CHAR UTF-8\r\n");

        let second = Line::parse(&mut input).unwrap();
        assert_eq!(second, line(1, "", "CHAR", "UTF-8"));
        assert!(input.is_empty());

        // Exhausted input yields the empty line.
        let third = Line::parse(&mut input).unwrap();
        assert_eq!(third, line(0, "", "", ""));
    }

    /// Peeking returns the same line as parsing but leaves the input alone.
    #[test]
    fn peek_does_not_consume() {
        let mut input = "0 HEAD\n1 CHAR UTF-8";

        let peeked = Line::peek(&mut input).unwrap();
        assert_eq!(input, "0 HEAD\n1 CHAR UTF-8");

        let parsed = Line::parse(&mut input).unwrap();
        assert_eq!(peeked, parsed);
        assert_eq!(input, "1 CHAR UTF-8");
    }
}