gedcom_rs/types/line.rs
1// use std::str::FromStr;
2
3use winnow::ascii::{alphanumeric1, digit1, line_ending, not_line_ending, space0};
4use winnow::combinator::{opt, preceded, separated_pair};
5use winnow::error::StrContext;
6use winnow::prelude::*;
7use winnow::stream::Stream;
8use winnow::token::{tag, take_till};
9
/// A single line of a GEDCOM file.
///
/// Layout of a line:
/// `level + delim (space) + [optional_xref_ID] + tag + [optional_line_value] + terminator`
///
/// Every string field is a slice borrowed from the text being parsed. The
/// optional parts (`xref`, `value`) hold `""` when they are not present.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Line<'a> {
    /// Level number found at the start of the line.
    pub level: u8,
    /// Cross-reference identifier, including its surrounding `@` characters,
    /// or `""` when the line has none.
    pub xref: &'a str,
    /// The tag naming the kind of line (e.g. `HEAD`, `SUBM`).
    pub tag: &'a str,
    /// The remainder of the line after the tag, or `""` when there is none.
    pub value: &'a str,
}
19
20// impl std::str::FromStr for Line {
21// // The error must be owned
22// type Err = String;
23
24// fn from_str(s: &str) -> Result<Self, Self::Err> {
25// hex_color.parse(s).map_err(|e| e.to_string())
26// }
27// }
28
29impl<'b> Line<'b> {
30 pub fn parse(input: &mut &'b str) -> PResult<Line<'b>> {
31 let mut line = Line {
32 level: 0,
33 xref: "",
34 tag: "",
35 value: "",
36 };
37 // println!("Parsing line...");
38 // println!("Starting input: '{}'", input);
39 if !input.is_empty() {
40 // We could rewrite this into a sequence of parsers, something like this:
41 // let (level, _, xref, _, tag, delim, value) = (
42 // Self::level,
43 // Self::delim,
44 // Self::xref,
45 // Self::delim,
46 // Self::tag,
47 // Self::delim,
48 // Self::value,
49 // )
50 // .parse_next(input).unwrap();
51
52 let level = Self::level(input);
53 match level {
54 Ok(lvl) => {
55 line.level = lvl;
56 let _ = Self::delim(input);
57 // println!("Input after level: '{}'", input);
58 match Self::xref(input) {
59 Ok(xref) => {
60 line.xref = xref;
61 }
62 Err(_e) => {
63 todo!();
64 }
65 }
66 if !line.xref.is_empty() {
67 let _ = Self::delim(input);
68 }
69 line.tag = Self::tag(input)?;
70 let _ = Self::delim(input);
71
72 // println!("Input: '{}'", input);
73 let is_eol = Self::peek_eol(input)?;
74 // println!("EOL: {}", is_eol);
75 if is_eol {
76 // println!("Eating eol");
77 Self::eol(input).unwrap();
78 // let _ = Self::delim(input);
79 } else {
80 // println!("eating delim");
81 // Self::eol(input).unwrap();
82 Self::delim(input).unwrap();
83 line.value = Self::value(input)?;
84 // println!("Input after value: '{}'", input);
85
86 let is_eol = Self::peek_eol(input)?;
87 if is_eol {
88 Self::eol(input).unwrap();
89 }
90 }
91 }
92 Err(e) => {
93 println!("Err: {}", e);
94 println!("Error parsing line: '{}'", input);
95 Self::eol(input).unwrap();
96 /*
97 There's a case where a line is simply the extension of the
98 previous line because of an embedded newline. This is common
99 in Ancestry source data, IME. Technically, it's incorrect
100 according to spec; the data should use a CONC/CONT to indicate
101 a break on a new line.
102
103 What we can attempt to do is parse the line as the value, as
104 if it were a CONCatonation. We don't have a line level, nor
105 do we know what the previous line is, so we'll set it to
106 u8::MAX, I guess, and add a special use-case for that.
107 */
108
109 // line.level = u8::MAX;
110 // line.tag = "CONC";
111 // line.value = Self::value(input)?;
112 // println!("New value: '{:?}'", line);
113
114 // there's a case where the value of a line contains a newline,
115 // breaking it into its own line. I think it's techically
116 // invalid, according to spec; it should use CONC/CONT.
117 // It's common in Ancestry source data so may as well work
118 // to handle it.
119 }
120 }
121 }
122 // println!("ending input: '{}'", input);
123 // println!("done. {:?}", line);
124 Ok(line)
125 }
126
127 /// Peek ahead at the next line without consuming it.
128 pub fn peek(input: &mut &'b str) -> PResult<Line<'b>> {
129 let start = input.checkpoint();
130 let line = Line::parse(input).unwrap();
131
132 input.reset(start);
133 Ok(line)
134 }
135
136 /// Parse a number from the string, but return it as an actual Rust number, not a string.
137 fn level(input: &mut &str) -> PResult<u8> {
138 // parse_to works because it uses FromStr, which is effectively
139 // a convienence function around try_map
140 // digit1.try_map(str::parse).parse_next(input)
141 digit1
142 .context(StrContext::Label("level"))
143 .parse_to()
144 .parse_next(input)
145 }
146
147 /// Parse a number from the string, but return it as an actual Rust number, not a string.
148 // fn peek_level<'s>(input: &mut &'s str) -> PResult<u8> {
149 // let start = input.checkpoint();
150
151 // let level = Self::level(input).unwrap();
152 // input.reset(start);
153 // Ok(level)
154 // }
155
156 /// Parse the delimiter
157 fn delim(input: &mut &'b str) -> PResult<&'b str> {
158 space0.context(StrContext::Label("delim")).parse_next(input)
159 }
160
161 fn eol(input: &mut &'b str) -> PResult<&'b str> {
162 // multispace0.context(StrContext::Label("eol2")).parse_next(input)
163 line_ending
164 .context(StrContext::Label("eol"))
165 .parse_next(input)
166
167 // println!("EOL start input: '{}'", input);
168 // let res = line_ending.context(StrContext::Label("eol")).parse_next(input);
169 // println!("EOL end input: '{}'", input);
170
171 // res
172 }
173
174 /// Peek at the next character to see if it's a newline
175 fn peek_eol(input: &mut &'b str) -> PResult<bool> {
176 if input.starts_with('\n') || input.starts_with("\r\n") {
177 return Ok(true);
178 }
179
180 // let start = input.checkpoint();
181 // let res = Self::eol(input);
182 // input.reset(start);
183
184 // if !res.is_err() {
185 // let is_eol = res.unwrap();
186 // return Ok(!is_eol.is_empty());
187 // }
188 Ok(false)
189 // let is_eol = Self::eol(input).unwrap();
190
191 // input.reset(start);
192 // Ok(!is_eol.is_empty())
193 }
194
195 fn tag(input: &mut &'b str) -> PResult<&'b str> {
196 // one of: a-zA-Z_
197 let parser = preceded(opt(tag("_")), alphanumeric1)
198 .recognize()
199 .verify(|o: &str| o.len() <= 31);
200
201 parser.context(StrContext::Label("tag")).parse_next(input)
202 }
203
204 fn value(input: &mut &'b str) -> PResult<&'b str> {
205 not_line_ending
206 .context(StrContext::Label("value"))
207 .parse_next(input)
208 }
209
210 /// Parse the xref, if present
211 ///
212 /// TODO: Return the leading/trailing @ portion of the xref
213 fn xref(input: &mut &'b str) -> PResult<&'b str> {
214 if input.starts_with('@') {
215 let mut parser =
216 separated_pair(tag("@"), take_till(0.., |c| c == '@'), tag("@")).recognize();
217 return parser.parse_next(input);
218
219 // println!("Parsing xref: '{}'", input);
220 // let mut parser = delimited(tag("@"), take_till(0.., |c| c == '@'), tag("@"));
221 // let res = parser.context(StrContext::Label("xref")).parse_next(input);
222
223 // if !res.is_err() {
224 // let mut xref = res.unwrap();
225 // xref += "@";
226 // return Ok("@1@");
227 // }
228 // take_till(1.., |c| c == '@').parse_next(input)
229 // let mut parser = delimited(
230 // tag("@"),
231 // is_not("@"),
232 // tag("@"),
233 // );
234
235 // parser(input)
236 }
237 Ok("")
238 }
239}
240
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `raw` as a single line, panicking with the offending text and the
    /// parser error if it cannot be parsed.
    fn parse_one(raw: &str) -> Line<'_> {
        let mut input = raw;
        Line::parse(&mut input).unwrap_or_else(|e| panic!("failed to parse {raw:?}: {e}"))
    }

    /// Shorthand for building the expected `Line` of a test case.
    fn line<'a>(level: u8, xref: &'a str, tag: &'a str, value: &'a str) -> Line<'a> {
        Line {
            level,
            xref,
            tag,
            value,
        }
    }

    #[test]
    fn parse_lines() {
        // (raw input, expected result). Whole-struct comparison means every
        // field is checked and a failure shows both values.
        let cases = [
            ("0 HEAD", line(0, "", "HEAD", "")),
            ("1 CHAR UTF-8", line(1, "", "CHAR", "UTF-8")),
            (
                "1 SOUR Ancestry.com Family Trees",
                line(1, "", "SOUR", "Ancestry.com Family Trees"),
            ),
            (
                "2 DATA Name of source data",
                line(2, "", "DATA", "Name of source data"),
            ),
            ("3 DATE 1 JAN 1998", line(3, "", "DATE", "1 JAN 1998")),
            (
                "3 COPR Copyright of source data",
                line(3, "", "COPR", "Copyright of source data"),
            ),
            // An `@...@` after the tag is a value, not an xref.
            ("1 SUBM @U1@", line(1, "", "SUBM", "@U1@")),
            // An `@...@` before the tag is the xref, returned with its `@`s.
            ("0 @U1@ SUBM", line(0, "@U1@", "SUBM", "")),
        ];

        for (raw, expected) in cases.iter() {
            assert_eq!(parse_one(raw), *expected, "unexpected parse of {raw:?}");
        }
    }
}
283}