gedcom_rs/parse.rs
1// use crate::types::{Address, Line, Source};
2// use super::types::Line;
3use super::types::*;
4
5use std::fs::File;
6
7use std::io::{self, BufRead};
8use std::path::Path;
9
10use winnow::prelude::*;
11
12/// This is pretty much a kludge to strip out U+FEFF, a Zero Width No-Break Space
13/// https://www.compart.com/en/unicode/U+FEFF
14///
15/// So far, I've only seen this with one GEDCOM, as the starting byte.
16// pub fn zero_with_no_break_space(input: &mut &str) -> PResult<&str> {
17// if input.starts_with('\u{FEFF}') {
18// let parser = tag("\u{FEFF}");
19
20// parser.parse_next(input)
21// } else {
22// Ok("")
23// }
24// }
25
26/// What did I mean to do with this? gg
27/// I think it takes the input and returns a tuple containing the tag and its
28/// optional value? I lost the thread, though, and need to retrace my steps.
29// fn get_tag_value(input: &str) -> IResult<&str, (&str, &str)> {
30
31// Ok((input, ("", "")))
32// }
33
34/// Read the next tag's value and any continuations
35pub fn get_tag_value(input: &mut &str) -> PResult<Option<String>> {
36 let mut line = Line::parse(input).unwrap();
37
38 // Seed the value with the initial value
39 let mut text: String = line.value.to_string();
40
41 line = Line::peek(input).unwrap();
42 while line.tag == "CONC" || line.tag == "CONT" {
43 // consume
44 line = Line::parse(input).unwrap();
45
46 if line.tag == "CONT" {
47 text += "\n";
48 } else {
49 text += " ";
50 }
51 text += line.value;
52
53 // peek ahead
54 line = Line::peek(input).unwrap();
55 }
56
57 Ok(Some(text))
58}
59
60/// Parse the buffer if the CONC tag is found and return the resulting string.
61// pub fn conc(input: &mut &str) -> PResult<Option<String>> {
62// let line = Line::parse(input).unwrap();
63
64// if line.tag == "CONC" {
65// Ok(Some(line.value.to_string()))
66// } else {
67// Ok(None)
68// }
69// }
70
71/// Parse the buffer if the CONT tag is found and return the resulting string.
72/// TODO: Refactor this. It should handle CONT and CONC.
73// pub fn cont(input: &mut &str) -> PResult<Option<String>> {
74// let line = Line::parse(input).unwrap();
75
76// if line.tag == "CONT" {
77// Ok(Some(line.value.to_string()))
78// } else {
79// Ok(None)
80// }
81// }
82
83/// Parse a GEDCOM file
84pub fn parse_gedcom(filename: &str) -> Gedcom {
85 // Initialize an empty gedcom
86 let mut gedcom = Gedcom {
87 header: Header {
88 encoding: None,
89 copyright: None,
90 date: None,
91 destination: None,
92 gedcom_version: None,
93 language: None,
94 filename: None,
95 note: None,
96 source: None,
97 submitter: None,
98 submission: None,
99 },
100 individuals: vec![],
101 };
102
103 if let Ok(lines) = read_lines(filename) {
104 // Consumes the iterator, returns an (Optional) String
105
106 // Read through the lines and build a buffer of <records>, each starting
107 // with a zero and ending with the last line before the next. Then feed that
108 // buffer to a nom parser to split it into Lines?
109
110 // This is kind of like a buffered read, specific to the GEDCOM format
111 // We read into the buffer until we hit a new record, and then parse that
112 // record into a struct.
113 let mut record: String = String::new();
114
115 for mut buffer in lines.flatten() {
116 // Strip off any leading Zero Width No-Break Space
117 if buffer.strip_prefix('\u{FEFF}').is_some() {
118 buffer.remove(0);
119 }
120 // println!("Buffer: \n'{}'", buffer);
121 // record = buffer.clone() + "\n";
122
123 if let Some(ch) = buffer.chars().next() {
124 if ch == '0' && !record.is_empty() {
125 let mut input: &str = record.as_str();
126
127 // Peek at the first line in the record so we know how
128 // to parse it.
129 let line = Line::peek(&mut input).unwrap();
130 // println!("Got a line: {:?}", line);
131 match line.tag {
132 "HEAD" => {
133 // println!("Parsing HEAD: \n{}", input);
134 gedcom.header = Header::parse(input.to_string());
135 }
136 "INDI" => {
137 // let indi = Individual::parse(buff.to_string());
138 // // TODO: Remove the if. This is just to clean up the output for debugging.
139 // if indi.xref.clone().unwrap() == "@I1@" {
140 // gedcom.individuals.push(indi);
141 // }
142 }
143 "SOUR" => {}
144 "REPO" => {}
145 "OBJE" => {
146 // let obj = Object::parse(buff);
147 // println!("{:?}", obj);
148 }
149 "FAM" => {}
150 "SUBM" => {
151 // // The record of the submitter of the family tree
152 // // Not always present (it exists in complete.ged)
153 // if let Some(ref subm) = gedcom.header.submitter {
154 // if let Some(xref) = &subm.xref {
155 // gedcom.header.submitter =
156 // Submitter::find_by_xref(buff, xref.to_string());
157 // }
158 // }
159 }
160 _ => {}
161 };
162
163 record.clear();
164 }
165 record = record + &buffer.clone() + "\n";
166 // println!("Record: {:?}", record);
167 }
168
169 // match Line::peek(&mut linebuff) {
170 // Ok(line) => {
171 // if line.level == 0 && line.tag == "HEAD" {
172 // // Consume the line
173 // Line::parse(&mut linebuff).unwrap();
174 // } else if line.level == 1 {
175 // // println!("Found an inner tag: {}", line.tag);
176 // match line.tag {
177 // "CHAR" => {
178 // gedcom.header.encoding = Some(line.value.to_string());
179 // Line::parse(&mut linebuff).unwrap();
180 // }
181 // "INDI" => {
182 // let indi = Individual::parse(buff.to_string());
183 // // TODO: Remove the if. This is just to clean up the output for debugging.
184 // if indi.xref.clone().unwrap() == "@I1@" {
185 // gedcom.individuals.push(indi);
186 // }
187 // }
188 // "SOUR" => {}
189 // "REPO" => {}
190 // "OBJE" => {
191 // let obj = Object::parse(buff);
192 // println!("{:?}", obj);
193 // }
194 // "FAM" => {}
195 // "SUBM" => {
196 // // The record of the submitter of the family tree
197 // // Not always present (it exists in complete.ged)
198 // if let Some(ref subm) = gedcom.header.submitter {
199 // if let Some(xref) = &subm.xref {
200 // gedcom.header.submitter =
201 // Submitter::find_by_xref(buff, xref.to_string());
202 // }
203 // }
204 // }
205 // _ => {
206 // // println!("Unhandled header tag: {}", line.tag);
207 // // (_, _) = Line::parse(&buffer).unwrap();
208 // }
209 // };
210 // // } else {
211 // // (_, _) = Line::parse(&buffer).unwrap();
212 // }
213
214 // // println!("line: {:?}", line);
215
216 // }
217 // Err(_e) => {
218 // println!("Error parsing line: '{}'", buffer);
219 // }
220 // }
221
222 // if let Some(ch) = buffer.chars().next() {
223 // if ch == '0' && !record.is_empty() {
224 // // We found a new record, beginning with buffer, so
225 // // process the data in `record` before continuing
226
227 // // Peek at the next line to see where we're at.
228 // // let (buff, line) = Line::peek(&record).unwrap();
229 // let line = Line::peek(record).unwrap();
230
231 // match line.tag {
232 // "HEAD" => {
233 // gedcom.header = Header::parse(buff.to_string());
234 // }
235 // "INDI" => {
236 // let indi = Individual::parse(buff.to_string());
237 // // TODO: Remove the if. This is just to clean up the output for debugging.
238 // if indi.xref.clone().unwrap() == "@I1@" {
239 // gedcom.individuals.push(indi);
240 // }
241 // }
242 // "SOUR" => {}
243 // "REPO" => {}
244 // "OBJE" => {
245 // let obj = Object::parse(buff);
246 // println!("{:?}", obj);
247 // }
248 // "FAM" => {}
249 // "SUBM" => {
250 // // The record of the submitter of the family tree
251 // // Not always present (it exists in complete.ged)
252 // if let Some(ref subm) = gedcom.header.submitter {
253 // if let Some(xref) = &subm.xref {
254 // gedcom.header.submitter =
255 // Submitter::find_by_xref(buff, xref.to_string());
256 // }
257 // }
258 // }
259 // _ => {}
260 // };
261
262 // record.clear();
263 // }
264 // }
265 // record = record + &buffer.clone() + "\n";
266 }
267 // TODO: families
268 // TODO: repositories
269 // TODO: sources
270 // TODO: multimedia
271 }
272 gedcom
273}
274
/// Open `filename` and return an iterator over its lines.
///
/// The iterator is wrapped in an `io::Result` so that a failure to open the
/// file can be matched on by the caller; each item it yields is itself an
/// `io::Result<String>`.
///
/// See https://doc.rust-lang.org/rust-by-example/std_misc/file/read_lines.html
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
where
    P: AsRef<Path>,
{
    File::open(filename).map(|handle| io::BufReader::new(handle).lines())
}
285
#[cfg(test)]
mod tests {
    use super::*;

    /// A value followed by a `CONT` and a `CONC` line is folded into a single
    /// string, and all three lines are consumed from the input.
    #[test]
    fn parse_get_tag_value() {
        let mut input = "3 ADDR 1300 West Traverse Parkway\n4 CONT Lehi, UT 84043\n4 CONC USA";
        let expected = "1300 West Traverse Parkway\nLehi, UT 84043 USA";

        let value = get_tag_value(&mut input).unwrap();

        // Compare the whole `Option` so that a `None` result fails the test
        // instead of silently skipping the check.
        assert_eq!(value.as_deref(), Some(expected));
        assert!(input.is_empty());
    }
}
301}