gedcom_rs/
parse.rs

1// use crate::types::{Address, Line, Source};
2// use super::types::Line;
3use super::types::*;
4
5use std::fs::File;
6
7use std::io::{self, BufRead};
8use std::path::Path;
9
10use winnow::prelude::*;
11
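// The disabled helper below is pretty much a kludge to strip out U+FEFF, a
// Zero Width No-Break Space: https://www.compart.com/en/unicode/U+FEFF
//
// So far, I've only seen this in one GEDCOM file, as the very first character.
// (Kept as plain `//` comments so rustdoc does not attach this text to the
// next live item.)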
16// pub fn zero_with_no_break_space(input: &mut &str) -> PResult<&str> {
17//     if input.starts_with('\u{FEFF}') {
18//         let parser = tag("\u{FEFF}");
19
20//         parser.parse_next(input)
21//     } else {
22//         Ok("")
23//     }
24// }
25
// What did I mean to do with this? gg
// I think it takes the input and returns a tuple containing the tag and its
// optional value? I lost the thread, though, and need to retrace my steps.
29// fn get_tag_value(input: &str) -> IResult<&str, (&str, &str)> {
30
31//     Ok((input, ("", "")))
32// }
33
34/// Read the next tag's value and any continuations
35pub fn get_tag_value(input: &mut &str) -> PResult<Option<String>> {
36    let mut line = Line::parse(input).unwrap();
37
38    // Seed the value with the initial value
39    let mut text: String = line.value.to_string();
40
41    line = Line::peek(input).unwrap();
42    while line.tag == "CONC" || line.tag == "CONT" {
43        // consume
44        line = Line::parse(input).unwrap();
45
46        if line.tag == "CONT" {
47            text += "\n";
48        } else {
49            text += " ";
50        }
51        text += line.value;
52
53        // peek ahead
54        line = Line::peek(input).unwrap();
55    }
56
57    Ok(Some(text))
58}
59
// Disabled helper: parse the buffer if the CONC tag is found and return the resulting string.
61// pub fn conc(input: &mut &str) -> PResult<Option<String>> {
62//     let line = Line::parse(input).unwrap();
63
64//     if line.tag == "CONC" {
65//         Ok(Some(line.value.to_string()))
66//     } else {
67//         Ok(None)
68//     }
69// }
70
// Disabled helper: parse the buffer if the CONT tag is found and return the resulting string.
// TODO: Refactor this. It should handle both CONT and CONC.
73// pub fn cont(input: &mut &str) -> PResult<Option<String>> {
74//     let line = Line::parse(input).unwrap();
75
76//     if line.tag == "CONT" {
77//         Ok(Some(line.value.to_string()))
78//     } else {
79//         Ok(None)
80//     }
81// }
82
83/// Parse a GEDCOM file
84pub fn parse_gedcom(filename: &str) -> Gedcom {
85    // Initialize an empty gedcom
86    let mut gedcom = Gedcom {
87        header: Header {
88            encoding: None,
89            copyright: None,
90            date: None,
91            destination: None,
92            gedcom_version: None,
93            language: None,
94            filename: None,
95            note: None,
96            source: None,
97            submitter: None,
98            submission: None,
99        },
100        individuals: vec![],
101    };
102
103    if let Ok(lines) = read_lines(filename) {
104        // Consumes the iterator, returns an (Optional) String
105
106        // Read through the lines and build a buffer of <records>, each starting
107        // with a zero and ending with the last line before the next. Then feed that
108        // buffer to a nom parser to split it into Lines?
109
110        // This is kind of like a buffered read, specific to the GEDCOM format
111        // We read into the buffer until we hit a new record, and then parse that
112        // record into a struct.
113        let mut record: String = String::new();
114
115        for mut buffer in lines.flatten() {
116            // Strip off any leading Zero Width No-Break Space
117            if buffer.strip_prefix('\u{FEFF}').is_some() {
118                buffer.remove(0);
119            }
120            // println!("Buffer: \n'{}'", buffer);
121            // record = buffer.clone() + "\n";
122
123            if let Some(ch) = buffer.chars().next() {
124                if ch == '0' && !record.is_empty() {
125                    let mut input: &str = record.as_str();
126
127                    // Peek at the first line in the record so we know how
128                    // to parse it.
129                    let line = Line::peek(&mut input).unwrap();
130                    // println!("Got a line: {:?}", line);
131                    match line.tag {
132                        "HEAD" => {
133                            // println!("Parsing HEAD: \n{}", input);
134                            gedcom.header = Header::parse(input.to_string());
135                        }
136                        "INDI" => {
137                            // let indi = Individual::parse(buff.to_string());
138                            // // TODO: Remove the if. This is just to clean up the output for debugging.
139                            // if indi.xref.clone().unwrap() == "@I1@" {
140                            //     gedcom.individuals.push(indi);
141                            // }
142                        }
143                        "SOUR" => {}
144                        "REPO" => {}
145                        "OBJE" => {
146                            // let obj = Object::parse(buff);
147                            // println!("{:?}", obj);
148                        }
149                        "FAM" => {}
150                        "SUBM" => {
151                            // // The record of the submitter of the family tree
152                            // // Not always present (it exists in complete.ged)
153                            // if let Some(ref subm) = gedcom.header.submitter {
154                            //     if let Some(xref) = &subm.xref {
155                            //         gedcom.header.submitter =
156                            //             Submitter::find_by_xref(buff, xref.to_string());
157                            //     }
158                            // }
159                        }
160                        _ => {}
161                    };
162
163                    record.clear();
164                }
165                record = record + &buffer.clone() + "\n";
166                // println!("Record: {:?}", record);
167            }
168
169            // match Line::peek(&mut linebuff) {
170            //     Ok(line) => {
171            //         if line.level == 0 && line.tag == "HEAD" {
172            //             // Consume the line
173            //             Line::parse(&mut linebuff).unwrap();
174            //         } else if line.level == 1 {
175            //             // println!("Found an inner tag: {}", line.tag);
176            //             match line.tag {
177            //                 "CHAR" => {
178            //                     gedcom.header.encoding = Some(line.value.to_string());
179            //                     Line::parse(&mut linebuff).unwrap();
180            //                 }
181            //                 "INDI" => {
182            //                     let indi = Individual::parse(buff.to_string());
183            //                     // TODO: Remove the if. This is just to clean up the output for debugging.
184            //                     if indi.xref.clone().unwrap() == "@I1@" {
185            //                         gedcom.individuals.push(indi);
186            //                     }
187            //                 }
188            //                 "SOUR" => {}
189            //                 "REPO" => {}
190            //                 "OBJE" => {
191            //                     let obj = Object::parse(buff);
192            //                     println!("{:?}", obj);
193            //                 }
194            //                 "FAM" => {}
195            //                 "SUBM" => {
196            //                     // The record of the submitter of the family tree
197            //                     // Not always present (it exists in complete.ged)
198            //                     if let Some(ref subm) = gedcom.header.submitter {
199            //                         if let Some(xref) = &subm.xref {
200            //                             gedcom.header.submitter =
201            //                                 Submitter::find_by_xref(buff, xref.to_string());
202            //                         }
203            //                     }
204            //                 }
205            //                 _ => {
206            //                     // println!("Unhandled header tag: {}", line.tag);
207            //                     // (_, _) = Line::parse(&buffer).unwrap();
208            //                 }
209            //             };
210            //         // } else {
211            //         //     (_, _) = Line::parse(&buffer).unwrap();
212            //         }
213
214            //         // println!("line: {:?}", line);
215
216            //     }
217            //     Err(_e) => {
218            //         println!("Error parsing line: '{}'", buffer);
219            //     }
220            // }
221
222            // if let Some(ch) = buffer.chars().next() {
223            //     if ch == '0' && !record.is_empty() {
224            //         // We found a new record, beginning with buffer, so
225            //         // process the data in `record` before continuing
226
227            //         // Peek at the next line to see where we're at.
228            //         // let (buff, line) = Line::peek(&record).unwrap();
229            //         let line = Line::peek(record).unwrap();
230
231            //         match line.tag {
232            //             "HEAD" => {
233            //                 gedcom.header = Header::parse(buff.to_string());
234            //             }
235            //             "INDI" => {
236            //                 let indi = Individual::parse(buff.to_string());
237            //                 // TODO: Remove the if. This is just to clean up the output for debugging.
238            //                 if indi.xref.clone().unwrap() == "@I1@" {
239            //                     gedcom.individuals.push(indi);
240            //                 }
241            //             }
242            //             "SOUR" => {}
243            //             "REPO" => {}
244            //             "OBJE" => {
245            //                 let obj = Object::parse(buff);
246            //                 println!("{:?}", obj);
247            //             }
248            //             "FAM" => {}
249            //             "SUBM" => {
250            //                 // The record of the submitter of the family tree
251            //                 // Not always present (it exists in complete.ged)
252            //                 if let Some(ref subm) = gedcom.header.submitter {
253            //                     if let Some(xref) = &subm.xref {
254            //                         gedcom.header.submitter =
255            //                             Submitter::find_by_xref(buff, xref.to_string());
256            //                     }
257            //                 }
258            //             }
259            //             _ => {}
260            //         };
261
262            //         record.clear();
263            //     }
264            // }
265            // record = record + &buffer.clone() + "\n";
266        }
267        // TODO: families
268        // TODO: repositories
269        // TODO: sources
270        // TODO: multimedia
271    }
272    gedcom
273}
274
/// Open `filename` and return an iterator over its lines.
///
/// The outer `io::Result` reports a failure to open the file, so callers can
/// match on it. Each item produced by the returned iterator is itself an
/// `io::Result<String>`.
///
/// Adapted from
/// https://doc.rust-lang.org/rust-by-example/std_misc/file/read_lines.html
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
where
    P: AsRef<Path>,
{
    File::open(filename).map(|handle| io::BufReader::new(handle).lines())
}
285
#[cfg(test)]
mod tests {
    use super::*;

    /// A value followed by one `CONT` and one `CONC` line is joined with a
    /// newline and a space respectively, and all three lines are consumed.
    #[test]
    fn parse_get_tag_value() {
        let mut input = "3 ADDR 1300 West Traverse Parkway\n4 CONT Lehi, UT 84043\n4 CONC USA";
        let expected = "1300 West Traverse Parkway\nLehi, UT 84043 USA";

        let value = get_tag_value(&mut input).unwrap();

        // Compare against `Some(..)` so that a `None` result fails the test
        // instead of silently skipping the check.
        assert_eq!(value.as_deref(), Some(expected));
        assert!(input.is_empty());
    }
}