precis_tools/
csv_parser.rs

1use crate::Error;
2use lazy_static::lazy_static;
3use regex::Regex;
4use std::fs::File;
5use std::io::{self, BufRead};
6use std::marker::PhantomData;
7use std::path::{Path, PathBuf};
8use std::str::FromStr;
9use ucd_parse::CodepointRange;
10
/// Line-by-line reader for a `CSV` file that produces one parsed record per
/// data line.
///
/// A parser over a file on disk is obtained with `CsvLineParser::from_path`;
/// the resulting value is then consumed as an iterator.
///
/// Type parameters:
///
/// * `R` - the underlying `io::Read` source the `CSV` text is pulled from.
/// * `D` - the record type each line is converted into.
#[derive(Debug)]
pub struct CsvLineParser<R, D> {
    /// Location of the data being read, if known. It is copied into the
    /// errors produced while opening or reading the file.
    path: Option<PathBuf>,
    /// Buffered wrapper around the caller supplied reader.
    rdr: io::BufReader<R>,
    /// Scratch buffer holding the line currently being processed; it is
    /// cleared and refilled on every read.
    line: String,
    /// Count of lines read so far (the first line of the input is line 1).
    line_number: u64,
    /// Ties the record type `D` to the parser without storing a value.
    _data: PhantomData<D>,
}
29
30impl<D> CsvLineParser<File, D> {
31    /// Create a new parser from the given file path.
32    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<CsvLineParser<File, D>, Error> {
33        let path = path.as_ref();
34        let file = File::open(path).map_err(|e| Error {
35            mesg: format!("IO Error: {}", e),
36            line: None,
37            path: Some(path.to_path_buf()),
38        })?;
39        Ok(CsvLineParser::new(Some(path.to_path_buf()), file))
40    }
41}
42
43impl<R: io::Read, D> CsvLineParser<R, D> {
44    /// Create a new parser that parses the reader given.
45    ///
46    /// The type of data parsed is determined when the `parse_next` function
47    /// is called by virtue of the type requested.
48    ///
49    /// Note that the reader is buffered internally, so the caller does not
50    /// need to provide their own buffering.
51    pub(crate) fn new(path: Option<PathBuf>, rdr: R) -> CsvLineParser<R, D> {
52        CsvLineParser {
53            path,
54            rdr: io::BufReader::new(rdr),
55            line: String::new(),
56            line_number: 0,
57            _data: PhantomData,
58        }
59    }
60}
61
62impl<R: io::Read, D: FromStr<Err = Error>> Iterator for CsvLineParser<R, D> {
63    type Item = Result<D, Error>;
64
65    fn next(&mut self) -> Option<Result<D, Error>> {
66        loop {
67            self.line_number += 1;
68            self.line.clear();
69            let n = match self.rdr.read_line(&mut self.line) {
70                Err(err) => {
71                    return Some(Err(Error {
72                        mesg: format!("IO Error: {}", err),
73                        line: None,
74                        path: self.path.clone(),
75                    }))
76                }
77                Ok(n) => n,
78            };
79            if n == 0 {
80                return None;
81            }
82            // First line in the CVS contains the column names. Skip
83            if self.line_number > 1 {
84                break;
85            }
86        }
87        let line_number = self.line_number;
88        Some(self.line.parse().map_err(|mut err: Error| {
89            err.line = Some(line_number);
90            err
91        }))
92    }
93}
94
/// Derived property value assigned to a Unicode code point, as found in
/// the `CSV` maintained in the `IANA` registry.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DerivedProperty {
    /// Code points that may be used in any PRECIS string class
    /// (spelled `PVALID` in the `CSV`).
    PValid,
    /// Code points that may be used in the `FreeformClass`
    /// (spelled `FREE_PVAL` in the `CSV`).
    /// In practice, the derived property `ID_PVAL` is not used in this
    /// specification, because every `ID_PVAL` code point is `PVALID`.
    FreePVal,
    /// A contextual rule is required; applies to `Join_controls` Unicode
    /// code points (spelled `CONTEXTJ` in the `CSV`).
    ContextJ,
    /// A contextual rule is required; applies to Others Unicode code
    /// points (spelled `CONTEXTO` in the `CSV`).
    ContextO,
    /// Code points that are not permitted in any PRECIS string class
    /// (spelled `DISALLOWED` in the `CSV`).
    Disallowed,
    /// Code points that may not be used in the `IdentifierClass`
    /// (spelled `ID_DIS` in the `CSV`).
    /// In practice, the derived property `FREE_DIS` is not used in this
    /// specification, because every `FREE_DIS` code point is `DISALLOWED`.
    IdDis,
    /// Code points that are not designated in the Unicode Standard
    /// (spelled `UNASSIGNED` in the `CSV`).
    Unassigned,
}
119
120impl FromStr for DerivedProperty {
121    type Err = Error;
122
123    fn from_str(word: &str) -> Result<DerivedProperty, Error> {
124        if word.eq("PVALID") {
125            Ok(DerivedProperty::PValid)
126        } else if word.eq("FREE_PVAL") {
127            Ok(DerivedProperty::FreePVal)
128        } else if word.eq("CONTEXTJ") {
129            Ok(DerivedProperty::ContextJ)
130        } else if word.eq("CONTEXTO") {
131            Ok(DerivedProperty::ContextO)
132        } else if word.eq("DISALLOWED") {
133            Ok(DerivedProperty::Disallowed)
134        } else if word.eq("ID_DIS") {
135            Ok(DerivedProperty::IdDis)
136        } else if word.eq("UNASSIGNED") {
137            Ok(DerivedProperty::Unassigned)
138        } else {
139            Err(Error {
140                mesg: format!("Invalid derived property: {}", word),
141                line: None,
142                path: None,
143            })
144        }
145    }
146}
147
148fn parse_codepoint_range(s: &str) -> Result<ucd_parse::CodepointRange, Error> {
149    lazy_static! {
150        static ref PARTS: Regex = Regex::new(r"^(?P<start>[A-Z0-9]+)-(?P<end>[A-Z0-9]+)$").unwrap();
151    }
152    let caps = match PARTS.captures(s) {
153        Some(caps) => caps,
154        None => return err!("invalid codepoint range: '{}'", s),
155    };
156
157    let start = caps["start"].parse()?;
158    let end = caps["end"].parse()?;
159
160    Ok(CodepointRange { start, end })
161}
162
163fn parse_codepoints(s: &str) -> Result<ucd_parse::Codepoints, Error> {
164    if s.contains('-') {
165        let range = parse_codepoint_range(s)?;
166        Ok(ucd_parse::Codepoints::Range(range))
167    } else {
168        let cp = s.parse()?;
169        Ok(ucd_parse::Codepoints::Single(cp))
170    }
171}
172
173fn parse_derived_property_tuple(s: &str) -> Result<(DerivedProperty, DerivedProperty), Error> {
174    lazy_static! {
175        static ref PARTS: Regex = Regex::new(r"^(?P<p1>[A-Z_]+)\s+or\s+(?P<p2>[A-Z_]+)$").unwrap();
176    }
177
178    let caps = match PARTS.captures(s) {
179        Some(caps) => caps,
180        None => return err!("invalid properties: '{}'", s),
181    };
182    let p1 = caps["p1"].parse()?;
183    let p2 = caps["p2"].parse()?;
184
185    Ok((p1, p2))
186}
187
188fn parse_derived_properties(s: &str) -> Result<DerivedProperties, Error> {
189    if s.contains(" or ") {
190        let (p1, p2) = parse_derived_property_tuple(s)?;
191        Ok(DerivedProperties::Tuple((p1, p2)))
192    } else {
193        let p = s.parse()?;
194        Ok(DerivedProperties::Single(p))
195    }
196}
197
198fn parse_precis_table_line(
199    line: &str,
200) -> Result<(ucd_parse::Codepoints, DerivedProperties, &str), Error> {
201    let v: Vec<&str> = line.splitn(3, ',').collect();
202    if v.len() != 3 {
203        return Err(Error {
204            mesg: "Error parsing line".to_string(),
205            line: None,
206            path: None,
207        });
208    }
209
210    let cps = parse_codepoints(v[0])?;
211    let props = parse_derived_properties(v[1])?;
212    let desc = v[2];
213
214    Ok((cps, props, desc))
215}
216
217/// Second column in the `precis-tables.csv` file.
218/// Values could be made up of a single derived property
219/// value, or two combined with the `or` word
220#[derive(Clone, Copy, Debug, Eq, PartialEq)]
221pub enum DerivedProperties {
222    /// Column with a single derived property value
223    Single(DerivedProperty),
224    /// Column with two derived property value
225    Tuple((DerivedProperty, DerivedProperty)),
226}
227
228impl FromStr for DerivedProperties {
229    type Err = Error;
230
231    fn from_str(s: &str) -> Result<DerivedProperties, Error> {
232        parse_derived_properties(s)
233    }
234}
235
236/// A single row in the `precis-tables.csv` file.
237#[derive(Clone, Debug, Eq, PartialEq)]
238pub struct PrecisDerivedProperty {
239    /// The code point or code point range for this entry.
240    pub codepoints: ucd_parse::Codepoints,
241    /// The derived properties assigned to the code points in this entry.
242    pub properties: DerivedProperties,
243    /// The property description
244    pub description: String,
245}
246
247impl FromStr for PrecisDerivedProperty {
248    type Err = Error;
249
250    fn from_str(line: &str) -> Result<PrecisDerivedProperty, Error> {
251        let (codepoints, properties, desc) = parse_precis_table_line(line)?;
252        Ok(PrecisDerivedProperty {
253            codepoints,
254            properties,
255            description: desc.to_string(),
256        })
257    }
258}
259
#[cfg(test)]
mod tests {
    use crate::csv_parser::*;

    /// Expected value for a column holding the single code point `cp`.
    fn single(cp: u32) -> ucd_parse::Codepoints {
        ucd_parse::Codepoints::Single(ucd_parse::Codepoint::from_u32(cp).unwrap())
    }

    /// Expected value for a column holding the range `first-last`.
    fn range(first: u32, last: u32) -> ucd_parse::Codepoints {
        ucd_parse::Codepoints::Range(ucd_parse::CodepointRange {
            start: ucd_parse::Codepoint::from_u32(first).unwrap(),
            end: ucd_parse::Codepoint::from_u32(last).unwrap(),
        })
    }

    #[test]
    fn derived_property_from_str() {
        // Every spelling used by the registry maps to its own variant.
        let known = [
            ("PVALID", DerivedProperty::PValid),
            ("FREE_PVAL", DerivedProperty::FreePVal),
            ("CONTEXTJ", DerivedProperty::ContextJ),
            ("CONTEXTO", DerivedProperty::ContextO),
            ("DISALLOWED", DerivedProperty::Disallowed),
            ("ID_DIS", DerivedProperty::IdDis),
            ("UNASSIGNED", DerivedProperty::Unassigned),
        ];
        for (word, expected) in known.iter() {
            let parsed = DerivedProperty::from_str(word);
            assert!(parsed.is_ok());
            assert_eq!(parsed.unwrap(), *expected);
        }

        // Unknown names are rejected.
        assert!(DerivedProperty::from_str("ASDFR").is_err());
    }

    #[test]
    fn derived_properties_from_str() {
        let both = DerivedProperties::Tuple((DerivedProperty::IdDis, DerivedProperty::FreePVal));
        let accepted = [
            (
                "UNASSIGNED",
                DerivedProperties::Single(DerivedProperty::Unassigned),
            ),
            ("ID_DIS or FREE_PVAL", both),
            // Extra blanks around `or` are tolerated.
            ("ID_DIS   or   FREE_PVAL", both),
        ];
        for (text, expected) in accepted.iter() {
            let res = DerivedProperties::from_str(text);
            assert!(res.is_ok());
            assert_eq!(*expected, res.unwrap());
        }

        let rejected = ["ID_DIS or INVALID", "  or ", "", "INVALID"];
        for text in rejected.iter() {
            assert!(DerivedProperties::from_str(text).is_err());
        }
    }

    #[test]
    fn codepoints_parse() {
        let accepted = [("0141-0148", range(0x0141, 0x148)), ("0141", single(0x0141))];
        for (text, expected) in accepted.iter() {
            let res = parse_codepoints(text);
            assert!(res.is_ok());
            assert_eq!(*expected, res.unwrap());
        }

        let rejected = [
            "ghy0141",
            "",
            "-0148",
            "0148-",
            "124-0148-2345",
            "123454325460148",
        ];
        for text in rejected.iter() {
            assert!(parse_codepoints(text).is_err());
        }
    }

    #[test]
    fn precis_derived_property_from_str() {
        let accepted = [
            "0020,ID_DIS or FREE_PVAL,SPACE",
            "0000-001F,DISALLOWED,NULL..INFORMATION SEPARATOR ONE",
        ];
        for row in accepted.iter() {
            assert!(PrecisDerivedProperty::from_str(row).is_ok());
        }

        let rejected = [
            ",ID_DIS or FREE_PVAL,SPACE",
            "0020,,SPACE",
            ",,SPACE",
            "",
        ];
        for row in rejected.iter() {
            assert!(PrecisDerivedProperty::from_str(row).is_err());
        }
    }
}
404}