maddi_xml/
lib.rs

1// SPDX-FileCopyrightText: 2025 Madeline Baggins <declanbaggins@gmail.com>
2//
3// SPDX-License-Identifier: MIT
4
5use std::{
6    collections::HashMap,
7    num::{IntErrorKind, ParseIntError},
8    path::{Path, PathBuf},
9    str::FromStr,
10};
11
/// A cursor over XML source text that keeps track of where it is.
///
/// Cloning a `Parser` copies the cursor, which lets a caller try a parse on
/// the clone and either assign it back or throw it away.
#[derive(Clone)]
pub struct Parser<'a> {
    /// The part of the source that has not been consumed yet.
    tail: &'a str,
    /// The line and character offset reached so far in the source.
    pub position: Position<'a>,
}
17
18#[derive(Debug, Clone)]
19pub struct Position<'a> {
20    pub path: &'a Path,
21    pub src: &'a str,
22    pub line: usize,
23    pub char: usize,
24}
25
26impl<'a> Position<'a> {
27    pub fn error(&self, message: String) -> Error<'a> {
28        Error {
29            message,
30            position: self.clone(),
31        }
32    }
33}
34
35pub type Result<'a, T> = std::result::Result<T, Error<'a>>;
36
37#[derive(Debug)]
38pub struct Error<'a> {
39    pub message: String,
40    pub position: Position<'a>,
41}
42
43impl std::fmt::Display for Error<'_> {
44    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
45        const RED: &str = "\x1b[1;31m";
46        const DEFAULT: &str = "\x1b[1;39m";
47        writeln!(
48            f,
49            "{RED}Error in '{}':{DEFAULT}",
50            self.position.path.display()
51        )?;
52        for (line_num, line) in self.position.src.split('\n').enumerate() {
53            writeln!(f, "{line}")?;
54            if line_num == self.position.line {
55                let offset = std::iter::repeat_n(' ', self.position.char).collect::<String>();
56                writeln!(f, "{offset}^")?;
57                let offset_len = self.position.char.saturating_sub(self.message.len());
58                let offset = std::iter::repeat_n(' ', offset_len).collect::<String>();
59                writeln!(f, "{offset}{RED}{}{DEFAULT}", self.message)?;
60            }
61        }
62        Ok(())
63    }
64}
65
66impl<'a> Parser<'a> {
67    pub fn new(path: &'a Path, src: &'a str) -> Self {
68        Self {
69            tail: src,
70            position: Position {
71                src,
72                path,
73                line: 0,
74                char: 0,
75            },
76        }
77    }
78    pub fn parse<T: Parse<'a>>(&mut self) -> T {
79        T::parse(self)
80    }
81    fn take_whitespace(&mut self) {
82        let len = self
83            .tail
84            .find(|c: char| !c.is_whitespace())
85            .unwrap_or(self.tail.len());
86        self.take(len);
87    }
88    fn take_char(&mut self) -> Option<char> {
89        let char = self.tail.chars().next()?;
90        match char {
91            '\n' => {
92                self.position.line += 1;
93                self.position.char = 0;
94            }
95            _ => self.position.char += 1,
96        }
97        (_, self.tail) = self.tail.split_at(char.len_utf8());
98        Some(char)
99    }
100    fn take(&mut self, n: usize) -> &'a str {
101        let head;
102        (head, self.tail) = self.tail.split_at(n);
103        for c in head.chars() {
104            match c {
105                '\n' => {
106                    self.position.line += 1;
107                    self.position.char = 0;
108                }
109                _ => self.position.char += 1,
110            }
111        }
112        head
113    }
114}
115
/// Types that can be read from the front of a [`Parser`].
///
/// The implementations in this file target wrapper types such as `Option<T>`
/// and `Option<Result<T>>`, so "nothing of this kind here" and "malformed
/// input" are both expressed through the returned value.
pub trait Parse<'a> {
    /// Attempts to read `Self` from `parser`, which may be advanced past
    /// whatever input was consumed.
    fn parse(parser: &mut Parser<'a>) -> Self;
}
119
120#[derive(Debug)]
121pub enum Content<'a> {
122    Element(Element<'a>),
123    Text(String),
124}
125
126impl<'a> Parse<'a> for Option<Result<'a, Content<'a>>> {
127    fn parse(parser: &mut Parser<'a>) -> Self {
128        // Clear any whitespace
129        parser.take_whitespace();
130        // If the document has finished parsing
131        if parser.tail.is_empty() {
132            return None;
133        };
134        // Check if we start with an element
135        match parser.parse::<Option<Result<Element>>>() {
136            Some(Ok(element)) => return Some(Ok(Content::Element(element))),
137            Some(Err(err)) => return Some(Err(err)),
138            None => {}
139        }
140        // Otherwise, get the text
141        let len = parser.tail.find('<').unwrap_or(parser.tail.len());
142        let text = parser.take(len);
143        Some(Ok(Content::Text(text.into())))
144    }
145}
146
147#[derive(Debug)]
148pub struct Element<'a> {
149    pub name: &'a str,
150    pub attributes: HashMap<&'a str, Attribute<'a>>,
151    pub contents: Vec<Content<'a>>,
152    pub position: Position<'a>,
153}
154
155impl<'a> Element<'a> {
156    pub fn attribute<'b, T: Query<'a, 'b>>(&'b self, name: &str) -> Result<'a, T> {
157        T::get(name, self)
158    }
159
160    pub fn children<'b, 'c, T: FromElement<'a, 'b>>(
161        &'b self,
162        name: &'c str,
163    ) -> impl Iterator<Item = Result<'a, T>> + use<'a, 'b, 'c, T> {
164        use Content;
165        self.contents
166            .iter()
167            .filter_map(move |item| match item {
168                Content::Element(e) if e.name == name => Some(e),
169                _ => None,
170            })
171            .map(|t| T::from_element(t))
172    }
173    pub fn child<'b, 'c, T: FromElement<'a, 'b>>(&'b self, name: &'c str) -> Result<'a, T> {
174        use Content;
175        let mut candidates = self.contents.iter().filter_map(move |item| match item {
176            Content::Element(e) if e.name == name => Some(e),
177            _ => None,
178        });
179        let Some(result) = candidates.next() else {
180            return Err(self.position.error(format!("expected '{name}' element")));
181        };
182        if let Some(duplicate) = candidates.next() {
183            return Err(duplicate
184                .position
185                .error(format!("duplicate '{name}' element")));
186        }
187        T::from_element(result)
188    }
189    pub fn optional_child<'b, 'c, T: FromElement<'a, 'b>>(
190        &'b self,
191        name: &'c str,
192    ) -> Result<'a, Option<T>> {
193        use Content;
194        let mut candidates = self.contents.iter().filter_map(move |item| match item {
195            Content::Element(e) if e.name == name => Some(e),
196            _ => None,
197        });
198        let Some(result) = candidates.next() else {
199            return Ok(None);
200        };
201        if let Some(duplicate) = candidates.next() {
202            return Err(duplicate
203                .position
204                .error(format!("duplicate '{name}' element")));
205        }
206        Some(T::from_element(result)).transpose()
207    }
208}
209
210impl<'a> Parse<'a> for Option<Result<'a, Element<'a>>> {
211    fn parse(parser: &mut Parser<'a>) -> Self {
212        // Find the opening tag if there is one
213        let open_tag = match parser.parse::<Option<Result<OpenTag>>>()? {
214            Ok(open_tag) => open_tag,
215            Err(err) => return Some(Err(err)),
216        };
217        // If the tag was self closing, return the entity
218        let mut contents = vec![];
219        if open_tag.self_closing {
220            return Some(Ok(Element {
221                name: open_tag.name,
222                position: open_tag.position,
223                attributes: open_tag.attributes,
224                contents,
225            }));
226        }
227        // Parse all the content
228        let close_tag = loop {
229            // Remove any whitespace
230            parser.take_whitespace();
231            // Check if there's a closing tag
232            if let Some(close_tag) = parser.parse::<Option<Result<CloseTag>>>() {
233                break close_tag;
234            }
235            // Otherwise, try to get content
236            match parser.parse::<Option<Result<Content>>>() {
237                Some(Err(err)) => return Some(Err(err)),
238                Some(Ok(content)) => contents.push(content),
239                None => return Some(Err(parser.position.error("missing closing tag".into()))),
240            }
241        };
242        // Ensure we didn't error getting the close tag
243        let close_tag = match close_tag {
244            Ok(close_tag) => close_tag,
245            Err(err) => return Some(Err(err)),
246        };
247        // Ensure the close and open tags match
248        if open_tag.name != close_tag.name {
249            return Some(Err(parser.position.error("mismatched closing tag".into())));
250        }
251        Some(Ok(Element {
252            name: open_tag.name,
253            attributes: open_tag.attributes,
254            contents,
255            position: open_tag.position,
256        }))
257    }
258}
259
260/// The name of an element.
261/// - Must start with a letter or underscore.
262/// - Cannot start with the letters "xml" in any case.
263/// - Consists only of letters, digits, hyphens,
264///   underscores, and periods.
265struct Name<'a>(&'a str);
266
267impl<'a> Parse<'a> for Option<Name<'a>> {
268    fn parse(parser: &mut Parser<'a>) -> Self {
269        // Ensure tail starts with a letter or underscore
270        if !parser
271            .tail
272            .starts_with(|c: char| c.is_alphabetic() || c == '_')
273        {
274            return None;
275        }
276        // Ensure tail doesn't start with 'xml' in any case
277        if parser
278            .tail
279            .get(0..3)
280            .is_some_and(|f| f.to_lowercase() == "xml")
281        {
282            return None;
283        }
284        // Find the head of the tail that only consists of
285        // digits, hyphens, underscores, and periods.
286        let len = parser
287            .tail
288            .find(|c: char| !c.is_ascii_alphanumeric() && !['.', '_', '-'].contains(&c))
289            .unwrap_or(parser.tail.len());
290        let name = parser.tail.get(..len).unwrap();
291        (!name.is_empty()).then_some(Name(parser.take(len)))
292    }
293}
294
295struct OpenTag<'a> {
296    name: &'a str,
297    attributes: HashMap<&'a str, Attribute<'a>>,
298    self_closing: bool,
299    position: Position<'a>,
300}
301
302impl<'a> Parse<'a> for Option<Result<'a, OpenTag<'a>>> {
303    fn parse(parser: &mut Parser<'a>) -> Self {
304        // Ensure we're parsing an open tag
305        if !parser.tail.starts_with('<') {
306            return None;
307        }
308        // Skip over the opening chevron
309        parser.take(1);
310        // Get the element's name
311        let Some(Name(name)) = parser.parse::<Option<Name>>() else {
312            return Some(Err(parser.position.error("expected element name".into())));
313        };
314        // Skip any whitespace
315        parser.take_whitespace();
316        // Parse any attributes
317        let mut attributes = HashMap::new();
318        while let Some(attribute) = parser.parse::<Option<Result<Attribute>>>() {
319            match attribute {
320                Ok(attribute) => {
321                    if let Some(old) = attributes.insert(attribute.name, attribute) {
322                        let duplicate = attributes.get(old.name).unwrap();
323                        return Some(Err(duplicate
324                            .position
325                            .error(format!("found duplicate '{}' attribute", old.name))));
326                    }
327                }
328                Err(e) => return Some(Err(e)),
329            }
330            parser.take_whitespace();
331        }
332        // Ensure the opening tag ends with '/>' or '>'.
333        let self_closing = parser.tail.starts_with("/>");
334        if !self_closing && !parser.tail.starts_with(">") {
335            return Some(Err(parser.position.error("expected '>' or '/>'".into())));
336        }
337        // Skip the ending bit
338        if self_closing {
339            parser.take("/>".len());
340        } else {
341            parser.take(">".len());
342        }
343        // Build the opening tag
344        Some(Ok(OpenTag {
345            name,
346            attributes,
347            self_closing,
348            position: parser.position.clone(),
349        }))
350    }
351}
352
353#[derive(Debug)]
354pub struct Attribute<'a> {
355    pub name: &'a str,
356    pub value: Option<String>,
357    pub position: Position<'a>,
358}
359
360impl<'a> Parse<'a> for Option<Result<'a, Attribute<'a>>> {
361    fn parse(parser: &mut Parser<'a>) -> Self {
362        // Clone the parser in case we need to restore it
363        let backup = parser.clone();
364        // Get the name of the attribute
365        let Some(Name(name)) = parser.parse::<Option<Name>>() else {
366            *parser = backup;
367            return None;
368        };
369        // If there's no value to the attribute, finish
370        // parsing.
371        if !parser.tail.starts_with('=') {
372            return Some(Ok(Attribute {
373                name,
374                value: None,
375                position: parser.position.clone(),
376            }));
377        }
378        // Skip the '='
379        parser.take(1);
380        // Parse the value of the attribute
381        let Some(AttributeValue(value)) = parser.parse::<Option<AttributeValue>>() else {
382            return Some(Err(parser
383                .position
384                .error("expected attribute value".into())));
385        };
386        Some(Ok(Attribute {
387            name,
388            value: Some(value),
389            position: parser.position.clone(),
390        }))
391    }
392}
393
394struct AttributeValue(String);
395
396impl Parse<'_> for Option<AttributeValue> {
397    fn parse(parser: &mut Parser) -> Self {
398        // Ensure the parser starts with a single or double
399        // quote.
400        let quote = match parser.tail.chars().next()? {
401            c @ ('"' | '\'') => c,
402            _ => return None,
403        };
404        // Create a working copy of the parser
405        let mut working = parser.clone();
406        working.take(1);
407        // Build out the string
408        // TODO: Add support for character entities
409        let mut value = String::new();
410        loop {
411            let next = working.take_char()?;
412            match next {
413                '\\' => match working.take_char()? {
414                    c @ ('\\' | '\'' | '"') => value.push(c),
415                    _ => return None,
416                },
417                c if c == quote => break,
418                c => value.push(c),
419            }
420        }
421        // Save the working copy of the parser
422        *parser = working;
423        Some(AttributeValue(value))
424    }
425}
426
427struct CloseTag<'a> {
428    name: &'a str,
429}
430
431impl<'a> Parse<'a> for Option<Result<'a, CloseTag<'a>>> {
432    fn parse(parser: &mut Parser<'a>) -> Self {
433        // Ensure we're at the start of a closing tag
434        if !parser.tail.starts_with("</") {
435            return None;
436        }
437        parser.take("</".len());
438        // Get the name of the closing tag
439        let Some(Name(name)) = parser.parse::<Option<Name>>() else {
440            return Some(Err(parser.position.error("expected element name".into())));
441        };
442        // Ensure we end with a '>'.
443        if !parser.tail.starts_with('>') {
444            return Some(Err(parser.position.error("expected '>'".into())));
445        }
446        // Skip the '>'.
447        parser.take(">".len());
448        Some(Ok(CloseTag { name }))
449    }
450}
451
452pub trait FromValue<'a, 'b>: Sized {
453    fn from_value(value: &'b str, position: &'b Position<'a>) -> Result<'a, Self>;
454}
455
456impl<'a, 'b> FromValue<'a, 'b> for &'b str {
457    fn from_value(value: &'b str, _position: &'b Position<'a>) -> Result<'a, Self> {
458        Ok(value)
459    }
460}
461
462impl<'a, 'b> FromValue<'a, 'b> for &'b Path {
463    fn from_value(value: &'b str, _position: &'b Position<'a>) -> Result<'a, Self> {
464        Ok(value.as_ref())
465    }
466}
467
468impl<'a, 'b> FromValue<'a, 'b> for String {
469    fn from_value(value: &'b str, _position: &'b Position<'a>) -> Result<'a, Self> {
470        Ok(value.into())
471    }
472}
473
474impl<'a, 'b> FromValue<'a, 'b> for PathBuf {
475    fn from_value(value: &'b str, _position: &'b Position<'a>) -> Result<'a, Self> {
476        Ok(PathBuf::from(value))
477    }
478}
479
/// Marker for the integer types that [`FromValue`] parses from text.
///
/// Implementors must parse with `FromStr` and fail with `ParseIntError`.
pub trait FromNumeric: FromStr<Err = ParseIntError> {}

/// Implements [`FromNumeric`] for each listed type.
macro_rules! impl_from_numeric {
    ($($int:ty),* $(,)?) => {
        $(impl FromNumeric for $int {})*
    };
}

impl_from_numeric!(u8, u16, u32, u64, u128, usize, i8, i16, i32, i64, i128, isize);
505
506impl<'a, 'b, T> FromValue<'a, 'b> for T
507where
508    T: FromNumeric,
509{
510    fn from_value(value: &'b str, position: &'b Position<'a>) -> Result<'a, Self> {
511        value.parse::<T>().map_err(|e| {
512            let msg = match e.kind() {
513                IntErrorKind::Empty => "failed to parse integer from empty string",
514                IntErrorKind::InvalidDigit => "value contains invalid digit",
515                IntErrorKind::PosOverflow => "value too large for this attribute",
516                IntErrorKind::NegOverflow => "value too small for this attribute",
517                IntErrorKind::Zero => "value cannot be zero for this attribute",
518                _ => "unknown integer parse error",
519            }
520            .into();
521            position.error(msg)
522        })
523    }
524}
525
526pub trait FromAttribute<'a, 'b>: Sized {
527    fn from_attribute(attribute: &'b Attribute<'a>) -> Result<'a, Self>;
528}
529
530impl<'a, 'b, T: FromValue<'a, 'b>> FromAttribute<'a, 'b> for T {
531    fn from_attribute(attribute: &'b Attribute<'a>) -> Result<'a, Self> {
532        let Some(value) = attribute.value.as_ref() else {
533            let name = attribute.name;
534            return Err(attribute
535                .position
536                .error(format!("expected non-empty value for '{name}'")));
537        };
538        T::from_value(value, &attribute.position)
539    }
540}
541
542pub trait Query<'a, 'b>: Sized {
543    fn get(name: &str, element: &'b Element<'a>) -> Result<'a, Self>;
544}
545
546impl<'a, 'b, T: FromAttribute<'a, 'b>> Query<'a, 'b> for T {
547    fn get(name: &str, element: &'b Element<'a>) -> Result<'a, Self> {
548        let Some(attribute) = element.attributes.get(name) else {
549            let msg = format!("expected '{name}' attribute");
550            return Err(element.position.error(msg));
551        };
552        T::from_attribute(attribute)
553    }
554}
555
556impl<'a, 'b, T: FromAttribute<'a, 'b>> Query<'a, 'b> for Option<T> {
557    fn get(name: &str, element: &'b Element<'a>) -> Result<'a, Self> {
558        element
559            .attributes
560            .get(name)
561            .map(|a| T::from_attribute(a))
562            .transpose()
563    }
564}
565
566impl<'a, 'b> Query<'a, 'b> for bool {
567    fn get(name: &str, element: &'b Element<'a>) -> Result<'a, Self> {
568        Ok(element.attributes.contains_key(name))
569    }
570}
571
572pub trait FromElement<'a, 'b>: Sized {
573    fn from_element(element: &'b Element<'a>) -> Result<'a, Self>;
574}
575
576impl<'a, 'b> FromElement<'a, 'b> for &'b Element<'a> {
577    fn from_element(element: &'b Element<'a>) -> Result<'a, Self> {
578        Ok(element)
579    }
580}
581
582impl<'a, 'b, T> FromElement<'a, 'b> for T
583where
584    T: FromValue<'a, 'b>,
585{
586    fn from_element(element: &'b Element<'a>) -> Result<'a, Self> {
587        match element.contents.as_slice() {
588            [Content::Text(value)] => T::from_value(value, &element.position),
589            _ => Err(element
590                .position
591                .error("expected element to contain a single value".into())),
592        }
593    }
594}