maddi_xml/
lib.rs

1// SPDX-FileCopyrightText: 2025 Madeline Baggins <declanbaggins@gmail.com>
2//
3// SPDX-License-Identifier: MIT
4
5use std::{
6    collections::HashMap,
7    num::{IntErrorKind, ParseIntError},
8    path::{Path, PathBuf},
9    str::FromStr,
10};
11
12#[derive(Clone)]
13pub struct Parser<'a> {
14    tail: &'a str,
15    pub position: Position<'a>,
16}
17
18#[derive(Debug, Clone)]
19pub struct Position<'a> {
20    pub path: &'a Path,
21    pub src: &'a str,
22    pub line: usize,
23    pub char: usize,
24}
25
26impl<'a> Position<'a> {
27    pub fn error(&self, message: String) -> Error<'a> {
28        Error {
29            message,
30            position: self.clone(),
31        }
32    }
33}
34
35pub type Result<'a, T> = std::result::Result<T, Error<'a>>;
36
37#[derive(Debug)]
38pub struct Error<'a> {
39    pub message: String,
40    pub position: Position<'a>,
41}
42
43impl std::fmt::Display for Error<'_> {
44    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
45        const RED: &str = "\x1b[1;31m";
46        const DEFAULT: &str = "\x1b[1;39m";
47        writeln!(
48            f,
49            "{RED}Error in '{}':{DEFAULT}",
50            self.position.path.display()
51        )?;
52        for (line_num, line) in self.position.src.split('\n').enumerate() {
53            writeln!(f, "{line}")?;
54            if line_num == self.position.line {
55                let offset = std::iter::repeat_n(' ', self.position.char).collect::<String>();
56                writeln!(f, "{offset}^")?;
57                let offset_len = self.position.char.saturating_sub(self.message.len());
58                let offset = std::iter::repeat_n(' ', offset_len).collect::<String>();
59                writeln!(f, "{offset}{RED}{}{DEFAULT}", self.message)?;
60            }
61        }
62        Ok(())
63    }
64}
65
66impl<'a> Parser<'a> {
67    pub fn new(path: &'a Path, src: &'a str) -> Self {
68        Self {
69            tail: src,
70            position: Position {
71                src,
72                path,
73                line: 0,
74                char: 0,
75            },
76        }
77    }
78    pub fn parse<T: Parse<'a>>(&mut self) -> T {
79        T::parse(self)
80    }
81    fn take_whitespace(&mut self) {
82        let len = self
83            .tail
84            .find(|c: char| !c.is_whitespace())
85            .unwrap_or(self.tail.len());
86        self.take(len);
87    }
88    fn take_char(&mut self) -> Option<char> {
89        let char = self.tail.chars().next()?;
90        match char {
91            '\n' => {
92                self.position.line += 1;
93                self.position.char = 0;
94            }
95            _ => self.position.char += 1,
96        }
97        (_, self.tail) = self.tail.split_at(char.len_utf8());
98        Some(char)
99    }
100    fn take(&mut self, n: usize) -> &'a str {
101        let head;
102        (head, self.tail) = self.tail.split_at(n);
103        for c in head.chars() {
104            match c {
105                '\n' => {
106                    self.position.line += 1;
107                    self.position.char = 0;
108                }
109                _ => self.position.char += 1,
110            }
111        }
112        head
113    }
114}
115
116pub trait Parse<'a> {
117    fn parse(parser: &mut Parser<'a>) -> Self;
118}
119
120#[derive(Debug)]
121pub enum Content<'a> {
122    Element(Element<'a>),
123    Text(String),
124}
125
126impl<'a> Parse<'a> for Option<Result<'a, Content<'a>>> {
127    fn parse(parser: &mut Parser<'a>) -> Self {
128        // Clear any whitespace
129        parser.take_whitespace();
130        // If the document has finished parsing
131        if parser.tail.is_empty() {
132            return None;
133        };
134        // Check if we start with an element
135        match parser.parse::<Option<Result<Element>>>() {
136            Some(Ok(element)) => return Some(Ok(Content::Element(element))),
137            Some(Err(err)) => return Some(Err(err)),
138            None => {}
139        }
140        // Otherwise, get the text
141        let len = parser.tail.find('<').unwrap_or(parser.tail.len());
142        let text = parser.take(len);
143        Some(Ok(Content::Text(text.into())))
144    }
145}
146
147#[derive(Debug)]
148pub struct Element<'a> {
149    pub name: &'a str,
150    pub attributes: HashMap<&'a str, Attribute<'a>>,
151    pub contents: Vec<Content<'a>>,
152    pub position: Position<'a>,
153}
154
155impl<'a> Element<'a> {
156    pub fn attribute<'b, T: Query<'a, 'b>>(&'b self, name: &str) -> Result<'a, T> {
157        T::get(name, self)
158    }
159
160    pub fn children<'b, 'c, T: FromElement<'a, 'b>>(
161        &'b self,
162        name: &'c str,
163    ) -> impl Iterator<Item = Result<'a, T>> + use<'a, 'b, 'c, T> {
164        use Content;
165        self.contents
166            .iter()
167            .filter_map(move |item| match item {
168                Content::Element(e) if e.name == name => Some(e),
169                _ => None,
170            })
171            .map(|t| T::from_element(t))
172    }
173}
174
175impl<'a> Parse<'a> for Option<Result<'a, Element<'a>>> {
176    fn parse(parser: &mut Parser<'a>) -> Self {
177        // Find the opening tag if there is one
178        let open_tag = match parser.parse::<Option<Result<OpenTag>>>()? {
179            Ok(open_tag) => open_tag,
180            Err(err) => return Some(Err(err)),
181        };
182        // If the tag was self closing, return the entity
183        let mut contents = vec![];
184        if open_tag.self_closing {
185            return Some(Ok(Element {
186                name: open_tag.name,
187                position: open_tag.position,
188                attributes: open_tag.attributes,
189                contents,
190            }));
191        }
192        // Parse all the content
193        let close_tag = loop {
194            // Remove any whitespace
195            parser.take_whitespace();
196            // Check if there's a closing tag
197            if let Some(close_tag) = parser.parse::<Option<Result<CloseTag>>>() {
198                break close_tag;
199            }
200            // Otherwise, try to get content
201            match parser.parse::<Option<Result<Content>>>() {
202                Some(Err(err)) => return Some(Err(err)),
203                Some(Ok(content)) => contents.push(content),
204                None => return Some(Err(parser.position.error("missing closing tag".into()))),
205            }
206        };
207        // Ensure we didn't error getting the close tag
208        let close_tag = match close_tag {
209            Ok(close_tag) => close_tag,
210            Err(err) => return Some(Err(err)),
211        };
212        // Ensure the close and open tags match
213        if open_tag.name != close_tag.name {
214            return Some(Err(parser.position.error("mismatched closing tag".into())));
215        }
216        Some(Ok(Element {
217            name: open_tag.name,
218            attributes: open_tag.attributes,
219            contents,
220            position: open_tag.position,
221        }))
222    }
223}
224
225/// The name of an element.
226/// - Must start with a letter or underscore.
227/// - Cannot start with the letters "xml" in any case.
228/// - Consists only of letters, digits, hyphens,
229///   underscores, and periods.
230struct Name<'a>(&'a str);
231
232impl<'a> Parse<'a> for Option<Name<'a>> {
233    fn parse(parser: &mut Parser<'a>) -> Self {
234        // Ensure tail starts with a letter or underscore
235        if !parser
236            .tail
237            .starts_with(|c: char| c.is_alphabetic() || c == '_')
238        {
239            return None;
240        }
241        // Ensure tail doesn't start with 'xml' in any case
242        if parser
243            .tail
244            .get(0..3)
245            .is_some_and(|f| f.to_lowercase() == "xml")
246        {
247            return None;
248        }
249        // Find the head of the tail that only consists of
250        // digits, hyphens, underscores, and periods.
251        let len = parser
252            .tail
253            .find(|c: char| !c.is_ascii_alphanumeric() && !['.', '_', '-'].contains(&c))
254            .unwrap_or(parser.tail.len());
255        let name = parser.tail.get(..len).unwrap();
256        (!name.is_empty()).then_some(Name(parser.take(len)))
257    }
258}
259
260struct OpenTag<'a> {
261    name: &'a str,
262    attributes: HashMap<&'a str, Attribute<'a>>,
263    self_closing: bool,
264    position: Position<'a>,
265}
266
267impl<'a> Parse<'a> for Option<Result<'a, OpenTag<'a>>> {
268    fn parse(parser: &mut Parser<'a>) -> Self {
269        // Ensure we're parsing an open tag
270        if !parser.tail.starts_with('<') {
271            return None;
272        }
273        // Skip over the opening chevron
274        parser.take(1);
275        // Get the element's name
276        let Some(Name(name)) = parser.parse::<Option<Name>>() else {
277            return Some(Err(parser.position.error("expected element name".into())));
278        };
279        // Skip any whitespace
280        parser.take_whitespace();
281        // Parse any attributes
282        let mut attributes = HashMap::new();
283        while let Some(attribute) = parser.parse::<Option<Result<Attribute>>>() {
284            match attribute {
285                Ok(attribute) => {
286                    if let Some(old) = attributes.insert(attribute.name, attribute) {
287                        let duplicate = attributes.get(old.name).unwrap();
288                        return Some(Err(duplicate
289                            .position
290                            .error(format!("found duplicate '{}' attribute", old.name))));
291                    }
292                }
293                Err(e) => return Some(Err(e)),
294            }
295            parser.take_whitespace();
296        }
297        // Ensure the opening tag ends with '/>' or '>'.
298        let self_closing = parser.tail.starts_with("/>");
299        if !self_closing && !parser.tail.starts_with(">") {
300            return Some(Err(parser.position.error("expected '>' or '/>'".into())));
301        }
302        // Skip the ending bit
303        if self_closing {
304            parser.take("/>".len());
305        } else {
306            parser.take(">".len());
307        }
308        // Build the opening tag
309        Some(Ok(OpenTag {
310            name,
311            attributes,
312            self_closing,
313            position: parser.position.clone(),
314        }))
315    }
316}
317
318#[derive(Debug)]
319pub struct Attribute<'a> {
320    pub name: &'a str,
321    pub value: Option<String>,
322    pub position: Position<'a>,
323}
324
325impl<'a> Parse<'a> for Option<Result<'a, Attribute<'a>>> {
326    fn parse(parser: &mut Parser<'a>) -> Self {
327        // Clone the parser in case we need to restore it
328        let backup = parser.clone();
329        // Get the name of the attribute
330        let Some(Name(name)) = parser.parse::<Option<Name>>() else {
331            *parser = backup;
332            return None;
333        };
334        // If there's no value to the attribute, finish
335        // parsing.
336        if !parser.tail.starts_with('=') {
337            return Some(Ok(Attribute {
338                name,
339                value: None,
340                position: parser.position.clone(),
341            }));
342        }
343        // Skip the '='
344        parser.take(1);
345        // Parse the value of the attribute
346        let Some(AttributeValue(value)) = parser.parse::<Option<AttributeValue>>() else {
347            return Some(Err(parser
348                .position
349                .error("expected attribute value".into())));
350        };
351        Some(Ok(Attribute {
352            name,
353            value: Some(value),
354            position: parser.position.clone(),
355        }))
356    }
357}
358
359struct AttributeValue(String);
360
361impl Parse<'_> for Option<AttributeValue> {
362    fn parse(parser: &mut Parser) -> Self {
363        // Ensure the parser starts with a single or double
364        // quote.
365        let quote = match parser.tail.chars().next()? {
366            c @ ('"' | '\'') => c,
367            _ => return None,
368        };
369        // Create a working copy of the parser
370        let mut working = parser.clone();
371        working.take(1);
372        // Build out the string
373        // TODO: Add support for character entities
374        let mut value = String::new();
375        loop {
376            let next = working.take_char()?;
377            match next {
378                '\\' => match working.take_char()? {
379                    c @ ('\\' | '\'' | '"') => value.push(c),
380                    _ => return None,
381                },
382                c if c == quote => break,
383                c => value.push(c),
384            }
385        }
386        // Save the working copy of the parser
387        *parser = working;
388        Some(AttributeValue(value))
389    }
390}
391
392struct CloseTag<'a> {
393    name: &'a str,
394}
395
396impl<'a> Parse<'a> for Option<Result<'a, CloseTag<'a>>> {
397    fn parse(parser: &mut Parser<'a>) -> Self {
398        // Ensure we're at the start of a closing tag
399        if !parser.tail.starts_with("</") {
400            return None;
401        }
402        parser.take("</".len());
403        // Get the name of the closing tag
404        let Some(Name(name)) = parser.parse::<Option<Name>>() else {
405            return Some(Err(parser.position.error("expected element name".into())));
406        };
407        // Ensure we end with a '>'.
408        if !parser.tail.starts_with('>') {
409            return Some(Err(parser.position.error("expected '>'".into())));
410        }
411        // Skip the '>'.
412        parser.take(">".len());
413        Some(Ok(CloseTag { name }))
414    }
415}
416
417pub trait FromValue<'a, 'b>: Sized {
418    fn from_value(value: &'b str, position: &'b Position<'a>) -> Result<'a, Self>;
419}
420
421impl<'a, 'b> FromValue<'a, 'b> for &'b str {
422    fn from_value(value: &'b str, _position: &'b Position<'a>) -> Result<'a, Self> {
423        Ok(value)
424    }
425}
426
427impl<'a, 'b> FromValue<'a, 'b> for &'b Path {
428    fn from_value(value: &'b str, _position: &'b Position<'a>) -> Result<'a, Self> {
429        Ok(value.as_ref())
430    }
431}
432
433impl<'a, 'b> FromValue<'a, 'b> for String {
434    fn from_value(value: &'b str, _position: &'b Position<'a>) -> Result<'a, Self> {
435        Ok(value.into())
436    }
437}
438
439impl<'a, 'b> FromValue<'a, 'b> for PathBuf {
440    fn from_value(value: &'b str, _position: &'b Position<'a>) -> Result<'a, Self> {
441        Ok(PathBuf::from(value))
442    }
443}
444
/// Marker for integer types that can be read from a value.
///
/// Requiring `FromStr<Err = ParseIntError>` lets one generic conversion
/// inspect the failure kind for every implementing type.
pub trait FromNumeric: FromStr<Err = ParseIntError> {}

/// Implements [`FromNumeric`] for each listed type.
macro_rules! impl_from_numeric {
    ($($int:ty),* $(,)?) => {
        $(impl FromNumeric for $int {})*
    };
}

impl_from_numeric!(u8, u16, u32, u64, u128, usize, i8, i16, i32, i64, i128, isize);
470
471impl<'a, 'b, T> FromValue<'a, 'b> for T
472where
473    T: FromNumeric,
474{
475    fn from_value(value: &'b str, position: &'b Position<'a>) -> Result<'a, Self> {
476        value.parse::<T>().map_err(|e| {
477            let msg = match e.kind() {
478                IntErrorKind::Empty => "failed to parse integer from empty string",
479                IntErrorKind::InvalidDigit => "value contains invalid digit",
480                IntErrorKind::PosOverflow => "value too large for this attribute",
481                IntErrorKind::NegOverflow => "value too small for this attribute",
482                IntErrorKind::Zero => "value cannot be zero for this attribute",
483                _ => "unknown integer parse error",
484            }
485            .into();
486            position.error(msg)
487        })
488    }
489}
490
491pub trait FromAttribute<'a, 'b>: Sized {
492    fn from_attribute(attribute: &'b Attribute<'a>) -> Result<'a, Self>;
493}
494
495impl<'a, 'b, T: FromValue<'a, 'b>> FromAttribute<'a, 'b> for T {
496    fn from_attribute(attribute: &'b Attribute<'a>) -> Result<'a, Self> {
497        let Some(value) = attribute.value.as_ref() else {
498            let name = attribute.name;
499            return Err(attribute
500                .position
501                .error(format!("expected non-empty value for '{name}'")));
502        };
503        T::from_value(value, &attribute.position)
504    }
505}
506
507pub trait Query<'a, 'b>: Sized {
508    fn get(name: &str, element: &'b Element<'a>) -> Result<'a, Self>;
509}
510
511impl<'a, 'b, T: FromAttribute<'a, 'b>> Query<'a, 'b> for T {
512    fn get(name: &str, element: &'b Element<'a>) -> Result<'a, Self> {
513        let Some(attribute) = element.attributes.get(name) else {
514            let msg = format!("expected '{name}' attribute");
515            return Err(element.position.error(msg));
516        };
517        T::from_attribute(attribute)
518    }
519}
520
521impl<'a, 'b, T: FromAttribute<'a, 'b>> Query<'a, 'b> for Option<T> {
522    fn get(name: &str, element: &'b Element<'a>) -> Result<'a, Self> {
523        element
524            .attributes
525            .get(name)
526            .map(|a| T::from_attribute(a))
527            .transpose()
528    }
529}
530
531impl<'a, 'b> Query<'a, 'b> for bool {
532    fn get(name: &str, element: &'b Element<'a>) -> Result<'a, Self> {
533        Ok(element.attributes.contains_key(name))
534    }
535}
536
537pub trait FromElement<'a, 'b>: Sized {
538    fn from_element(element: &'b Element<'a>) -> Result<'a, Self>;
539}
540
541impl<'a, 'b> FromElement<'a, 'b> for &'b Element<'a> {
542    fn from_element(element: &'b Element<'a>) -> Result<'a, Self> {
543        Ok(element)
544    }
545}
546
547impl<'a, 'b, T> FromElement<'a, 'b> for T
548where
549    T: FromValue<'a, 'b>,
550{
551    fn from_element(element: &'b Element<'a>) -> Result<'a, Self> {
552        match element.contents.as_slice() {
553            [Content::Text(value)] => T::from_value(value, &element.position),
554            _ => Err(element
555                .position
556                .error("expected element to contain a single value".into())),
557        }
558    }
559}