jaq_json/
xml.rs

1//! XML support.
2use crate::{bstr, Map, Val};
3use alloc::string::{String, ToString};
4use alloc::{borrow::ToOwned, boxed::Box, format, vec::Vec};
5use core::fmt::{self, Formatter};
6use std::io;
7use xmlparser::{ElementEnd, ExternalId, StrSpan, TextPos, Token, Tokenizer};
8
9/// Parse a stream of root XML values.
10pub fn parse_many(s: &str) -> impl Iterator<Item = Result<Val, PError>> + '_ {
11    let mut tokens = Tokenizer::from(s);
12    core::iter::from_fn(move || tokens.next().map(|tk| parse(tk?, &mut tokens)))
13}
14
/// Prefix and local name of a tag.
///
/// The first span is the prefix (empty when the tag has none),
/// the second span is the local name.
#[derive(Debug)]
struct Tag<'a>(StrSpan<'a>, StrSpan<'a>);
18
19impl PartialEq for Tag<'_> {
20    fn eq(&self, rhs: &Self) -> bool {
21        (self.0.as_str(), self.1.as_str()) == (rhs.0.as_str(), rhs.1.as_str())
22    }
23}
24
25impl fmt::Display for Tag<'_> {
26    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
27        if !self.0.is_empty() {
28            write!(f, "{}:", self.0)?;
29        }
30        write!(f, "{}", self.1)
31    }
32}
33
34impl Tag<'_> {
35    fn tag_pos(&self, tokens: &Tokenizer) -> TagPos {
36        let pos = tokens.stream().gen_text_pos_from(self.0.start());
37        TagPos(self.to_string(), pos)
38    }
39}
40
/// Tag and its human-readable position for error reporting.
///
/// The first field is the rendered tag name, the second its text position.
#[derive(Debug)]
pub struct TagPos(String, TextPos);
44
45impl fmt::Display for TagPos {
46    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
47        write!(f, "{} (at {})", self.0, self.1)
48    }
49}
50
/// Lex error.
///
/// Opaque wrapper around the error reported by the `xmlparser` tokenizer.
#[derive(Debug)]
pub struct LError(xmlparser::Error);
54
/// Parse error.
#[derive(Debug)]
pub enum PError {
    /// Lex error
    Lex(LError),
    /// Unmatched closing tag, e.g. `<a></b>`
    ///
    /// The first field is the opening tag, the second the closing tag that was found.
    Unmatched(TagPos, TagPos),
    /// Unclosed tag, e.g. `<a>`
    Unclosed(TagPos),
}
65
66impl fmt::Display for PError {
67    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
68        match self {
69            Self::Lex(LError(e)) => e.fmt(f),
70            Self::Unmatched(open, close) => {
71                write!(f, "expected closing tag for {open}, found {close}")
72            }
73            Self::Unclosed(open) => {
74                write!(f, "expected closing tag for {open}, found end of file")
75            }
76        }
77    }
78}
79
80impl From<xmlparser::Error> for PError {
81    fn from(e: xmlparser::Error) -> Self {
82        Self::Lex(LError(e))
83    }
84}
85
// Uses the default `Error` methods; the message comes from the `Display` impl.
impl std::error::Error for PError {}
87
88fn parse_children(tag: &Tag, tokens: &mut Tokenizer) -> Result<Vec<Val>, PError> {
89    let mut children = Vec::new();
90    loop {
91        let Some(tk) = tokens.next() else {
92            return Err(PError::Unclosed(tag.tag_pos(tokens)));
93        };
94        match tk? {
95            Token::ElementEnd {
96                end: ElementEnd::Close(prefix, local),
97                ..
98            } => {
99                let tag_ = Tag(prefix, local);
100                if *tag == tag_ {
101                    return Ok(children);
102                } else {
103                    Err(PError::Unmatched(tag.tag_pos(tokens), tag_.tag_pos(tokens)))?
104                }
105            }
106            tk => children.push(parse(tk, tokens)?),
107        }
108    }
109}
110
111fn tac(tag: &Tag, tokens: &mut Tokenizer) -> Result<Val, PError> {
112    let mut attrs = Vec::new();
113    let children = loop {
114        // SAFETY: xmlparser returns an error instead of None
115        let tk = tokens.next().unwrap();
116        match tk? {
117            Token::Attribute {
118                prefix,
119                local,
120                value,
121                ..
122            } => attrs.push((
123                Tag(prefix, local).to_string().into(),
124                value.as_str().to_owned().into(),
125            )),
126            Token::ElementEnd { end, .. } => match end {
127                ElementEnd::Open => break Some(parse_children(tag, tokens)?),
128                ElementEnd::Empty => break None,
129                // SAFETY: xmlparser returns an error instead of yielding this
130                ElementEnd::Close(..) => panic!(),
131            },
132            // SAFETY: xmlparser returns an error instead of yielding this
133            _ => panic!(),
134        }
135    };
136    let attrs = if attrs.is_empty() { None } else { Some(attrs) };
137
138    Ok(make_obj([
139        ("t", Some(tag.to_string().into())),
140        ("a", attrs.map(|v| Val::obj(v.into_iter().collect()))),
141        ("c", children.map(|v| v.into_iter().collect())),
142    ]))
143}
144
145fn doctype(name: &str, external: Option<ExternalId>, internal: Option<&str>) -> Val {
146    let external = external.map(|ext| match ext {
147        ExternalId::System(system) => format!("SYSTEM {system}"),
148        ExternalId::Public(pub_id, system) => format!("PUBLIC {pub_id} {system}"),
149    });
150    make_obj([
151        ("name", Some(name.to_owned())),
152        ("external", external),
153        ("internal", internal.map(|s| s.to_owned())),
154    ])
155}
156
157fn make_obj<T: Into<Val>, const N: usize>(arr: [(&str, Option<T>); N]) -> Val {
158    let iter = arr
159        .into_iter()
160        .flat_map(|(k, v)| v.map(|v| (k.to_owned().into(), v.into())));
161    Val::obj(iter.collect())
162}
163
164fn parse(tk: Token, tokens: &mut Tokenizer) -> Result<Val, PError> {
165    let ss_val = |ss: StrSpan| ss.as_str().to_owned().into();
166    let singleton = |k: &str, v| Val::obj(core::iter::once((k.to_string().into(), v)).collect());
167
168    Ok(match tk {
169        Token::Declaration {
170            version,
171            encoding,
172            standalone,
173            ..
174        } => singleton(
175            "xmldecl",
176            make_obj([
177                ("version", Some(ss_val(version))),
178                ("encoding", encoding.map(ss_val)),
179                ("standalone", standalone.map(|b| b.into())),
180            ]),
181        ),
182        Token::ProcessingInstruction {
183            target, content, ..
184        } => singleton(
185            "pi",
186            make_obj([
187                ("target", Some(ss_val(target))),
188                ("content", content.map(ss_val)),
189            ]),
190        ),
191        Token::Cdata { text, .. } => singleton("cdata", ss_val(text)),
192        Token::Comment { text, .. } => singleton("comment", ss_val(text)),
193        Token::ElementStart { prefix, local, .. } => tac(&Tag(prefix, local), tokens)?,
194        Token::Text { text } => ss_val(text),
195        // SAFETY: xmlparser returns an error instead of yielding this
196        Token::Attribute { .. }
197        | Token::DtdEnd { .. }
198        | Token::ElementEnd { .. }
199        | Token::EntityDeclaration { .. } => panic!(),
200        Token::DtdStart {
201            name,
202            external_id,
203            span,
204        } => {
205            let internal = loop {
206                let Some(tk) = tokens.next() else {
207                    let pos = tokens.stream().gen_text_pos_from(span.start());
208                    Err(PError::Unclosed(TagPos("DOCTYPE".into(), pos)))?
209                };
210                if let Token::DtdEnd { span: span_ } = tk? {
211                    break &tokens.stream().span().as_str()[span.end()..span_.start()];
212                }
213            };
214            singleton("doctype", doctype(&name, external_id, Some(internal)))
215        }
216        Token::EmptyDtd {
217            name, external_id, ..
218        } => singleton("doctype", doctype(&name, external_id, None)),
219    })
220}
221
/// Serialisation error.
#[derive(Debug)]
pub enum SError {
    /// Unknown key with value was found in an object, e.g. `{t: "a", x: 1}`
    ///
    /// The fields are the kind of object in which the entry was found
    /// (e.g. `"tac"`), followed by the offending key and value.
    InvalidEntry(&'static str, Val, Val),
    /// Object with zero or more than one keys found, e.g. `{}`, `{a: 1, b: 2}`
    SingletonObj(Val),
}
230
231impl fmt::Display for SError {
232    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
233        match self {
234            Self::InvalidEntry(o, k, v) => {
235                write!(f, "invalid entry in {o} object: {{\"{k}\": {v}}}")
236            }
237            Self::SingletonObj(v) => write!(f, "expected singleton object, found: {v}"),
238        }
239    }
240}
241
// Uses the default `Error` methods; the message comes from the `Display` impl.
impl std::error::Error for SError {}
243
/// XML value.
///
/// `S` is the type used for strings such as names and text.
pub enum Xml<S> {
    /// XML declaration, e.g. `<?xml version='1.0' encoding='UTF-8' standalone='yes'?>`
    XmlDecl(Vec<(S, S)>),
    /// DOCTYPE directive, e.g. `<!DOCTYPE greeting SYSTEM "hello.dtd" [...]>`
    DocType {
        /// name of the document type, e.g. "greeting"
        name: S,
        /// reference to an external file, e.g. `SYSTEM "hello.dtd"`
        external: Option<S>,
        /// internal definition of the DTD, e.g. `...`
        internal: Option<S>,
    },
    /// Processing instruction, e.g. `<?xml-stylesheet type="text/css" href="style.css"?>`
    Pi {
        /// target, e.g. `xml-stylesheet`
        target: S,
        /// content, e.g. `type="text/css" href="style.css"`
        content: Option<S>,
    },
    /// An element consisting of a Tag, Attributes, and (optional) Content
    ///
    /// For example, `<a href="bla">Link</a>`.
    /// An element without content is written self-closing, e.g. `<br/>`.
    Tac(S, Vec<(S, S)>, Option<Box<Self>>),
    /// A sequence of XML values, e.g. `Hello<br />World`
    Seq(Vec<Self>),
    /// A scalar value (null, boolean, number, or string), e.g. `Hello world`
    Scalar(Val),
    /// CDATA, e.g. `<![CDATA[text]]>`
    Cdata(S),
    /// Comment, e.g. `<!-- text -->`
    Comment(S),
}
277
/// Interpret a value as XML, borrowing all strings from the value.
///
/// - Arrays become sequences of their converted elements.
/// - Objects containing the key `"t"` become elements; they may additionally
///   contain `"a"` (an object of string attributes) and `"c"` (content).
/// - Any other object must have exactly one key, which must be one of
///   `"xmldecl"`, `"doctype"`, `"cdata"`, `"comment"`, or `"pi"`.
/// - Null, booleans, numbers, and strings become scalars.
impl<'a> TryFrom<&'a Val> for Xml<&'a [u8]> {
    type Error = SError;
    fn try_from(v: &'a Val) -> Result<Self, Self::Error> {
        use jaq_std::ValT;
        // Attributes (and XML declaration entries) must map strings to strings.
        let from_kv = |(k, v): (&'a _, &'a _)| match (k, v) {
            (Val::Str(k, _), Val::Str(v, _)) => Ok((&**k, &**v)),
            _ => Err(SError::InvalidEntry("attribute", k.clone(), v.clone())),
        };
        let from_kvs = |a: &'a Map| a.iter().map(from_kv).collect::<Result<_, _>>();

        // Element: accepts only the keys "t", "a", and "c";
        // any other key or a value of the wrong type is an invalid entry.
        let from_tac = |o: &'a Map| {
            let mut t = &b""[..];
            let mut a = Vec::new();
            let mut c = None;
            for (k, v) in o.iter() {
                let fail = || SError::InvalidEntry("tac", k.clone(), v.clone());
                let k = k.as_utf8_bytes().ok_or_else(fail)?;
                match (k, v) {
                    (b"t", Val::Str(s, _)) => t = s,
                    (b"a", Val::Obj(attrs)) => a = from_kvs(attrs)?,
                    // content may be any value and is converted recursively
                    (b"c", v) => c = Some(Box::new(v.try_into()?)),
                    _ => Err(fail())?,
                }
            }
            Ok(Self::Tac(t, a, c))
        };
        // DOCTYPE: accepts only string-valued "name", "external", and "internal".
        let from_dt = |o: &'a Map| {
            let mut name = &b""[..];
            let mut external = None;
            let mut internal = None;
            for (k, v) in o.iter() {
                let fail = || SError::InvalidEntry("doctype", k.clone(), v.clone());
                let k = k.as_utf8_bytes().ok_or_else(fail)?;
                match (k, v) {
                    (b"name", Val::Str(s, _)) => name = s,
                    (b"external", Val::Str(s, _)) => external = Some(&**s),
                    (b"internal", Val::Str(s, _)) => internal = Some(&**s),
                    _ => Err(fail())?,
                }
            }
            Ok(Self::DocType {
                name,
                external,
                internal,
            })
        };
        // Processing instruction: accepts only string-valued "target" and "content".
        let from_pi = |o: &'a Map| {
            let mut target = &b""[..];
            let mut content = None;
            for (k, v) in o.iter() {
                let fail = || SError::InvalidEntry("pi", k.clone(), v.clone());
                let k = k.as_utf8_bytes().ok_or_else(fail)?;
                match (k, v) {
                    (b"target", Val::Str(s, _)) => target = s,
                    (b"content", Val::Str(s, _)) => content = Some(&**s),
                    _ => Err(fail())?,
                }
            }
            Ok(Self::Pi { target, content })
        };
        let contains_key = |o: &Map, k: &str| o.contains_key(&Val::from(k.to_string()));
        match v {
            Val::Arr(a) => a
                .iter()
                .map(TryInto::try_into)
                .collect::<Result<_, _>>()
                .map(Self::Seq),
            // the presence of "t" is what identifies an object as an element
            Val::Obj(o) if contains_key(o, "t") => from_tac(o),
            Val::Obj(o) => {
                // every other object must consist of exactly one entry
                let mut o = o.iter();
                let (k, v) = match (o.next(), o.next()) {
                    (Some(kv), None) => kv,
                    _ => Err(SError::SingletonObj(v.clone()))?,
                };
                let fail = || SError::InvalidEntry("unknown", k.clone(), v.clone());
                let k = k.as_utf8_bytes().ok_or_else(fail)?;
                match (k, v) {
                    (b"xmldecl", Val::Obj(kvs)) => from_kvs(kvs).map(Self::XmlDecl),
                    // "doctype" and "pi" objects require their "name" / "target" key
                    (b"doctype", Val::Obj(o)) if contains_key(o, "name") => from_dt(o),
                    (b"cdata", Val::Str(s, _)) => Ok(Self::Cdata(s)),
                    (b"comment", Val::Str(s, _)) => Ok(Self::Comment(s)),
                    (b"pi", Val::Obj(o)) if contains_key(o, "target") => from_pi(o),
                    _ => Err(fail())?,
                }
            }
            Val::Null | Val::Bool(_) | Val::Num(_) | Val::Str(..) => Ok(Self::Scalar(v.clone())),
        }
    }
}
367
/// Write key-value pairs as ` key="value"`, each preceded by a space.
///
/// - `$w`: the writer that receives the punctuation via `write!`
/// - `$a`: a collection of `(key, value)` pairs supporting `.iter()`
/// - `$f`: an expression that is called to write a key or a value;
///   keys and values are written as given, without any escaping
macro_rules! write_kvs {
    ($w:ident, $a:ident, $f:expr) => {{
        $a.iter().try_for_each(|(k, v)| {
            write!($w, " ")?;
            $f(k)?;
            write!($w, "=\"")?;
            $f(v)?;
            write!($w, "\"")
        })
    }};
}
379
/// Write an XML value.
///
/// - `$w`: the writer that receives the markup via `write!`
/// - `$v`: the [`Xml`] value to write
/// - `$fs`: an expression that is called to write a string
///   (names, text, attribute values); strings are written without escaping
/// - `$fv`: an expression that is called to write a nested [`Xml`] value
///
/// Elements without content are written self-closing (`<t/>`).
/// All patterns are qualified with `Xml::`, so that the macro
/// does not depend on `Self` being `Xml` at the expansion site.
macro_rules! write_val {
    ($w:ident, $v:ident, $fs:expr, $fv:expr) => {{
        match $v {
            Xml::Scalar(Val::Str(s, _)) => $fs(s),
            // non-string scalars are written via their `Display` output
            Xml::Scalar(v) => write!($w, "{v}"),
            Xml::Seq(a) => a.iter().try_for_each($fv),
            Xml::Tac(t, a, c) => {
                write!($w, "<")?;
                $fs(t)?;
                write_kvs!($w, a, $fs)?;
                if let Some(c) = c {
                    write!($w, ">")?;
                    $fv(c)?;
                    write!($w, "</")?;
                    $fs(t)?;
                    write!($w, ">")
                } else {
                    write!($w, "/>")
                }
            }
            Xml::XmlDecl(a) => {
                write!($w, "<?xml")?;
                write_kvs!($w, a, $fs)?;
                write!($w, "?>")
            }
            Xml::DocType {
                name,
                external,
                internal,
            } => {
                write!($w, "<!DOCTYPE ")?;
                $fs(name)?;
                if let Some(s) = external {
                    write!($w, " ")?;
                    $fs(s)?;
                }
                if let Some(s) = internal {
                    write!($w, " [")?;
                    $fs(s)?;
                    write!($w, "]")?;
                }
                write!($w, ">")
            }
            Xml::Cdata(s) => {
                write!($w, "<![CDATA[")?;
                $fs(s)?;
                write!($w, "]]>")
            }
            Xml::Comment(s) => {
                write!($w, "<!--")?;
                $fs(s)?;
                write!($w, "-->")
            }
            Xml::Pi { target, content } => {
                write!($w, "<?")?;
                $fs(target)?;
                if let Some(s) = content {
                    write!($w, " ")?;
                    $fs(s)?;
                }
                write!($w, "?>")
            }
        }
    }};
}
445
/// Format an XML value, writing strings via the `fmt` of `bstr(s)`.
impl fmt::Display for Xml<&[u8]> {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        // strings go through `bstr`, nested values recurse into this impl
        write_val!(f, self, |s| bstr(s).fmt(f), |v: &Self| v.fmt(f))
    }
}
451
impl Xml<&[u8]> {
    /// Write an XML value.
    ///
    /// Strings are written to `w` as raw bytes via `write_all`.
    ///
    /// # Errors
    ///
    /// Returns any I/O error reported by `w`.
    pub fn write(&self, w: &mut dyn io::Write) -> io::Result<()> {
        write_val!(w, self, |s: &[u8]| w.write_all(s), |v: &Self| v.write(w))
    }
}
457}