dtd_parser/
lib.rs

1#[macro_use]
2extern crate derive_more;
3
4use std::path::{Path, PathBuf};
5
6use either::Either;
7use indexmap::IndexMap;
8use nom::branch::alt;
9use nom::bytes::complete::{is_a, tag, take_till, take_until, take_while, take_while_m_n};
10use nom::character::complete::{anychar, char, multispace0, multispace1};
11use nom::combinator::{eof, iterator, map, recognize, value};
12use nom::error::ErrorKind;
13use nom::multi::{many0, separated_list1};
14use nom::sequence::{delimited, pair, terminated, tuple};
15use nom::Finish;
16use nom_greedyerror::{convert_error, GreedyError};
17use nom_locate::LocatedSpan;
18#[cfg(feature = "trace")]
19use nom_tracable::{cumulative_histogram, histogram};
20use nom_tracable::{tracable_parser, TracableInfo};
21
22mod attlist;
23mod element;
24mod entity;
25
26pub use attlist::{
27    AttDef, AttType, AttValue, AttlistDecl, DefaultDecl, EnumeratedType, Enumeration, NotationType,
28};
29pub use element::{Child, Choices, ElementCategory, ElementDecl, Seq};
30
/// Parser input used throughout this file: a `&str` wrapped in a `LocatedSpan`
/// that carries a `TracableInfo` as its extra data.
type Span<'i> = LocatedSpan<&'i str, TracableInfo>;
32
/// Result type of the parsers in this file: a `nom::IResult` over [`Span`] whose
/// error type is `GreedyError<Span, ErrorKind>`.
type Result<'i, T> = nom::IResult<Span<'i>, T, GreedyError<Span<'i>, ErrorKind>>;
34
/// Test helper: wraps `i` in a [`Span`] carrying a freshly created `TracableInfo`.
#[cfg(test)]
fn span(i: &str) -> Span {
    Span::new_extra(i, TracableInfo::new())
}
40
41/// like nom::dbg_dmp but eat &str.
42fn dbg_dmp<'i, F, O, E: std::fmt::Debug>(
43    mut f: F,
44    context: &'static str,
45) -> impl FnMut(&'i str) -> nom::IResult<&'i str, O, E>
46where
47    F: FnMut(&'i str) -> nom::IResult<&'i str, O, E>,
48{
49    move |i: &'i str| match f(i) {
50        Err(e) => {
51            println!("{}: Error({:?}) at:\n{}", context, e, i);
52            Err(e)
53        }
54        a => a,
55    }
56}
57
/// Parsed character data.
///
/// PCDATA is text that will be parsed by the parser: the text is examined for
/// entities and markup.
///
/// Tags inside the text are treated as markup, and entities are expanded.
///
/// However, parsed character data should not contain any `&`, `<` or `>`
/// characters; they must be replaced with the `&amp;`, `&lt;` and `&gt;`
/// entities respectively.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct PCDATA(String);
66
/// Character data.
///
/// CDATA is text that will not be parsed by the parser. Tags inside this text
/// are not treated as markup, and entities inside it are not expanded.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct CDATA(String);
72
/// How many occurrences of a child are allowed.
///
/// `Display` renders the wrapped value followed by the matching DTD occurrence
/// suffix (nothing, `+`, `?` or `*`).
#[derive(Debug, Display)]
pub enum Repeatable<T> {
    /// Occurs exactly once (no suffix).
    #[display(fmt = "{}", "_0")]
    Once(T),
    /// Occurs once or more times (`+`).
    #[display(fmt = "{}+", "_0")]
    AtLeastOnce(T),
    /// Optional: occurs at most once (`?`).
    #[display(fmt = "{}?", "_0")]
    AtMostOnce(T),
    /// Occurs zero or more times (`*`).
    #[display(fmt = "{}*", "_0")]
    ZeroOrManyTimes(T),
}
89
/// Marker for a comment found in the DTD.
///
/// This is a unit struct: the text of the comment is not kept.
#[derive(Debug, Display, AsMut, AsRef)]
pub struct CommentDecl;
92
93/// '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
94#[tracable_parser]
95fn comment_decl(i: Span) -> Result<CommentDecl> {
96    map(
97        value(
98            (), // Output is thrown away.
99            tuple((tag("<!--"), many0(char('-')), take_until("-->"), tag("-->"))),
100        ),
101        |_| CommentDecl,
102    )(i)
103}
104
/// An XML `Name`, stored as an owned string.
///
/// See: https://www.w3.org/TR/REC-xml/#NT-Name
/// ```not-rust
///     NameStartChar       ::=       ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
///     NameChar       ::=       NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
///     Name       ::=       NameStartChar (NameChar)*
/// ```
#[derive(Clone, Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct Name(String);
113
114impl Name {
115    fn to_string(&self) -> String {
116        self.0.to_string()
117    }
118}
119
/// Returns `true` when `c` may start an XML `Name`.
///
/// ```not-rust
/// NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6]
///                 | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
///                 | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF]
///                 | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD]
///                 | [#x10000-#xEFFFF]
/// ```
fn is_name_start(c: char) -> bool {
    // Inclusive range patterns give each range both its lower AND upper bound,
    // and need no `unsafe` char construction.
    matches!(
        c,
        ':' | '_'
            | 'A'..='Z'
            | 'a'..='z'
            | '\u{C0}'..='\u{D6}'
            | '\u{D8}'..='\u{F6}'
            | '\u{F8}'..='\u{2FF}'
            | '\u{370}'..='\u{37D}'
            | '\u{37F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}

/// Returns `true` when `c` may appear inside an XML `Name` after its first character.
///
/// ```not-rust
/// NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7
///            | [#x0300-#x036F] | [#x203F-#x2040]
/// ```
fn is_name_char(c: char) -> bool {
    is_name_start(c)
        || matches!(
            c,
            '-' | '.'
                | '0'..='9'
                | '\u{B7}'
                | '\u{300}'..='\u{36F}'
                | '\u{203F}'..='\u{2040}'
        )
}
186
187#[tracable_parser]
188fn name(i: Span) -> Result<Name> {
189    map(
190        recognize(pair(
191            take_while_m_n(1, 1, is_name_start),
192            take_while(is_name_char),
193        )),
194        |n: Span| Name(n.to_string()),
195    )(i)
196}
197
/// An XML name token, stored as an owned string.
///
/// ```not-rust
///     Nmtoken ::= (NameChar)+
/// ```
#[derive(Clone, Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct Nmtoken(String);
201
202///     Nmtoken ::= (NameChar)+
203#[tracable_parser]
204fn nmtoken(i: Span) -> Result<Nmtoken> {
205    map(recognize(take_while(is_name_char)), |s: Span| {
206        Nmtoken(s.to_string())
207    })(i)
208}
209
/// A list of name tokens.
///
/// ```not-rust
///      Nmtokens ::= Nmtoken (#x20 Nmtoken)*
/// ```
#[derive(Debug, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct Nmtokens(Vec<Nmtoken>);
213
214///      Nmtokens ::= Nmtoken (#x20 Nmtoken)*
215#[tracable_parser]
216fn nmtokens(i: Span) -> Result<Vec<Nmtoken>> {
217    separated_list1(multispace1, nmtoken)(i)
218}
219
/// Mixed content: `#PCDATA` together with a list of element names.
///
/// `Display` writes `( | `, then `#PCDATA` and every contained name joined by
/// ` | `, then `)`.
// NOTE(review): the leading " | " inside the parentheses of the format string
// looks unintentional (it yields `( | #PCDATA | a)`) — confirm before relying
// on the rendered output.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
#[display(
    fmt = "( | {})",
    "std::iter::once(\"#PCDATA\".to_string()).chain(_0.iter().map(|v|v.to_string())).collect::<Vec<_>>().join(\" | \")"
)]
pub struct MixedPCDATA(pub Vec<Name>);
226
/// Either a plain [`Name`] or a parameter-entity reference (`%name;`).
#[derive(Debug, TryInto)]
pub enum NameOrReference {
    /// A plain name.
    Name(Name),
    /// A parameter-entity reference.
    Reference(PEReference),
}
232
233#[tracable_parser]
234fn map_name(i: Span) -> Result<NameOrReference> {
235    map(name, |n| NameOrReference::Name(n))(i)
236}
237
238#[tracable_parser]
239fn map_pereference(i: Span) -> Result<NameOrReference> {
240    map(pereference, |n| NameOrReference::Reference(n))(i)
241}
242
243#[tracable_parser]
244fn name_or_reference(i: Span) -> Result<NameOrReference> {
245    alt((map_name, map_pereference))(i)
246}
247
/// A numeric character reference, keeping the number that was written.
///
/// `Display` prints only the number (decimal, or lowercase hexadecimal for
/// [`CharRef::Hexadecimal`]); the surrounding `&#` / `&#x` and `;` are not
/// part of the rendered output.
#[derive(Clone, Debug, Display, TryInto)]
pub enum CharRef {
    /// Written in decimal: `&#NNN;`.
    #[display(fmt = "{}", "_0")]
    Decimal(isize),
    /// Written in hexadecimal: `&#xHHH;`.
    #[display(fmt = "{:x}", "_0")]
    Hexadecimal(isize),
}
255
256/// CharRef ::= '&#' [0-9]+ ';'
257///             | '&#x' [0-9a-fA-F]+ ';'    [WFC: Legal Character]
258#[tracable_parser]
259fn char_ref(i: Span) -> Result<CharRef> {
260    alt((
261        map(
262            delimited(tag("&#"), is_a("0123456789"), tag(";")),
263            |v: Span| CharRef::Decimal(isize::from_str_radix(&v, 10).unwrap()),
264        ),
265        map(
266            delimited(tag("&#x"), is_a("0123456789abcdefABCDEF"), tag(";")),
267            |v: Span| CharRef::Hexadecimal(isize::from_str_radix(&v, 16).unwrap()),
268        ),
269    ))(i)
270}
271
/// A general entity reference (`&name;`), holding the referenced [`Name`].
#[derive(Clone, Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct EntityRef(Name);
274
/// A parameter-entity reference (`%name;`), holding the referenced [`Name`].
#[derive(Clone, Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct PEReference(Name);
277
/// A reference: either a character reference or a general entity reference.
#[derive(Clone, Debug, Display, TryInto)]
pub enum Reference {
    /// A numeric character reference; displayed exactly as the inner [`CharRef`].
    #[display(fmt = "{}", "_0")]
    CharRef(CharRef),
    /// A general entity reference; displayed as `&name;`.
    #[display(fmt = "&{};", "_0")]
    EntityRef(EntityRef),
}
285
286/// Reference   ::= EntityRef | CharRef
287#[tracable_parser]
288fn reference(i: Span) -> Result<Reference> {
289    alt((
290        map(entity_ref, Reference::EntityRef),
291        map(char_ref, Reference::CharRef),
292    ))(i)
293}
294
295/// PEReference ::= '%' Name ';'         [VC: Entity Declared]
296///                                      [WFC: No Recursion]
297///                                      [WFC: In DTD]
298#[tracable_parser]
299fn pereference(i: Span) -> Result<PEReference> {
300    map(delimited(tag("%"), name, tag(";")), |n| PEReference(n))(i)
301}
302
303/// EntityRef   ::= '&' Name ';'         [WFC: Entity Declared]
304///                                      [VC: Entity Declared]
305///                                      [WFC: Parsed Entity]
306///                                      [WFC: No Recursion]
307#[tracable_parser]
308fn entity_ref(i: Span) -> Result<EntityRef> {
309    map(tuple((tag("&"), name, tag(";"))), |(_, n, _)| EntityRef(n))(i)
310}
311
/// The text of a system literal, stored without its surrounding quotes.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct SystemLiteral(String);
314
315///	SystemLiteral	   ::=   	('"' [^"]* '"') | ("'" [^']* "'")
316#[tracable_parser]
317fn system_literal(i: Span) -> Result<SystemLiteral> {
318    map(
319        alt((
320            delimited(char('"'), take_until("\""), char('"')),
321            delimited(char('\''), take_until("'"), char('\'')),
322        )),
323        |sl: Span| SystemLiteral(sl.to_string()),
324    )(i)
325}
326
/// The text of a public identifier literal, stored without its surrounding quotes.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct PubidLiteral(String);
329
330/// PubidLiteral	   ::=   	'"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
331#[tracable_parser]
332fn pubid_literal(i: Span) -> Result<PubidLiteral> {
333    map(
334        alt((
335            delimited(char('"'), take_till(is_pubid_char), char('"')),
336            delimited(char('\''), take_till(is_pubid_char), char('\'')),
337        )),
338        |s: Span| PubidLiteral(s.to_string()),
339    )(i)
340}
341
/// Returns `true` when `c` is NOT a valid `PubidChar`.
///
/// Despite its name the predicate is inverted, which makes it usable directly
/// as the stop condition of `take_till`.
///
/// ```not-rust
/// PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9] | [-'()+,./:=?;!*#@$_%]
/// ```
fn is_pubid_char(c: char) -> bool {
    let allowed = matches!(c, ' ' | '\r' | '\n')
        || c.is_ascii_alphanumeric()
        || "-'()+,./:=?;!*#@$_%".contains(c);
    !allowed
}
350
/// One top-level item of a parsed DTD, as produced by [`parse`] and [`parse_str`].
///
/// `Display` delegates to the wrapped declaration.
#[derive(Debug, Display, TryInto)]
pub enum ElementType {
    /// An element declaration.
    #[display(fmt = "{}", "_0")]
    Element(element::ElementDecl),
    /// An entity declaration.
    #[display(fmt = "{}", "_0")]
    Entity(entity::EntityDecl),
    /// An attribute-list declaration.
    #[display(fmt = "{}", "_0")]
    Attlist(attlist::AttlistDecl),
    /// A comment.
    #[display(fmt = "{}", "_0")]
    Comment(CommentDecl),
}
362
363fn entity_definitions(i: Span) -> IndexMap<String, entity::PEDecl> {
364    iterator(i, alt((map(entity::pedecl, Some), map(anychar, |_| None))))
365        .filter_map(|entity| entity.map(|entity| (entity.name().to_string(), entity)))
366        .collect()
367}
368
369pub fn resolve_entity_definitions<P: AsRef<Path>, I: Into<Option<P>>>(
370    i: Span,
371    path: I,
372) -> IndexMap<String, String> {
373    let path = path.into();
374    let definitions = entity_definitions(i);
375    let iter = definitions.into_iter();
376    let mut definitions: IndexMap<String, String> = IndexMap::new();
377    for (name, definition) in iter {
378        match definition.pedef {
379            entity::PEDef::EntityValue(values) => {
380                let mut value = Vec::with_capacity(values.len());
381                for value_or_reference in values.into_iter() {
382                    // println!("entity: {} -> {}", &name, &value_or_reference);
383                    let v = match value_or_reference {
384                        entity::ValueOrReference::Value(value) => value.into(),
385                        entity::ValueOrReference::Reference(reference) => reference.to_string(),
386                        entity::ValueOrReference::PEReference(pereference) => {
387                            match definitions.get(&pereference.to_string()) {
388                                Some(def) => def.to_owned(),
389                                None => {
390                                    eprintln!(
391                                        "ERROR: PEReference(`{}`) is not defined yet.",
392                                        pereference
393                                    );
394                                    continue;
395                                }
396                            }
397                        }
398                    };
399                    value.push(v);
400                }
401                definitions.insert(name.to_owned(), value.join(" "));
402            }
403            entity::PEDef::ExternalID(external_id) => match external_id {
404                entity::ExternalID::SystemLiteral(system_literal) => {
405                    eprintln!(
406                            "ERROR: ExternalID SystemLiteral(`{}`) not implemented yet, this will cause problom.",
407                            system_literal
408                        );
409                    continue;
410                }
411                entity::ExternalID::PubidLiteralWithSystemLiteral(
412                    _pubid_literal,
413                    system_literal,
414                ) => {
415                    if system_literal.starts_with("http") || system_literal.starts_with("ftp") {
416                        eprintln!(
417                                "ERROR: ExternalID PubidLiteral SystemLiteral(`{}`) from network not implemented yet, this will cause problom.",
418                                system_literal
419                            );
420                    }
421                    let include = if let Some(ref path) = path {
422                        let path: &Path = path.as_ref();
423                        if let Some(_ext) = path.extension() {
424                            path.canonicalize()
425                                .unwrap()
426                                .with_file_name(system_literal.as_ref())
427                        } else {
428                            path.canonicalize().unwrap().join(system_literal.as_ref())
429                        }
430                    } else {
431                        PathBuf::from(system_literal.as_ref())
432                    };
433                    // FIXME: this is still not defined.
434                    match std::fs::read_to_string(&include) {
435                        Err(err) => {
436                            eprintln!(
437                                    "ERROR: Failed to include ExternalID PubidLiteral SystemLiteral(`{}`), {}",
438                                    system_literal,
439                                    &err
440                                );
441                        }
442                        Ok(included) => definitions.extend(
443                            resolve_entity_definitions::<PathBuf, Option<PathBuf>>(
444                                Span::new_extra(&included, i.extra),
445                                include.into(),
446                            )
447                            .into_iter(),
448                        ),
449                    }
450                    continue;
451                }
452            },
453        }
454    }
455    definitions
456}
457
458pub fn resolve_references(i: Span, definitions: &IndexMap<String, String>) -> String {
459    iterator(
460        i,
461        alt((
462            map(delimited(tag("%"), name, tag(";")), Either::Left),
463            map(recognize(anychar), Either::Right),
464        )),
465    )
466    .map(|either| match either {
467        Either::Left(name) => match definitions.get(name.as_ref()) {
468            Some(definition) => definition.as_str(),
469            None => {
470                eprintln!("ERROR: PEReference(`{}`) is not defined yet.", &name);
471                ""
472            }
473        },
474        Either::Right(chars) => *chars,
475    })
476    .collect::<Vec<_>>()
477    .join("")
478}
479
480pub fn parse<F: AsRef<Path>>(f: F) -> std::result::Result<Vec<ElementType>, String> {
481    let f = f.as_ref();
482    let content =
483        std::fs::read_to_string(f).expect(&format!("Can not read from file {}", f.display()));
484
485    let tracer = TracableInfo::new().fold("entity-resolver");
486    let span = LocatedSpan::new_extra(content.as_str(), tracer);
487
488    let definitions = resolve_entity_definitions::<&Path, Option<&Path>>(span, f.into());
489    let span = LocatedSpan::new_extra(content.as_str(), tracer);
490    let resolved = resolve_references(span, &definitions);
491    // println!("resolved -----------------------------\n{}", &resolved);
492    // std::fs::write("/tmp/resolved.dtd", &resolved).unwrap();
493    let span = LocatedSpan::new_extra(resolved.as_str(), tracer);
494    #[cfg(feature = "trace")]
495    histogram();
496    #[cfg(feature = "trace")]
497    cumulative_histogram();
498    let result = terminated(
499        many0(alt((
500            map(
501                delimited(multispace0, attlist::attlist_decl, multispace0),
502                ElementType::Attlist,
503            ),
504            map(
505                delimited(multispace0, element::element_decl, multispace0),
506                ElementType::Element,
507            ),
508            map(
509                delimited(multispace0, entity::entity_decl, multispace0),
510                ElementType::Entity,
511            ),
512            map(
513                delimited(multispace0, comment_decl, multispace0),
514                ElementType::Comment,
515            ),
516        ))),
517        eof,
518    )(span)
519    .finish()
520    .map(|(_, definitions)| definitions)
521    .map_err(|err| convert_error(span, err));
522    result
523}
524
525pub fn parse_str(i: &str) -> std::result::Result<Vec<ElementType>, String> {
526    let tracer = TracableInfo::new().fold("entity-resolver");
527    let span = LocatedSpan::new_extra(i, tracer);
528    let definitions = resolve_entity_definitions::<&str, Option<&str>>(span, None);
529    let span = LocatedSpan::new_extra(i, tracer);
530    let resolved = resolve_references(span, &definitions);
531    let span = LocatedSpan::new_extra(resolved.as_str(), tracer);
532    #[cfg(feature = "trace")]
533    histogram();
534    #[cfg(feature = "trace")]
535    cumulative_histogram();
536    let result = terminated(
537        many0(alt((
538            map(
539                delimited(multispace0, attlist::attlist_decl, multispace0),
540                ElementType::Attlist,
541            ),
542            map(
543                delimited(multispace0, element::element_decl, multispace0),
544                ElementType::Element,
545            ),
546            map(
547                delimited(multispace0, entity::entity_decl, multispace0),
548                ElementType::Entity,
549            ),
550            map(
551                delimited(multispace0, comment_decl, multispace0),
552                ElementType::Comment,
553            ),
554        ))),
555        eof,
556    )(span)
557    .finish()
558    .map(|(_, elements)| elements)
559    .map_err(|err| convert_error(span, err));
560    result
561}
562
// Unit tests for the parsers defined directly in this file.
#[cfg(test)]
mod tests {
    use nom::Finish;

    use super::{comment_decl, pereference, span};

    // A multi-line comment is accepted as a single comment declaration.
    #[test]
    fn test_comment_decl() {
        let result = comment_decl(span(
            r#"<!--
======================================================================
    Docutils Generic DTD
======================================================================
:Author: David Goodger
:Contact: docutils-develop@lists.sourceforge.net
:Revision: $Revision: 8767 $
:Date: $Date: 2021-06-17 16:33:28 +0200 (Do, 17. Jun 2021) $
:Copyright: This DTD has been placed in the public domain.
:Filename: docutils.dtd

More information about this DTD (document type definition) and the
Docutils project can be found at http://docutils.sourceforge.net/.
The latest version of this DTD is available from
http://docutils.sourceforge.net/docs/ref/docutils.dtd.

The formal public identifier for this DTD is::

    +//IDN docutils.sourceforge.net//DTD Docutils Generic//EN//XML
-->"#,
        ))
        .finish();
        assert!(result.is_ok(), "{:?}", result.as_ref().unwrap_err());
    }

    // A name containing `-` and `.` is accepted inside a parameter-entity reference.
    #[test]
    fn test_pereference() {
        let result = pereference(span("%align-h.att;")).finish();
        assert!(result.is_ok(), "{:?}", result.as_ref().unwrap_err());
    }
}
603
/// Test helper: passes when `$res` is `Ok`; otherwise panics with the parse
/// error formatted by `nom_greedyerror::convert_error` against `$span`.
#[cfg(test)]
#[macro_export]
macro_rules! assert_ok {
    ($span:ident, $res:expr) => {
        if let Err(err) = $res {
            panic!("{}", ::nom_greedyerror::convert_error($span, err));
        }
    };
}