1#[macro_use]
2extern crate derive_more;
3
4use std::path::{Path, PathBuf};
5
6use either::Either;
7use indexmap::IndexMap;
8use nom::branch::alt;
9use nom::bytes::complete::{is_a, tag, take_till, take_until, take_while, take_while_m_n};
10use nom::character::complete::{anychar, char, multispace0, multispace1};
11use nom::combinator::{eof, iterator, map, recognize, value};
12use nom::error::ErrorKind;
13use nom::multi::{many0, separated_list1};
14use nom::sequence::{delimited, pair, terminated, tuple};
15use nom::Finish;
16use nom_greedyerror::{convert_error, GreedyError};
17use nom_locate::LocatedSpan;
18#[cfg(feature = "trace")]
19use nom_tracable::{cumulative_histogram, histogram};
20use nom_tracable::{tracable_parser, TracableInfo};
21
22mod attlist;
23mod element;
24mod entity;
25
26pub use attlist::{
27 AttDef, AttType, AttValue, AttlistDecl, DefaultDecl, EnumeratedType, Enumeration, NotationType,
28};
29pub use element::{Child, Choices, ElementCategory, ElementDecl, Seq};
30
/// Input type handed to every parser in this file: a `&str` wrapped in a
/// `LocatedSpan` whose extra data is a `TracableInfo`.
type Span<'i> = LocatedSpan<&'i str, TracableInfo>;

/// Return type shared by the parsers in this file: a `nom::IResult` over
/// `Span` whose error type is `GreedyError<Span, ErrorKind>`.
type Result<'i, T> = nom::IResult<Span<'i>, T, GreedyError<Span<'i>, ErrorKind>>;
34
/// Test helper: wraps `i` in a `Span` that carries a freshly created `TracableInfo`.
#[cfg(test)]
fn span(i: &str) -> Span {
    Span::new_extra(i, TracableInfo::new())
}
40
41fn dbg_dmp<'i, F, O, E: std::fmt::Debug>(
43 mut f: F,
44 context: &'static str,
45) -> impl FnMut(&'i str) -> nom::IResult<&'i str, O, E>
46where
47 F: FnMut(&'i str) -> nom::IResult<&'i str, O, E>,
48{
49 move |i: &'i str| match f(i) {
50 Err(e) => {
51 println!("{}: Error({:?}) at:\n{}", context, e, i);
52 Err(e)
53 }
54 a => a,
55 }
56}
57
/// Newtype around an owned `String`, named after the DTD `#PCDATA` keyword.
///
/// The derives expose the inner string through `Deref`/`AsRef`/`Into`.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct PCDATA(String);
66
/// Newtype around an owned `String`, named after the DTD `CDATA` keyword.
///
/// The derives expose the inner string through `Deref`/`AsRef`/`Into`.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct CDATA(String);
72
/// A value tagged with how often it may occur.
///
/// The `Display` output is the wrapped value followed by the variant's
/// suffix: nothing, `+`, `?` or `*`.
#[derive(Debug, Display)]
pub enum Repeatable<T> {
    /// Displayed as the bare value, without a suffix.
    #[display(fmt = "{}", "_0")]
    Once(T),
    /// Displayed as the value followed by `+`.
    #[display(fmt = "{}+", "_0")]
    AtLeastOnce(T),
    /// Displayed as the value followed by `?`.
    #[display(fmt = "{}?", "_0")]
    AtMostOnce(T),
    /// Displayed as the value followed by `*`.
    #[display(fmt = "{}*", "_0")]
    ZeroOrManyTimes(T),
}
89
/// Marker value produced by `comment_decl`; the text of the comment is not retained.
#[derive(Debug, Display, AsMut, AsRef)]
pub struct CommentDecl;
92
93#[tracable_parser]
95fn comment_decl(i: Span) -> Result<CommentDecl> {
96 map(
97 value(
98 (), tuple((tag("<!--"), many0(char('-')), take_until("-->"), tag("-->"))),
100 ),
101 |_| CommentDecl,
102 )(i)
103}
104
/// A name as recognised by the `name` parser, stored as an owned `String`.
///
/// The derives expose the inner string through `Deref`/`AsRef`/`Into`.
#[derive(Clone, Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct Name(String);
113
114impl Name {
115 fn to_string(&self) -> String {
116 self.0.to_string()
117 }
118}
119
/// Returns `true` when `c` may be the first character of an XML name.
///
/// Implements the `NameStartChar` production of the XML 1.0 specification:
/// `:`, `_`, the ASCII letters, and the listed Unicode ranges. Each range is
/// inclusive on both ends.
fn is_name_start(c: char) -> bool {
    matches!(
        c,
        ':' | '_'
            | 'A'..='Z'
            | 'a'..='z'
            | '\u{C0}'..='\u{D6}'
            | '\u{D8}'..='\u{F6}'
            | '\u{F8}'..='\u{2FF}'
            | '\u{370}'..='\u{37D}'
            // U+037E (Greek question mark) is deliberately excluded by the spec.
            | '\u{37F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}
169
170fn is_name_char(c: char) -> bool {
172 c == '-'
173 || c == '.'
174 || c.is_ascii_digit()
175 || c == unsafe { char::from_u32_unchecked(0xB7) }
176 || {
177 c >= unsafe { char::from_u32_unchecked(0x0300) }
178 && c >= unsafe { char::from_u32_unchecked(0x036F) }
179 }
180 || {
181 c >= unsafe { char::from_u32_unchecked(0x203F) }
182 && c >= unsafe { char::from_u32_unchecked(0x2040) }
183 }
184 || is_name_start(c)
185}
186
187#[tracable_parser]
188fn name(i: Span) -> Result<Name> {
189 map(
190 recognize(pair(
191 take_while_m_n(1, 1, is_name_start),
192 take_while(is_name_char),
193 )),
194 |n: Span| Name(n.to_string()),
195 )(i)
196}
197
/// A name token as recognised by the `nmtoken` parser, stored as an owned `String`.
///
/// Unlike `Name`, the first character is not restricted to name-start characters.
#[derive(Clone, Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct Nmtoken(String);
201
202#[tracable_parser]
204fn nmtoken(i: Span) -> Result<Nmtoken> {
205 map(recognize(take_while(is_name_char)), |s: Span| {
206 Nmtoken(s.to_string())
207 })(i)
208}
209
/// Newtype around a list of `Nmtoken` values.
///
/// Note that the `nmtokens` parser in this file returns a plain
/// `Vec<Nmtoken>` rather than this wrapper.
#[derive(Debug, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct Nmtokens(Vec<Nmtoken>);
213
214#[tracable_parser]
216fn nmtokens(i: Span) -> Result<Vec<Nmtoken>> {
217 separated_list1(multispace1, nmtoken)(i)
218}
219
/// Mixed content: `#PCDATA` together with a list of element names.
///
/// `Display` joins `#PCDATA` and every name with ` | ` and places the result
/// inside the `( | ` ... `)` template, so the output starts with `( | #PCDATA`.
// NOTE(review): the leading ` | ` in the format string looks unintended for a
// DTD content model — confirm with consumers of this output before changing it.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
#[display(
    fmt = "( | {})",
    "std::iter::once(\"#PCDATA\".to_string()).chain(_0.iter().map(|v|v.to_string())).collect::<Vec<_>>().join(\" | \")"
)]
pub struct MixedPCDATA(pub Vec<Name>);
226
/// Result of `name_or_reference`: either a plain name or a parameter-entity reference.
#[derive(Debug, TryInto)]
pub enum NameOrReference {
    /// A name parsed by `name`.
    Name(Name),
    /// A `%name;` reference parsed by `pereference`.
    Reference(PEReference),
}
232
233#[tracable_parser]
234fn map_name(i: Span) -> Result<NameOrReference> {
235 map(name, |n| NameOrReference::Name(n))(i)
236}
237
238#[tracable_parser]
239fn map_pereference(i: Span) -> Result<NameOrReference> {
240 map(pereference, |n| NameOrReference::Reference(n))(i)
241}
242
243#[tracable_parser]
244fn name_or_reference(i: Span) -> Result<NameOrReference> {
245 alt((map_name, map_pereference))(i)
246}
247
/// Numeric value of a character reference, tagged with the notation it was written in.
///
/// `Display` prints only the number (decimal, or lowercase hexadecimal for
/// `Hexadecimal`); the surrounding `&#` / `&#x` and `;` are not emitted.
// NOTE(review): because the delimiters are dropped, formatting a `CharRef`
// does not reproduce the reference as written — confirm this is intended.
#[derive(Clone, Debug, Display, TryInto)]
pub enum CharRef {
    /// Value parsed from the decimal form `&#N;`.
    #[display(fmt = "{}", "_0")]
    Decimal(isize),
    /// Value parsed from the hexadecimal form `&#xH;`.
    #[display(fmt = "{:x}", "_0")]
    Hexadecimal(isize),
}
255
256#[tracable_parser]
259fn char_ref(i: Span) -> Result<CharRef> {
260 alt((
261 map(
262 delimited(tag("&#"), is_a("0123456789"), tag(";")),
263 |v: Span| CharRef::Decimal(isize::from_str_radix(&v, 10).unwrap()),
264 ),
265 map(
266 delimited(tag("&#x"), is_a("0123456789abcdefABCDEF"), tag(";")),
267 |v: Span| CharRef::Hexadecimal(isize::from_str_radix(&v, 16).unwrap()),
268 ),
269 ))(i)
270}
271
/// The name inside an entity reference `&name;`, as produced by `entity_ref`.
#[derive(Clone, Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct EntityRef(Name);
274
/// The name inside a parameter-entity reference `%name;`, as produced by `pereference`.
#[derive(Clone, Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct PEReference(Name);
277
/// Result of the `reference` parser: a character reference or an entity reference.
#[derive(Clone, Debug, Display, TryInto)]
pub enum Reference {
    /// A character reference; displayed using `CharRef`'s own `Display`.
    #[display(fmt = "{}", "_0")]
    CharRef(CharRef),
    /// An entity reference; displayed as `&` + name + `;`.
    #[display(fmt = "&{};", "_0")]
    EntityRef(EntityRef),
}
285
286#[tracable_parser]
288fn reference(i: Span) -> Result<Reference> {
289 alt((
290 map(entity_ref, Reference::EntityRef),
291 map(char_ref, Reference::CharRef),
292 ))(i)
293}
294
295#[tracable_parser]
299fn pereference(i: Span) -> Result<PEReference> {
300 map(delimited(tag("%"), name, tag(";")), |n| PEReference(n))(i)
301}
302
303#[tracable_parser]
308fn entity_ref(i: Span) -> Result<EntityRef> {
309 map(tuple((tag("&"), name, tag(";"))), |(_, n, _)| EntityRef(n))(i)
310}
311
/// The text between the quotes of a system literal, as captured by `system_literal`.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct SystemLiteral(String);
314
315#[tracable_parser]
317fn system_literal(i: Span) -> Result<SystemLiteral> {
318 map(
319 alt((
320 delimited(char('"'), take_until("\""), char('"')),
321 delimited(char('\''), take_until("'"), char('\'')),
322 )),
323 |sl: Span| SystemLiteral(sl.to_string()),
324 )(i)
325}
326
/// The text between the quotes of a public identifier literal, as captured by `pubid_literal`.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct PubidLiteral(String);
329
330#[tracable_parser]
332fn pubid_literal(i: Span) -> Result<PubidLiteral> {
333 map(
334 alt((
335 delimited(char('"'), take_till(is_pubid_char), char('"')),
336 delimited(char('\''), take_till(is_pubid_char), char('\'')),
337 )),
338 |s: Span| PubidLiteral(s.to_string()),
339 )(i)
340}
341
/// Returns `true` when `c` is NOT allowed inside a public identifier literal.
///
/// Despite its name this is the stop predicate handed to `take_till`: it
/// yields `false` for space, `\r`, `\n`, ASCII letters and digits and the
/// listed punctuation, and `true` for every other character.
fn is_pubid_char(c: char) -> bool {
    let allowed = matches!(c, ' ' | '\r' | '\n')
        || c.is_ascii_alphanumeric()
        || "-'()+,./:=?;!*#@$_%".contains(c);
    !allowed
}
350
/// One top-level declaration as returned by `parse` and `parse_str`.
///
/// `Display` for every variant forwards to the wrapped declaration.
#[derive(Debug, Display, TryInto)]
pub enum ElementType {
    /// A declaration parsed by `element::element_decl`.
    #[display(fmt = "{}", "_0")]
    Element(element::ElementDecl),
    /// A declaration parsed by `entity::entity_decl`.
    #[display(fmt = "{}", "_0")]
    Entity(entity::EntityDecl),
    /// A declaration parsed by `attlist::attlist_decl`.
    #[display(fmt = "{}", "_0")]
    Attlist(attlist::AttlistDecl),
    /// A comment parsed by `comment_decl`.
    #[display(fmt = "{}", "_0")]
    Comment(CommentDecl),
}
362
363fn entity_definitions(i: Span) -> IndexMap<String, entity::PEDecl> {
364 iterator(i, alt((map(entity::pedecl, Some), map(anychar, |_| None))))
365 .filter_map(|entity| entity.map(|entity| (entity.name().to_string(), entity)))
366 .collect()
367}
368
369pub fn resolve_entity_definitions<P: AsRef<Path>, I: Into<Option<P>>>(
370 i: Span,
371 path: I,
372) -> IndexMap<String, String> {
373 let path = path.into();
374 let definitions = entity_definitions(i);
375 let iter = definitions.into_iter();
376 let mut definitions: IndexMap<String, String> = IndexMap::new();
377 for (name, definition) in iter {
378 match definition.pedef {
379 entity::PEDef::EntityValue(values) => {
380 let mut value = Vec::with_capacity(values.len());
381 for value_or_reference in values.into_iter() {
382 let v = match value_or_reference {
384 entity::ValueOrReference::Value(value) => value.into(),
385 entity::ValueOrReference::Reference(reference) => reference.to_string(),
386 entity::ValueOrReference::PEReference(pereference) => {
387 match definitions.get(&pereference.to_string()) {
388 Some(def) => def.to_owned(),
389 None => {
390 eprintln!(
391 "ERROR: PEReference(`{}`) is not defined yet.",
392 pereference
393 );
394 continue;
395 }
396 }
397 }
398 };
399 value.push(v);
400 }
401 definitions.insert(name.to_owned(), value.join(" "));
402 }
403 entity::PEDef::ExternalID(external_id) => match external_id {
404 entity::ExternalID::SystemLiteral(system_literal) => {
405 eprintln!(
406 "ERROR: ExternalID SystemLiteral(`{}`) not implemented yet, this will cause problom.",
407 system_literal
408 );
409 continue;
410 }
411 entity::ExternalID::PubidLiteralWithSystemLiteral(
412 _pubid_literal,
413 system_literal,
414 ) => {
415 if system_literal.starts_with("http") || system_literal.starts_with("ftp") {
416 eprintln!(
417 "ERROR: ExternalID PubidLiteral SystemLiteral(`{}`) from network not implemented yet, this will cause problom.",
418 system_literal
419 );
420 }
421 let include = if let Some(ref path) = path {
422 let path: &Path = path.as_ref();
423 if let Some(_ext) = path.extension() {
424 path.canonicalize()
425 .unwrap()
426 .with_file_name(system_literal.as_ref())
427 } else {
428 path.canonicalize().unwrap().join(system_literal.as_ref())
429 }
430 } else {
431 PathBuf::from(system_literal.as_ref())
432 };
433 match std::fs::read_to_string(&include) {
435 Err(err) => {
436 eprintln!(
437 "ERROR: Failed to include ExternalID PubidLiteral SystemLiteral(`{}`), {}",
438 system_literal,
439 &err
440 );
441 }
442 Ok(included) => definitions.extend(
443 resolve_entity_definitions::<PathBuf, Option<PathBuf>>(
444 Span::new_extra(&included, i.extra),
445 include.into(),
446 )
447 .into_iter(),
448 ),
449 }
450 continue;
451 }
452 },
453 }
454 }
455 definitions
456}
457
458pub fn resolve_references(i: Span, definitions: &IndexMap<String, String>) -> String {
459 iterator(
460 i,
461 alt((
462 map(delimited(tag("%"), name, tag(";")), Either::Left),
463 map(recognize(anychar), Either::Right),
464 )),
465 )
466 .map(|either| match either {
467 Either::Left(name) => match definitions.get(name.as_ref()) {
468 Some(definition) => definition.as_str(),
469 None => {
470 eprintln!("ERROR: PEReference(`{}`) is not defined yet.", &name);
471 ""
472 }
473 },
474 Either::Right(chars) => *chars,
475 })
476 .collect::<Vec<_>>()
477 .join("")
478}
479
480pub fn parse<F: AsRef<Path>>(f: F) -> std::result::Result<Vec<ElementType>, String> {
481 let f = f.as_ref();
482 let content =
483 std::fs::read_to_string(f).expect(&format!("Can not read from file {}", f.display()));
484
485 let tracer = TracableInfo::new().fold("entity-resolver");
486 let span = LocatedSpan::new_extra(content.as_str(), tracer);
487
488 let definitions = resolve_entity_definitions::<&Path, Option<&Path>>(span, f.into());
489 let span = LocatedSpan::new_extra(content.as_str(), tracer);
490 let resolved = resolve_references(span, &definitions);
491 let span = LocatedSpan::new_extra(resolved.as_str(), tracer);
494 #[cfg(feature = "trace")]
495 histogram();
496 #[cfg(feature = "trace")]
497 cumulative_histogram();
498 let result = terminated(
499 many0(alt((
500 map(
501 delimited(multispace0, attlist::attlist_decl, multispace0),
502 ElementType::Attlist,
503 ),
504 map(
505 delimited(multispace0, element::element_decl, multispace0),
506 ElementType::Element,
507 ),
508 map(
509 delimited(multispace0, entity::entity_decl, multispace0),
510 ElementType::Entity,
511 ),
512 map(
513 delimited(multispace0, comment_decl, multispace0),
514 ElementType::Comment,
515 ),
516 ))),
517 eof,
518 )(span)
519 .finish()
520 .map(|(_, definitions)| definitions)
521 .map_err(|err| convert_error(span, err));
522 result
523}
524
525pub fn parse_str(i: &str) -> std::result::Result<Vec<ElementType>, String> {
526 let tracer = TracableInfo::new().fold("entity-resolver");
527 let span = LocatedSpan::new_extra(i, tracer);
528 let definitions = resolve_entity_definitions::<&str, Option<&str>>(span, None);
529 let span = LocatedSpan::new_extra(i, tracer);
530 let resolved = resolve_references(span, &definitions);
531 let span = LocatedSpan::new_extra(resolved.as_str(), tracer);
532 #[cfg(feature = "trace")]
533 histogram();
534 #[cfg(feature = "trace")]
535 cumulative_histogram();
536 let result = terminated(
537 many0(alt((
538 map(
539 delimited(multispace0, attlist::attlist_decl, multispace0),
540 ElementType::Attlist,
541 ),
542 map(
543 delimited(multispace0, element::element_decl, multispace0),
544 ElementType::Element,
545 ),
546 map(
547 delimited(multispace0, entity::entity_decl, multispace0),
548 ElementType::Entity,
549 ),
550 map(
551 delimited(multispace0, comment_decl, multispace0),
552 ElementType::Comment,
553 ),
554 ))),
555 eof,
556 )(span)
557 .finish()
558 .map(|(_, elements)| elements)
559 .map_err(|err| convert_error(span, err));
560 result
561}
562
// Unit tests for the parsers defined directly in this file.
#[cfg(test)]
mod tests {
    use nom::Finish;

    use super::{comment_decl, pereference, span};

    /// A multi-line comment containing `=`, `:`, `$` and `/` characters must
    /// be accepted by `comment_decl`.
    #[test]
    fn test_comment_decl() {
        let result = comment_decl(span(
            r#"<!--
======================================================================
 Docutils Generic DTD
======================================================================
:Author: David Goodger
:Contact: docutils-develop@lists.sourceforge.net
:Revision: $Revision: 8767 $
:Date: $Date: 2021-06-17 16:33:28 +0200 (Do, 17. Jun 2021) $
:Copyright: This DTD has been placed in the public domain.
:Filename: docutils.dtd

More information about this DTD (document type definition) and the
Docutils project can be found at http://docutils.sourceforge.net/.
The latest version of this DTD is available from
http://docutils.sourceforge.net/docs/ref/docutils.dtd.

The formal public identifier for this DTD is::

 +//IDN docutils.sourceforge.net//DTD Docutils Generic//EN//XML
-->"#,
        ))
        .finish();
        // The failure message is only formatted when the assertion fails.
        assert!(result.is_ok(), "{:?}", result.as_ref().unwrap_err());
    }

    /// A reference whose name contains `-` and `.` must be accepted by `pereference`.
    #[test]
    fn test_pereference() {
        let result = pereference(span("%align-h.att;")).finish();
        assert!(result.is_ok(), "{:?}", result.as_ref().unwrap_err());
    }
}
603
/// Test-only assertion that `$res` is `Ok`.
///
/// On `Err` the test panics with the error rendered by
/// `nom_greedyerror::convert_error` against the input `$span`.
#[cfg(test)]
#[macro_export]
macro_rules! assert_ok {
    ($span:ident, $res:expr) => {
        if let Err(err) = $res {
            panic!("{}", ::nom_greedyerror::convert_error($span, err));
        }
    };
}
617}