#[macro_use]
extern crate derive_more;
use std::path::{Path, PathBuf};
use either::Either;
use indexmap::IndexMap;
use nom::branch::alt;
use nom::bytes::complete::{is_a, tag, take_till, take_until, take_while, take_while_m_n};
use nom::character::complete::{anychar, char, multispace0, multispace1};
use nom::combinator::{eof, iterator, map, recognize, value};
use nom::error::ErrorKind;
use nom::multi::{many0, separated_list1};
use nom::sequence::{delimited, pair, terminated, tuple};
use nom::Finish;
use nom_greedyerror::{convert_error, GreedyError};
use nom_locate::LocatedSpan;
#[cfg(feature = "trace")]
use nom_tracable::{cumulative_histogram, histogram};
use nom_tracable::{tracable_parser, TracableInfo};
mod attlist;
mod element;
mod entity;
pub use attlist::{
AttDef, AttType, AttValue, AttlistDecl, DefaultDecl, EnumeratedType, Enumeration, NotationType,
};
pub use element::{Child, Choices, ElementCategory, ElementDecl, Seq};
/// Input type taken by the parsers in this file: a `&str` wrapped in a
/// `LocatedSpan` that carries a `TracableInfo` as its extra data.
type Span<'i> = LocatedSpan<&'i str, TracableInfo>;
/// Result type returned by the parsers in this file: a nom `IResult` over
/// [`Span`] whose error type is `GreedyError<Span, ErrorKind>`.
type Result<'i, T> = nom::IResult<Span<'i>, T, GreedyError<Span<'i>, ErrorKind>>;
/// Test helper: wraps `i` in a [`Span`] carrying a freshly created
/// `TracableInfo`.
#[cfg(test)]
fn span(i: &str) -> Span {
    Span::new_extra(i, TracableInfo::new())
}
/// Wraps the parser `f` so that every `Err` it returns is printed to stdout
/// before being handed back unchanged.
///
/// The printed line consists of `context`, the `Debug` form of the error and
/// the input the parser was run on. Successful results pass through untouched.
fn dbg_dmp<'i, F, O, E: std::fmt::Debug>(
    mut f: F,
    context: &'static str,
) -> impl FnMut(&'i str) -> nom::IResult<&'i str, O, E>
where
    F: FnMut(&'i str) -> nom::IResult<&'i str, O, E>,
{
    move |i: &'i str| {
        let outcome = f(i);
        if let Err(ref e) = outcome {
            println!("{}: Error({:?}) at:\n{}", context, e, i);
        }
        outcome
    }
}
/// Newtype around a `String`.
///
/// NOTE(review): presumably holds parsed character data (`#PCDATA`); nothing
/// in this file constructs it — confirm against the sub-modules.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct PCDATA(String);
/// Newtype around a `String`.
///
/// NOTE(review): presumably holds character data (`CDATA`); nothing in this
/// file constructs it — confirm against the sub-modules.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct CDATA(String);
/// A value of type `T` tagged with an occurrence indicator.
///
/// `Display` prints the inner value followed by the indicator: nothing, `+`,
/// `?` or `*`.
#[derive(Debug, Display)]
pub enum Repeatable<T> {
/// Displayed as the inner value with no suffix.
#[display(fmt = "{}", "_0")]
Once(T),
/// Displayed as the inner value followed by `+`.
#[display(fmt = "{}+", "_0")]
AtLeastOnce(T),
/// Displayed as the inner value followed by `?`.
#[display(fmt = "{}?", "_0")]
AtMostOnce(T),
/// Displayed as the inner value followed by `*`.
#[display(fmt = "{}*", "_0")]
ZeroOrManyTimes(T),
}
/// Marker produced by `comment_decl` for a parsed comment; it carries no
/// data because the comment text is discarded.
#[derive(Debug, Display, AsMut, AsRef)]
pub struct CommentDecl;
/// Parses a comment of the form `<!-- ... -->` and discards its text.
///
/// The body is everything between the opening `<!--` and the first `-->`
/// that follows it. The body may be empty, so `<!---->` is accepted.
///
/// No dashes are consumed separately after the opener: doing so would also
/// swallow the dashes that belong to the closing `-->` whenever the body
/// consists of dashes only, making such comments unparsable.
#[tracable_parser]
fn comment_decl(i: Span) -> Result<CommentDecl> {
    map(
        value((), tuple((tag("<!--"), take_until("-->"), tag("-->")))),
        |_| CommentDecl,
    )(i)
}
/// Newtype around a `String` holding the text matched by the `name` parser.
#[derive(Clone, Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct Name(String);
impl Name {
    /// Returns an owned copy of the wrapped string.
    fn to_string(&self) -> String {
        self.0.clone()
    }
}
/// Returns `true` when `c` may begin an XML name, i.e. when it matches the
/// `NameStartChar` production of XML 1.0:
///
/// ```text
/// ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF]
///     | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F]
///     | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD]
///     | [#x10000-#xEFFFF]
/// ```
///
/// Each range is inclusive on both ends.
fn is_name_start(c: char) -> bool {
    match c {
        ':' | '_' | 'A'..='Z' | 'a'..='z' => true,
        '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' => true,
        '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' => true,
        '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' | '\u{2C00}'..='\u{2FEF}' => true,
        '\u{3001}'..='\u{D7FF}' | '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' => true,
        '\u{10000}'..='\u{EFFFF}' => true,
        _ => false,
    }
}
/// Returns `true` when `c` may appear inside an XML name, i.e. when it
/// matches the `NameChar` production of XML 1.0:
///
/// ```text
/// NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
/// ```
///
/// Each range is inclusive on both ends; the `NameStartChar` part is
/// delegated to `is_name_start`.
fn is_name_char(c: char) -> bool {
    match c {
        '-' | '.' | '0'..='9' | '\u{B7}' => true,
        '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}' => true,
        _ => is_name_start(c),
    }
}
/// Parses a name: exactly one character accepted by `is_name_start`
/// followed by zero or more characters accepted by `is_name_char`.
///
/// The whole matched text is copied into a [`Name`].
#[tracable_parser]
fn name(i: Span) -> Result<Name> {
    let first = take_while_m_n(1, 1, is_name_start);
    let rest = take_while(is_name_char);
    map(recognize(pair(first, rest)), |matched: Span| {
        Name(matched.to_string())
    })(i)
}
/// Newtype around a `String` holding the text matched by the `nmtoken`
/// parser.
#[derive(Clone, Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct Nmtoken(String);
/// Parses a name token: the longest run of characters accepted by
/// `is_name_char`, copied into an [`Nmtoken`].
///
/// NOTE(review): the run may be empty, so this parser succeeds on any input,
/// whereas the XML `Nmtoken` production requires at least one character.
/// Confirm whether callers rely on the lenient behavior before tightening it.
#[tracable_parser]
fn nmtoken(i: Span) -> Result<Nmtoken> {
    let run = recognize(take_while(is_name_char));
    map(run, |token: Span| Nmtoken(token.to_string()))(i)
}
/// Newtype around a `Vec<Nmtoken>`.
///
/// Note that the `nmtokens` parser in this file returns a bare
/// `Vec<Nmtoken>` rather than this wrapper.
#[derive(Debug, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct Nmtokens(Vec<Nmtoken>);
/// Parses one or more `nmtoken`s separated by at least one whitespace
/// character (`multispace1`) and returns them in order.
#[tracable_parser]
fn nmtokens(i: Span) -> Result<Vec<Nmtoken>> {
    let mut tokens = separated_list1(multispace1, nmtoken);
    tokens(i)
}
/// List of element names that accompany `#PCDATA` in a mixed-content
/// declaration.
///
/// `Display` joins `#PCDATA` and every name with ` | ` and wraps the result
/// as `( | ...)`.
///
/// NOTE(review): the leading ` | ` inside the parenthesis comes straight
/// from the format string and looks unintended — confirm the desired output.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
#[display(
fmt = "( | {})",
"std::iter::once(\"#PCDATA\".to_string()).chain(_0.iter().map(|v|v.to_string())).collect::<Vec<_>>().join(\" | \")"
)]
pub struct MixedPCDATA(pub Vec<Name>);
/// Either a plain name or a parameter-entity reference, as produced by
/// `name_or_reference`.
#[derive(Debug, TryInto)]
pub enum NameOrReference {
/// A name matched by the `name` parser.
Name(Name),
/// A `%name;` reference matched by the `pereference` parser.
Reference(PEReference),
}
/// Parses a `name` and wraps it in `NameOrReference::Name`.
#[tracable_parser]
fn map_name(i: Span) -> Result<NameOrReference> {
    map(name, NameOrReference::Name)(i)
}
/// Parses a `pereference` and wraps it in `NameOrReference::Reference`.
#[tracable_parser]
fn map_pereference(i: Span) -> Result<NameOrReference> {
    map(pereference, NameOrReference::Reference)(i)
}
/// Parses either a plain name or a `%name;` reference, trying the plain
/// name first.
#[tracable_parser]
fn name_or_reference(i: Span) -> Result<NameOrReference> {
    let mut either = alt((map_name, map_pereference));
    either(i)
}
/// Numeric value of a character reference, as produced by `char_ref`.
///
/// `Display` prints only the number (decimal or lower-case hexadecimal),
/// without the surrounding `&#` / `&#x` and `;`.
#[derive(Clone, Debug, Display, TryInto)]
pub enum CharRef {
/// Value parsed from the decimal form `&#<digits>;`.
#[display(fmt = "{}", "_0")]
Decimal(isize),
/// Value parsed from the hexadecimal form `&#x<hex digits>;`.
#[display(fmt = "{:x}", "_0")]
Hexadecimal(isize),
}
#[tracable_parser]
fn char_ref(i: Span) -> Result<CharRef> {
alt((
map(
delimited(tag("&#"), is_a("0123456789"), tag(";")),
|v: Span| CharRef::Decimal(isize::from_str_radix(&v, 10).unwrap()),
),
map(
delimited(tag("&#x"), is_a("0123456789abcdefABCDEF"), tag(";")),
|v: Span| CharRef::Hexadecimal(isize::from_str_radix(&v, 16).unwrap()),
),
))(i)
}
/// Name taken from an entity reference of the form `&name;`, as produced by
/// `entity_ref`.
#[derive(Clone, Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct EntityRef(Name);
/// Name taken from a parameter-entity reference of the form `%name;`, as
/// produced by `pereference`.
#[derive(Clone, Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct PEReference(Name);
/// Either a numeric character reference or an entity reference, as produced
/// by `reference`.
#[derive(Clone, Debug, Display, TryInto)]
pub enum Reference {
/// Numeric character reference; displayed using `CharRef`'s own `Display`.
#[display(fmt = "{}", "_0")]
CharRef(CharRef),
/// Entity reference; displayed as `&name;`.
#[display(fmt = "&{};", "_0")]
EntityRef(EntityRef),
}
/// Parses a reference: an entity reference (`&name;`) is tried first, then
/// a numeric character reference.
#[tracable_parser]
fn reference(i: Span) -> Result<Reference> {
    let entity = map(entity_ref, Reference::EntityRef);
    let character = map(char_ref, Reference::CharRef);
    alt((entity, character))(i)
}
#[tracable_parser]
fn pereference(i: Span) -> Result<PEReference> {
map(delimited(tag("%"), name, tag(";")), |n| PEReference(n))(i)
}
#[tracable_parser]
fn entity_ref(i: Span) -> Result<EntityRef> {
map(tuple((tag("&"), name, tag(";"))), |(_, n, _)| EntityRef(n))(i)
}
/// Newtype around a `String` holding the text between the quotes of a
/// literal matched by `system_literal`.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct SystemLiteral(String);
/// Parses a quoted literal delimited by either `"` or `'` and returns the
/// text between the quotes. The body runs up to the next occurrence of the
/// quote character that opened it.
#[tracable_parser]
fn system_literal(i: Span) -> Result<SystemLiteral> {
    let double_quoted = delimited(char('"'), take_until("\""), char('"'));
    let single_quoted = delimited(char('\''), take_until("'"), char('\''));
    map(alt((double_quoted, single_quoted)), |literal: Span| {
        SystemLiteral(literal.to_string())
    })(i)
}
/// Newtype around a `String` holding the text between the quotes of a
/// literal matched by `pubid_literal`.
#[derive(Debug, Display, AsMut, AsRef, Deref, DerefMut, Into)]
pub struct PubidLiteral(String);
#[tracable_parser]
fn pubid_literal(i: Span) -> Result<PubidLiteral> {
map(
alt((
delimited(char('"'), take_till(is_pubid_char), char('"')),
delimited(char('\''), take_till(is_pubid_char), char('\'')),
)),
|s: Span| PubidLiteral(s.to_string()),
)(i)
}
/// Stop predicate for the body of a public-identifier literal.
///
/// Despite its name, this returns `true` when `c` is NOT one of the allowed
/// characters (space, CR, LF, ASCII letters and digits, and the punctuation
/// listed below) and `false` when it is allowed.
fn is_pubid_char(c: char) -> bool {
    const PUNCTUATION: &str = "-'()+,./:=?;!*#@$_%";
    let allowed = matches!(c, ' ' | '\r' | '\n')
        || c.is_ascii_alphanumeric()
        || PUNCTUATION.contains(c);
    !allowed
}
/// One top-level declaration, as returned by `parse` and `parse_str`.
///
/// `Display` forwards to the wrapped declaration.
#[derive(Debug, Display, TryInto)]
pub enum ElementType {
/// Declaration parsed by `element::element_decl`.
#[display(fmt = "{}", "_0")]
Element(element::ElementDecl),
/// Declaration parsed by `entity::entity_decl`.
#[display(fmt = "{}", "_0")]
Entity(entity::EntityDecl),
/// Declaration parsed by `attlist::attlist_decl`.
#[display(fmt = "{}", "_0")]
Attlist(attlist::AttlistDecl),
/// Comment parsed by `comment_decl`.
#[display(fmt = "{}", "_0")]
Comment(CommentDecl),
}
fn entity_definitions(i: Span) -> IndexMap<String, entity::PEDecl> {
iterator(i, alt((map(entity::pedecl, Some), map(anychar, |_| None))))
.filter_map(|entity| entity.map(|entity| (entity.name().to_string(), entity)))
.collect()
}
/// Collects the parameter-entity declarations found in `i` and resolves each
/// one to its replacement text.
///
/// # Parameters
///
/// * `i` – the input to scan for declarations.
/// * `path` – optional location the input was read from. It is used as the
///   base for locating externally referenced files: when it has an
///   extension, its file name is replaced by the referenced one; otherwise
///   the referenced name is appended to it. With `None`, the referenced name
///   is used as given.
///
/// # Returns
///
/// A map from entity name to replacement text, in declaration order.
///
/// # Behavior
///
/// * An inline value is assembled from its parts, joined with a single
///   space. References to parameter entities are replaced by the text
///   resolved so far; a reference to a not-yet-defined entity is reported on
///   stderr and that part is skipped.
/// * An external entity with a public and a system identifier is read from
///   disk and processed recursively; the definitions it yields are merged
///   into the result.
/// * An external entity with only a system identifier is not supported and
///   is reported on stderr.
///
/// Every failure (missing file, unresolvable base path, unsupported form) is
/// reported on stderr and the affected entity is skipped; this function does
/// not panic on I/O errors.
pub fn resolve_entity_definitions<P: AsRef<Path>, I: Into<Option<P>>>(
    i: Span,
    path: I,
) -> IndexMap<String, String> {
    let path = path.into();
    let mut definitions: IndexMap<String, String> = IndexMap::new();
    for (name, definition) in entity_definitions(i) {
        match definition.pedef {
            entity::PEDef::EntityValue(values) => {
                let mut parts: Vec<String> = Vec::with_capacity(values.len());
                for value_or_reference in values.into_iter() {
                    let part = match value_or_reference {
                        entity::ValueOrReference::Value(value) => value.into(),
                        entity::ValueOrReference::Reference(reference) => reference.to_string(),
                        entity::ValueOrReference::PEReference(pereference) => {
                            match definitions.get(&pereference.to_string()) {
                                Some(def) => def.to_owned(),
                                None => {
                                    eprintln!(
                                        "ERROR: PEReference(`{}`) is not defined yet.",
                                        pereference
                                    );
                                    // Skip only this part; the rest of the value is kept.
                                    continue;
                                }
                            }
                        }
                    };
                    parts.push(part);
                }
                definitions.insert(name, parts.join(" "));
            }
            entity::PEDef::ExternalID(external_id) => match external_id {
                entity::ExternalID::SystemLiteral(system_literal) => {
                    eprintln!(
                        "ERROR: ExternalID SystemLiteral(`{}`) not implemented yet, this will cause problom.",
                        system_literal
                    );
                }
                entity::ExternalID::PubidLiteralWithSystemLiteral(
                    _pubid_literal,
                    system_literal,
                ) => {
                    if system_literal.starts_with("http") || system_literal.starts_with("ftp") {
                        eprintln!(
                            "ERROR: ExternalID PubidLiteral SystemLiteral(`{}`) from network not implemented yet, this will cause problom.",
                            system_literal
                        );
                    }
                    // The prefix test above also matches ordinary file names
                    // such as `httpd.dtd`, so a local read is still attempted.
                    let include = if let Some(ref path) = path {
                        let path: &Path = path.as_ref();
                        let base = match path.canonicalize() {
                            Ok(base) => base,
                            Err(err) => {
                                eprintln!(
                                    "ERROR: Failed to resolve base path `{}`, {}",
                                    path.display(),
                                    err
                                );
                                continue;
                            }
                        };
                        if path.extension().is_some() {
                            // `path` names a file: look next to it.
                            base.with_file_name(system_literal.as_ref())
                        } else {
                            // `path` is treated as a directory: look inside it.
                            base.join(system_literal.as_ref())
                        }
                    } else {
                        PathBuf::from(system_literal.as_ref())
                    };
                    match std::fs::read_to_string(&include) {
                        Err(err) => {
                            eprintln!(
                                "ERROR: Failed to include ExternalID PubidLiteral SystemLiteral(`{}`), {}",
                                system_literal,
                                &err
                            );
                        }
                        Ok(included) => definitions.extend(
                            resolve_entity_definitions::<PathBuf, Option<PathBuf>>(
                                Span::new_extra(&included, i.extra),
                                include.into(),
                            )
                            .into_iter(),
                        ),
                    }
                }
            },
        }
    }
    definitions
}
/// Returns a copy of `i` in which every `%name;` occurrence is replaced by
/// the text registered for `name` in `definitions`.
///
/// The input is walked from left to right: at each position either a whole
/// `%name;` reference is consumed, or a single character is copied through
/// unchanged. A reference whose name is missing from `definitions` is
/// reported on stderr and replaced by nothing.
pub fn resolve_references(i: Span, definitions: &IndexMap<String, String>) -> String {
    let mut scanner = iterator(
        i,
        alt((
            map(delimited(tag("%"), name, tag(";")), Either::Left),
            map(recognize(anychar), Either::Right),
        )),
    );
    let mut resolved = String::new();
    for piece in &mut scanner {
        match piece {
            Either::Left(name) => match definitions.get(name.as_ref()) {
                Some(definition) => resolved.push_str(definition.as_str()),
                None => eprintln!("ERROR: PEReference(`{}`) is not defined yet.", &name),
            },
            Either::Right(chars) => resolved.push_str(*chars),
        }
    }
    resolved
}
/// Reads the file at `f` and parses it into a list of declarations.
///
/// Processing happens in three steps:
/// 1. parameter-entity declarations are collected and resolved (external
///    files are located relative to `f`),
/// 2. every `%name;` reference in the text is replaced by its resolved text,
/// 3. the resulting text is parsed as a sequence of attribute-list, element
///    and entity declarations and comments, each optionally surrounded by
///    whitespace, up to the end of input.
///
/// # Errors
///
/// Returns `Err` with a human-readable message when the file cannot be read
/// or when the resolved text does not parse completely.
pub fn parse<F: AsRef<Path>>(f: F) -> std::result::Result<Vec<ElementType>, String> {
    let f = f.as_ref();
    // An unreadable file is reported through the `Err` channel of the return
    // type instead of aborting the caller with a panic.
    let content = std::fs::read_to_string(f)
        .map_err(|err| format!("Can not read from file {}: {}", f.display(), err))?;
    let tracer = TracableInfo::new().fold("entity-resolver");
    let span = LocatedSpan::new_extra(content.as_str(), tracer);
    let definitions = resolve_entity_definitions::<&Path, Option<&Path>>(span, Some(f));
    let span = LocatedSpan::new_extra(content.as_str(), tracer);
    let resolved = resolve_references(span, &definitions);
    let span = LocatedSpan::new_extra(resolved.as_str(), tracer);
    #[cfg(feature = "trace")]
    histogram();
    #[cfg(feature = "trace")]
    cumulative_histogram();
    let result = terminated(
        many0(alt((
            map(
                delimited(multispace0, attlist::attlist_decl, multispace0),
                ElementType::Attlist,
            ),
            map(
                delimited(multispace0, element::element_decl, multispace0),
                ElementType::Element,
            ),
            map(
                delimited(multispace0, entity::entity_decl, multispace0),
                ElementType::Entity,
            ),
            map(
                delimited(multispace0, comment_decl, multispace0),
                ElementType::Comment,
            ),
        ))),
        eof,
    )(span)
    .finish()
    .map(|(_, definitions)| definitions)
    .map_err(|err| convert_error(span, err));
    result
}
/// Parses the text `i` into a list of declarations.
///
/// Parameter-entity declarations are resolved first (with no base path for
/// external files), then every `%name;` reference is substituted, and
/// finally the resulting text is parsed as a sequence of attribute-list,
/// element and entity declarations and comments, each optionally surrounded
/// by whitespace, up to the end of input.
///
/// # Errors
///
/// Returns `Err` with the rendered parse error when the resolved text does
/// not parse completely.
pub fn parse_str(i: &str) -> std::result::Result<Vec<ElementType>, String> {
    let tracer = TracableInfo::new().fold("entity-resolver");
    let definitions = resolve_entity_definitions::<&str, Option<&str>>(
        LocatedSpan::new_extra(i, tracer),
        None,
    );
    let resolved = resolve_references(LocatedSpan::new_extra(i, tracer), &definitions);
    let span = LocatedSpan::new_extra(resolved.as_str(), tracer);
    #[cfg(feature = "trace")]
    histogram();
    #[cfg(feature = "trace")]
    cumulative_histogram();
    // Alternatives are tried in this order for every declaration.
    let declaration = alt((
        map(
            delimited(multispace0, attlist::attlist_decl, multispace0),
            ElementType::Attlist,
        ),
        map(
            delimited(multispace0, element::element_decl, multispace0),
            ElementType::Element,
        ),
        map(
            delimited(multispace0, entity::entity_decl, multispace0),
            ElementType::Entity,
        ),
        map(
            delimited(multispace0, comment_decl, multispace0),
            ElementType::Comment,
        ),
    ));
    let result = match terminated(many0(declaration), eof)(span).finish() {
        Ok((_, elements)) => Ok(elements),
        Err(err) => Err(convert_error(span, err)),
    };
    result
}
// Unit tests for the parsers defined directly in this file.
#[cfg(test)]
mod tests {
use nom::Finish;
use super::{comment_decl, pereference, span};
// A long, multi-line comment (the header of the Docutils DTD) must be
// accepted by `comment_decl`.
#[test]
fn test_comment_decl() {
let result = comment_decl(span(
r#"<!--
======================================================================
Docutils Generic DTD
======================================================================
:Author: David Goodger
:Contact: docutils-develop@lists.sourceforge.net
:Revision: $Revision: 8767 $
:Date: $Date: 2021-06-17 16:33:28 +0200 (Do, 17. Jun 2021) $
:Copyright: This DTD has been placed in the public domain.
:Filename: docutils.dtd
More information about this DTD (document type definition) and the
Docutils project can be found at http://docutils.sourceforge.net/.
The latest version of this DTD is available from
http://docutils.sourceforge.net/docs/ref/docutils.dtd.
The formal public identifier for this DTD is::
+//IDN docutils.sourceforge.net//DTD Docutils Generic//EN//XML
-->"#,
))
.finish();
// On failure, show the parse error as the assertion message.
assert!(result.is_ok(), "{:?}", result.as_ref().unwrap_err());
}
// A reference whose name contains `-` and `.` must be accepted by
// `pereference`.
#[test]
fn test_pereference() {
let result = pereference(span("%align-h.att;")).finish();
assert!(result.is_ok(), "{:?}", result.as_ref().unwrap_err());
}
}
/// Test helper: does nothing when `$res` is `Ok`; otherwise panics with the
/// error rendered by `nom_greedyerror::convert_error` against `$span`.
#[cfg(test)]
#[macro_export]
macro_rules! assert_ok {
    ($span:ident, $res:expr) => {
        if let Err(err) = $res {
            panic!("{}", ::nom_greedyerror::convert_error($span, err));
        }
    };
}