use std::borrow::Cow;
use std::fmt;
use crate::marked_sections::MarkedSectionStatus;
use crate::{entities, text, SgmlFragment};
mod error;
pub mod events;
pub mod raw;
pub mod util;
pub use error::*;
/// Parses `input` with a [`Parser`] in its default configuration.
///
/// Shorthand for `Parser::new().parse(input)`; any failure is reported through
/// `crate::Result`.
pub fn parse(input: &str) -> crate::Result<SgmlFragment> {
    let parser = Parser::new();
    parser.parse(input)
}
#[derive(Debug, Default)]
pub struct Parser {
config: ParserConfig,
}
impl Parser {
pub fn new() -> Self {
Default::default()
}
pub fn builder() -> ParserBuilder {
ParserBuilder::new()
}
pub fn parse<'a>(&self, input: &'a str) -> crate::Result<SgmlFragment<'a>> {
self.parse_with_detailed_errors::<ContextualizedError<_>>(input)
.map_err(|err| crate::Error::ParseError(err.describe(&input)))
}
pub fn parse_with_detailed_errors<'a, E>(&self, input: &'a str) -> Result<SgmlFragment<'a>, E>
where
E: nom::error::ParseError<&'a str>
+ nom::error::ContextError<&'a str>
+ nom::error::FromExternalError<&'a str, crate::Error>,
{
use nom::Finish;
let (rest, events) = events::document_entity::<E>(input, &self.config).finish()?;
debug_assert!(rest.is_empty(), "document_entity should be all_consuming");
let events = events.collect::<Vec<_>>();
Ok(SgmlFragment::from(events))
}
}
/// Settings that steer the parser.
pub struct ParserConfig {
    /// When `true`, [`ParserConfig::trim`] strips SGML whitespace from both ends of text.
    pub trim_whitespace: bool,
    /// Case-normalization policy for names.
    pub name_normalization: NameNormalization,
    /// Policy for marked sections.
    pub marked_section_handling: MarkedSectionHandling,
    /// NOTE(review): presumably makes the parser drop markup declarations; the
    /// consumer of this flag is outside this file, so confirm there.
    pub ignore_markup_declarations: bool,
    /// NOTE(review): presumably makes the parser drop processing instructions; the
    /// consumer of this flag is outside this file, so confirm there.
    pub ignore_processing_instructions: bool,
    /// Resolver handed to `entities::expand_entities` by [`ParserConfig::parse_rcdata`].
    entity_fn: Option<EntityFn>,
    /// Resolver handed to `entities::expand_parameter_entities` by
    /// [`ParserConfig::parse_markup_declaration_text`].
    parameter_entity_fn: Option<EntityFn>,
}

/// Boxed resolver callback: receives a `&str` and returns replacement text, or `None`.
type EntityFn = Box<dyn Fn(&str) -> Option<Cow<'static, str>>>;

impl ParserConfig {
    /// Returns `text` with SGML whitespace removed from both ends when
    /// `trim_whitespace` is set; otherwise returns `text` untouched.
    pub fn trim<'a>(&self, text: &'a str) -> &'a str {
        if !self.trim_whitespace {
            return text;
        }
        text.trim_matches(text::is_sgml_whitespace)
    }

    /// Expands entities in `rcdata` using the configured resolver.
    ///
    /// With no resolver configured, a resolver that always answers `None` is used.
    ///
    /// # Errors
    ///
    /// A failure from `entities::expand_entities` is converted into a nom error
    /// by `into_nom_failure`.
    pub fn parse_rcdata<'a, E>(&self, rcdata: &'a str) -> Result<Cow<'a, str>, nom::Err<E>>
    where
        E: nom::error::ContextError<&'a str> + nom::error::FromExternalError<&'a str, crate::Error>,
    {
        let resolver = self.entity_fn.as_deref().unwrap_or(&|_| None);
        match entities::expand_entities(rcdata, resolver) {
            Ok(expanded) => Ok(expanded),
            Err(err) => Err(into_nom_failure(rcdata, err)),
        }
    }

    /// Expands parameter entities in `text` using the configured resolver.
    ///
    /// With no resolver configured, a resolver that always answers `None` is used.
    ///
    /// # Errors
    ///
    /// A failure from `entities::expand_parameter_entities` is converted into a
    /// nom error by `into_nom_failure`.
    pub fn parse_markup_declaration_text<'a, E>(
        &self,
        text: &'a str,
    ) -> Result<Cow<'a, str>, nom::Err<E>>
    where
        E: nom::error::ContextError<&'a str> + nom::error::FromExternalError<&'a str, crate::Error>,
    {
        let resolver = self.parameter_entity_fn.as_deref().unwrap_or(&|_| None);
        match entities::expand_parameter_entities(text, resolver) {
            Ok(expanded) => Ok(expanded),
            Err(err) => Err(into_nom_failure(text, err)),
        }
    }
}
/// Case-normalization policy applied to names.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NameNormalization {
    /// Names are returned exactly as given.
    Unchanged,
    /// Names are converted with [`str::to_lowercase`].
    ToLowercase,
    /// Names are converted with [`str::to_uppercase`].
    ToUppercase,
}

impl Default for NameNormalization {
    /// The default policy leaves names unchanged.
    fn default() -> Self {
        NameNormalization::Unchanged
    }
}

impl NameNormalization {
    /// Applies this policy to `name`.
    ///
    /// The input is handed back as-is (no allocation, same `Cow` variant) whenever
    /// the conversion would not alter any character; otherwise an owned, converted
    /// string is returned.
    pub fn normalize<'a>(&self, name: Cow<'a, str>) -> Cow<'a, str> {
        match self {
            NameNormalization::Unchanged => name,
            NameNormalization::ToLowercase => {
                // Compare each character with its own mapping instead of asking
                // `is_uppercase`: titlecase letters (e.g. U+01C5) are neither upper-
                // nor lowercase, yet they do have a lowercase form.
                let already_lower = name
                    .chars()
                    .all(|c| c.to_lowercase().eq(std::iter::once(c)));
                if already_lower {
                    name
                } else {
                    Cow::Owned(name.to_lowercase())
                }
            }
            NameNormalization::ToUppercase => {
                // Same reasoning as above, using the uppercase mapping.
                let already_upper = name
                    .chars()
                    .all(|c| c.to_uppercase().eq(std::iter::once(c)));
                if already_upper {
                    name
                } else {
                    Cow::Owned(name.to_uppercase())
                }
            }
        }
    }
}
/// Policy for handling marked sections.
///
/// Within this file the policy only affects which status keywords
/// [`MarkedSectionHandling::parse_keywords`] accepts.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MarkedSectionHandling {
    /// Status keywords are interpreted by `MarkedSectionStatus::from_keywords`.
    KeepUnmodified,
    /// Only keywords that parse to `CData` or `RcData` are accepted.
    AcceptOnlyCharacterData,
    /// Status keywords are interpreted by `MarkedSectionStatus::from_keywords`.
    ExpandAll,
}

impl Default for MarkedSectionHandling {
    /// The default policy accepts character-data sections only.
    fn default() -> Self {
        Self::AcceptOnlyCharacterData
    }
}

impl MarkedSectionHandling {
    /// Interprets `status_keywords` under this policy.
    ///
    /// # Errors
    ///
    /// Under `AcceptOnlyCharacterData`, returns the whole `status_keywords` string
    /// when it fails to parse or parses to anything other than `CData`/`RcData`.
    /// Under the other policies, returns whatever
    /// `MarkedSectionStatus::from_keywords` reports.
    pub fn parse_keywords<'a>(
        &self,
        status_keywords: &'a str,
    ) -> Result<MarkedSectionStatus, &'a str> {
        match self {
            MarkedSectionHandling::KeepUnmodified | MarkedSectionHandling::ExpandAll => {
                MarkedSectionStatus::from_keywords(status_keywords)
            }
            MarkedSectionHandling::AcceptOnlyCharacterData => {
                let parsed = status_keywords.parse::<MarkedSectionStatus>();
                match parsed {
                    Ok(status)
                        if matches!(
                            status,
                            MarkedSectionStatus::CData | MarkedSectionStatus::RcData
                        ) =>
                    {
                        Ok(status)
                    }
                    _ => Err(status_keywords),
                }
            }
        }
    }
}
fn into_nom_failure<'a, E>(input: &'a str, err: entities::EntityError) -> nom::Err<E>
where
E: nom::error::ContextError<&'a str> + nom::error::FromExternalError<&'a str, crate::Error>,
{
use nom::Slice;
let slice = input.slice(err.position.clone());
nom::Err::Error(E::add_context(
slice,
if slice.starts_with("&#") {
"character reference"
} else {
"entity"
},
E::from_external_error(slice, nom::error::ErrorKind::MapRes, err.into()),
))
}
impl Default for ParserConfig {
fn default() -> Self {
ParserConfig {
trim_whitespace: true,
name_normalization: Default::default(),
marked_section_handling: Default::default(),
ignore_markup_declarations: false,
ignore_processing_instructions: false,
entity_fn: None,
parameter_entity_fn: None,
}
}
}
/// Manual `Debug` implementation, needed because the boxed resolver closures
/// do not implement `Debug`.
impl fmt::Debug for ParserConfig {
    /// Writes every setting under the name of the field that stores it.
    ///
    /// The two resolver fields are rendered through `omit`, so they appear as
    /// `Some(...)` when installed and `None` otherwise.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("ParserConfig")
            .field("trim_whitespace", &self.trim_whitespace)
            .field("name_normalization", &self.name_normalization)
            .field("marked_section_handling", &self.marked_section_handling)
            .field(
                "ignore_markup_declarations",
                &self.ignore_markup_declarations,
            )
            .field(
                "ignore_processing_instructions",
                &self.ignore_processing_instructions,
            )
            .field("entity_fn", &omit(&self.entity_fn))
            .field("parameter_entity_fn", &omit(&self.parameter_entity_fn))
            .finish()
    }
}
/// Fluent builder that assembles a [`ParserConfig`] and turns it into a [`Parser`].
#[derive(Default, Debug)]
pub struct ParserBuilder {
    /// Configuration accumulated so far.
    config: ParserConfig,
}

impl ParserBuilder {
    /// Starts from the default configuration.
    pub fn new() -> Self {
        Self::default()
    }

    /// Applies `change` to the configuration under construction and returns the builder.
    fn configure(mut self, change: impl FnOnce(&mut ParserConfig)) -> Self {
        change(&mut self.config);
        self
    }

    /// Sets the `trim_whitespace` flag.
    pub fn trim_whitespace(self, trim_whitespace: bool) -> Self {
        self.configure(|config| config.trim_whitespace = trim_whitespace)
    }

    /// Sets the name normalization policy.
    pub fn name_normalization(self, name_normalization: NameNormalization) -> Self {
        self.configure(|config| config.name_normalization = name_normalization)
    }

    /// Shorthand for `name_normalization(NameNormalization::ToLowercase)`.
    pub fn lowercase_names(self) -> Self {
        let policy = NameNormalization::ToLowercase;
        self.name_normalization(policy)
    }

    /// Shorthand for `name_normalization(NameNormalization::ToUppercase)`.
    pub fn uppercase_names(self) -> Self {
        let policy = NameNormalization::ToUppercase;
        self.name_normalization(policy)
    }

    /// Installs `f` as the entity resolver.
    ///
    /// Whatever `f` returns is converted into `Cow<'static, str>` before it is stored.
    pub fn expand_entities<F, T>(self, f: F) -> Self
    where
        F: Fn(&str) -> Option<T> + 'static,
        T: Into<Cow<'static, str>>,
    {
        self.configure(move |config| {
            config.entity_fn = Some(Box::new(move |entity| f(entity).map(Into::into)));
        })
    }

    /// Installs `f` as the parameter entity resolver.
    ///
    /// Whatever `f` returns is converted into `Cow<'static, str>` before it is stored.
    pub fn expand_parameter_entities<F, T>(self, f: F) -> Self
    where
        F: Fn(&str) -> Option<T> + 'static,
        T: Into<Cow<'static, str>>,
    {
        self.configure(move |config| {
            config.parameter_entity_fn = Some(Box::new(move |entity| f(entity).map(Into::into)));
        })
    }

    /// Sets the marked-section policy.
    pub fn marked_section_handling(self, mode: MarkedSectionHandling) -> Self {
        self.configure(|config| config.marked_section_handling = mode)
    }

    /// Shorthand for `marked_section_handling(MarkedSectionHandling::ExpandAll)`.
    pub fn expand_marked_sections(self) -> Self {
        let mode = MarkedSectionHandling::ExpandAll;
        self.marked_section_handling(mode)
    }

    /// Sets the `ignore_markup_declarations` flag.
    pub fn ignore_markup_declarations(self, ignore: bool) -> Self {
        self.configure(|config| config.ignore_markup_declarations = ignore)
    }

    /// Sets the `ignore_processing_instructions` flag.
    pub fn ignore_processing_instructions(self, ignore: bool) -> Self {
        self.configure(|config| config.ignore_processing_instructions = ignore)
    }

    /// Finishes the builder, producing a [`Parser`] that owns the configuration.
    pub fn build(self) -> Parser {
        let ParserBuilder { config } = self;
        Parser { config }
    }

    /// Builds the parser and immediately parses `input` with it.
    pub fn parse(self, input: &str) -> crate::Result<SgmlFragment> {
        let parser = self.build();
        parser.parse(input)
    }

    /// Finishes the builder, returning the bare [`ParserConfig`].
    pub fn into_config(self) -> ParserConfig {
        let ParserBuilder { config } = self;
        config
    }
}
/// Produces a `Debug` stand-in for `opt` that hides the payload: it prints
/// `Some(...)` when a value is present and `None` when it is absent.
fn omit<T>(opt: &Option<T>) -> impl fmt::Debug {
    opt.as_ref().and(Some(Ellipsis))
}

/// Placeholder whose `Debug` output is the literal text `...`.
struct Ellipsis;

impl fmt::Debug for Ellipsis {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "...")
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `mode` hands `name` back unchanged and still borrowed.
    fn assert_kept_borrowed(mode: NameNormalization, name: &str) {
        let normalized = mode.normalize(name.into());
        assert!(
            matches!(normalized, Cow::Borrowed(kept) if kept == name),
            "expected {:?} to stay borrowed and unchanged",
            name
        );
    }

    /// Asserts that `mode` turns `name` into `expected`.
    fn assert_normalized(mode: NameNormalization, name: &str, expected: &str) {
        assert_eq!(mode.normalize(name.into()), expected);
    }

    #[test]
    fn test_config_trim() {
        let default_config = ParserConfig::default();
        assert_eq!(default_config.trim(" hello "), "hello");

        for &(enabled, expected) in &[(true, "hello"), (false, " hello ")] {
            let config = Parser::builder().trim_whitespace(enabled).into_config();
            assert_eq!(config.trim(" hello "), expected);
        }
    }

    #[test]
    fn test_config_parse_rcdata() {
        // No resolver is configured, so `&x;` cannot be expanded.
        let config = ParserConfig::default();
        let outcome = config.parse_rcdata::<nom::error::Error<_>>("hello &x; world");
        match outcome {
            Err(nom::Err::Error(err)) => assert_eq!(err.input, "&x;"),
            err => panic!("expected nom::Err::Error, got: {:?}", err),
        };
    }

    #[test]
    fn test_name_normalization_unchanged() {
        for &name in &["hello", "Hello", "HELLO", "題名", "grüße"] {
            assert_kept_borrowed(NameNormalization::Unchanged, name);
        }
    }

    #[test]
    fn test_name_normalization_to_lowercase() {
        let mode = NameNormalization::ToLowercase;

        for &name in &["hello", "題名", "grüße"] {
            assert_kept_borrowed(mode, name);
        }
        for &(name, expected) in &[("Hello", "hello"), ("HELLO", "hello"), ("Grüße", "grüße")] {
            assert_normalized(mode, name, expected);
        }
    }

    #[test]
    fn test_name_normalization_to_uppercase() {
        let mode = NameNormalization::ToUppercase;

        for &name in &["HELLO", "題名", "GRÜSSE"] {
            assert_kept_borrowed(mode, name);
        }
        for &(name, expected) in &[("Hello", "HELLO"), ("hello", "HELLO"), ("grüße", "GRÜSSE")] {
            assert_normalized(mode, name, expected);
        }
    }
}