mod attribute;
mod cdata;
mod comment;
mod dtd;
mod element;
mod entity;
mod literal;
mod names;
mod pi;
mod reference;
mod xmldecl;
use std::{borrow::Cow, mem::take};
pub(crate) use attribute::*;
pub use dtd::*;
pub(crate) use entity::*;
#[cfg(feature = "html")]
use crate::html::parser::{HtmlParserOption, html_create_memory_parser_ctxt, html_parse_content};
use crate::{
chvalid::XmlCharValid,
encoding::{XmlCharEncoding, detect_encoding, find_encoding_handler},
error::XmlParserErrors,
globals::GenericErrorContext,
parser::XmlParserOption,
tree::{
NodeCommon, XML_XML_NAMESPACE, XmlDocProperties, XmlDocPtr, XmlElementType,
XmlGenericNodePtr, XmlNodePtr, xml_free_doc, xml_free_node, xml_free_node_list,
xml_new_doc, xml_new_doc_comment, xml_new_doc_node,
},
};
use super::{
XML_SKIP_IDS, XmlParserCtxt, XmlParserInputState, XmlSAXLocator, xml_fatal_err,
xml_fatal_err_msg, xml_init_parser,
};
/// XML version recorded on the parser context by `parse_document` when the input does not
/// start with an XML declaration.
pub(crate) const XML_DEFAULT_VERSION: &str = "1.0";
/// Version string marking a document that `parse_document` detaches from the context and
/// frees once parsing has finished.
pub(crate) const SAX_COMPAT_MODE: &str = "SAX compatibility mode document";
impl XmlParserCtxt<'_> {
    /// Parse a whole XML document from the context's current input, driving the SAX
    /// callbacks registered on the context.
    ///
    /// The input is consumed in document order: optional XML declaration, misc items
    /// (comments and processing instructions), optional DOCTYPE declaration with its
    /// internal subset, the root element, and finally trailing misc items.
    ///
    /// # Returns
    ///
    /// `0` when the document is well-formed, `-1` otherwise: no input, empty input,
    /// unsupported encoding in the XML declaration, the parser entering the EOF state
    /// before the root element, or any well-formedness error.
    #[doc(alias = "xmlParseDocument")]
    pub fn parse_document(&mut self) -> i32 {
        // NOTE(review): the `unsafe` block is kept as in the original; which of the calls
        // below actually require it is not visible from this file.
        unsafe {
            xml_init_parser();

            if self.input().is_none() {
                return -1;
            }

            self.grow();

            // SAX: detect the handler level.
            self.detect_sax2();

            // SAX: beginning of the document processing.
            if let Some(set_document_locator) = self
                .sax
                .as_deref_mut()
                .and_then(|sax| sax.set_document_locator)
            {
                set_document_locator(self, XmlSAXLocator::default());
            }
            if matches!(self.instate, XmlParserInputState::XmlParserEOF) {
                return -1;
            }

            // Without a known encoding, sniff it from the first four bytes of the input.
            if self.encoding().is_none() && self.input().unwrap().remainder_len() >= 4 {
                let enc = detect_encoding(&self.content_bytes()[..4]);
                if !matches!(enc, XmlCharEncoding::None) {
                    self.switch_encoding(enc);
                }
            }

            // An input with nothing left to read cannot hold a root element. Report it right
            // away instead of emitting start/end document events for an empty document.
            if self.current_byte() == 0 {
                xml_fatal_err(self, XmlParserErrors::XmlErrDocumentEmpty, None);
                return -1;
            }

            self.grow();
            if self.content_bytes().starts_with(b"<?xml") && self.nth_byte(5).is_xml_blank_char() {
                self.parse_xmldecl();
                if self.err_no == XmlParserErrors::XmlErrUnsupportedEncoding as i32
                    || matches!(self.instate, XmlParserInputState::XmlParserEOF)
                {
                    // The declared encoding cannot be handled, or the parser was stopped.
                    return -1;
                }
                self.standalone = self.input().unwrap().standalone;
                self.skip_blanks();
            } else {
                self.version = Some(XML_DEFAULT_VERSION.to_owned());
            }

            if !self.disable_sax {
                if let Some(start_document) =
                    self.sax.as_deref_mut().and_then(|sax| sax.start_document)
                {
                    start_document(self);
                }
            }
            if matches!(self.instate, XmlParserInputState::XmlParserEOF) {
                return -1;
            }

            // The misc part of the prolog.
            self.parse_misc();

            // Then possibly a doctype declaration followed by more misc items.
            self.grow();
            if self.content_bytes().starts_with(b"<!DOCTYPE") {
                self.in_subset = 1;
                self.parse_doctypedecl();
                if self.current_byte() == b'[' {
                    self.instate = XmlParserInputState::XmlParserDTD;
                    self.parse_internal_subset();
                    if matches!(self.instate, XmlParserInputState::XmlParserEOF) {
                        return -1;
                    }
                }

                // Hand the external subset identifiers to the SAX handler.
                self.in_subset = 2;
                if !self.disable_sax {
                    if let Some(external_subset) =
                        self.sax.as_deref_mut().and_then(|sax| sax.external_subset)
                    {
                        external_subset(
                            self,
                            self.int_sub_name.clone().as_deref(),
                            self.ext_sub_system.clone().as_deref(),
                            self.ext_sub_uri.clone().as_deref(),
                        );
                    }
                }
                if matches!(self.instate, XmlParserInputState::XmlParserEOF) {
                    return -1;
                }
                self.in_subset = 0;

                self.clean_special_attr();

                self.instate = XmlParserInputState::XmlParserProlog;
                self.parse_misc();
            }

            // Time to parse the tree itself.
            self.grow();
            if self.current_byte() != b'<' {
                xml_fatal_err_msg(
                    self,
                    XmlParserErrors::XmlErrDocumentEmpty,
                    "Start tag expected, '<' not found\n",
                );
            } else {
                self.instate = XmlParserInputState::XmlParserContent;
                self.parse_element();
                self.instate = XmlParserInputState::XmlParserEpilog;

                // The misc part at the end of the document.
                self.parse_misc();

                // Anything left after the epilog is an error.
                if self.current_byte() != 0 {
                    xml_fatal_err(self, XmlParserErrors::XmlErrDocumentEnd, None);
                }
                self.instate = XmlParserInputState::XmlParserEOF;
            }

            // SAX: end of the document processing (invoked even when SAX is disabled).
            if let Some(end_document) = self.sax.as_deref_mut().and_then(|sax| sax.end_document) {
                end_document(self);
            }

            // A document whose version is the SAX compatibility marker is not kept:
            // detach it from the context and free it.
            if let Some(my_doc) = self
                .my_doc
                .take_if(|doc| doc.version.as_deref() == Some(SAX_COMPAT_MODE))
            {
                xml_free_doc(my_doc);
            }

            // Record the outcome of the parse on the remaining document, if any.
            if self.well_formed {
                if let Some(mut my_doc) = self.my_doc {
                    my_doc.properties |= XmlDocProperties::XmlDocWellformed as i32;
                    if self.valid != 0 {
                        my_doc.properties |= XmlDocProperties::XmlDocDTDValid as i32;
                    }
                    if self.ns_well_formed {
                        my_doc.properties |= XmlDocProperties::XmlDocNsvalid as i32;
                    }
                    if self.options & XmlParserOption::XmlParseOld10 as i32 != 0 {
                        my_doc.properties |= XmlDocProperties::XmlDocOld10 as i32;
                    }
                }
            }
            if !self.well_formed {
                self.valid = 0;
                return -1;
            }
            0
        }
    }

    /// Consume a run of blanks, processing instructions (`<?...`) and comments (`<!--...`).
    ///
    /// Stops at the first construct that is neither, or as soon as the parser is in the
    /// EOF state.
    #[doc(alias = "xmlParseMisc")]
    pub(crate) fn parse_misc(&mut self) {
        while !matches!(self.instate, XmlParserInputState::XmlParserEOF) {
            self.skip_blanks();
            self.grow();
            if self.content_bytes().starts_with(b"<?") {
                self.parse_pi();
            } else if self.content_bytes().starts_with(b"<!--") {
                self.parse_comment();
            } else {
                break;
            }
        }
    }
}
/// Parse `data` as well-balanced content in the context of `node`.
///
/// The nearest element or document ancestor of `node` (or `node` itself) serves as the
/// parsing context: namespaces declared on that element and its element ancestors are in
/// scope while parsing. The resulting sibling list is detached from the tree and stored in
/// `lst`; on any error the list is freed and `lst` is left as `None`.
///
/// # Returns
///
/// `XmlErrOK` on success, otherwise the error recorded by the parser, or:
/// - `XmlErrInternalError` for an unsupported node kind, a missing context/document, or a
///   malformed result without a recorded error number,
/// - `XmlErrNoMemory` when the parser context or the temporary marker node cannot be created,
/// - `XmlErrUnsupportedEncoding` when no handler exists for the document encoding.
///
/// # Safety
///
/// NOTE(review): the exact contract is not visible in this file; presumably `node` must
/// belong to a live, consistent tree for the duration of the call. Confirm against callers.
#[doc(alias = "xmlParseInNodeContext")]
pub unsafe fn xml_parse_in_node_context(
    node: XmlGenericNodePtr,
    data: &[u8],
    mut options: i32,
    lst: &mut Option<XmlGenericNodePtr>,
) -> XmlParserErrors {
    unsafe {
        // Only these node kinds are accepted as a starting point.
        if !matches!(
            node.element_type(),
            XmlElementType::XmlElementNode
                | XmlElementType::XmlAttributeNode
                | XmlElementType::XmlTextNode
                | XmlElementType::XmlCDATASectionNode
                | XmlElementType::XmlEntityRefNode
                | XmlElementType::XmlPINode
                | XmlElementType::XmlCommentNode
                | XmlElementType::XmlDocumentNode
                | XmlElementType::XmlHTMLDocumentNode
        ) {
            return XmlParserErrors::XmlErrInternalError;
        }

        // Climb until an element or document node is reached; new content hangs off it.
        let mut anchor = Some(node);
        while let Some(current) = anchor.filter(|candidate| {
            !matches!(
                candidate.element_type(),
                XmlElementType::XmlElementNode
                    | XmlElementType::XmlDocumentNode
                    | XmlElementType::XmlHTMLDocumentNode
            )
        }) {
            anchor = current.parent();
        }
        let Some(mut node) = anchor else {
            return XmlParserErrors::XmlErrInternalError;
        };

        // The context node is either the document itself or a node owned by one.
        let owner = match XmlDocPtr::try_from(node) {
            Ok(doc) => Some(doc),
            Err(_) => node.document(),
        };
        let Some(doc) = owner else {
            return XmlParserErrors::XmlErrInternalError;
        };

        // Create a parser context matching the kind of document.
        let created = match doc.typ {
            XmlElementType::XmlDocumentNode => XmlParserCtxt::from_memory(data),
            #[cfg(feature = "html")]
            XmlElementType::XmlHTMLDocumentNode => {
                options |= HtmlParserOption::HtmlParseNoimplied as i32;
                html_create_memory_parser_ctxt(data)
            }
            _ => return XmlParserErrors::XmlErrInternalError,
        };
        let Some(mut ctxt) = created else {
            return XmlParserErrors::XmlErrNoMemory;
        };

        options |= XmlParserOption::XmlParseNoDict as i32;

        // Parse the chunk with the encoding of the owning document, when it has one.
        if let Some(encoding) = doc.encoding.as_deref() {
            ctxt.encoding = Some(encoding.to_owned());
            let Some(handler) = find_encoding_handler(encoding) else {
                return XmlParserErrors::XmlErrUnsupportedEncoding;
            };
            ctxt.switch_to_encoding(handler);
        }

        ctxt.use_options_internal(options, None);
        ctxt.detect_sax2();
        ctxt.my_doc = Some(doc);
        // Parsing in context, i.e. as within existing content.
        ctxt.input_id = 2;
        ctxt.instate = XmlParserInputState::XmlParserContent;

        // An empty comment appended to the context node marks where the new nodes start.
        let Some(mut fake) = xml_new_doc_comment(node.document(), "") else {
            return XmlParserErrors::XmlErrNoMemory;
        };
        node.add_child(fake.into());

        // Number of namespace bindings pushed below, to be popped after parsing.
        let mut pushed = 0;
        if let Ok(element) = XmlNodePtr::try_from(node) {
            ctxt.node_push(element);
            // Seed the namespace stack from the element and its element ancestors;
            // a prefix already bound by a closer element is not pushed again.
            let mut ancestor = Some(element);
            while let Some(current) = ancestor
                .filter(|candidate| candidate.element_type() == XmlElementType::XmlElementNode)
            {
                let mut decl = current.ns_def;
                while let Some(binding) = decl {
                    if ctxt.get_namespace(binding.prefix().as_deref()).is_none() {
                        ctxt.ns_push(binding.prefix().as_deref(), &binding.href().unwrap());
                        pushed += 1;
                    }
                    decl = binding.next;
                }
                ancestor = current.parent.and_then(|p| XmlNodePtr::try_from(p).ok());
            }
        }

        if ctxt.validate || ctxt.replace_entities {
            ctxt.loadsubset |= XML_SKIP_IDS as i32;
        }

        #[cfg(feature = "html")]
        {
            if doc.typ == XmlElementType::XmlHTMLDocumentNode {
                html_parse_content(&mut ctxt);
            } else {
                ctxt.parse_content();
            }
        }
        #[cfg(not(feature = "html"))]
        {
            ctxt.parse_content();
        }

        ctxt.ns_pop(pushed);

        // Whatever is left in the input means the chunk was not well balanced.
        if ctxt.current_byte() == b'<' && ctxt.nth_byte(1) == b'/' {
            xml_fatal_err(&mut ctxt, XmlParserErrors::XmlErrNotWellBalanced, None);
        } else if ctxt.current_byte() != 0 {
            xml_fatal_err(&mut ctxt, XmlParserErrors::XmlErrExtraContent, None);
        }
        // The parser must be back on the context node once the chunk is consumed.
        let left_open = ctxt
            .node
            .is_some_and(|open| XmlGenericNodePtr::from(open) != node);
        if left_open {
            xml_fatal_err(&mut ctxt, XmlParserErrors::XmlErrNotWellBalanced, None);
            ctxt.well_formed = false;
        }

        let ret = if ctxt.well_formed {
            XmlParserErrors::XmlErrOK
        } else if ctxt.err_no == 0 {
            XmlParserErrors::XmlErrInternalError
        } else {
            XmlParserErrors::try_from(ctxt.err_no).unwrap()
        };

        // Cut the freshly parsed siblings loose from the marker and from their parent.
        let head = fake.next.take();
        node.set_last(Some(fake.into()));
        if let Some(mut first) = head {
            first.set_prev(None);
        }
        *lst = head;
        let mut walker = head;
        while let Some(mut sibling) = walker {
            sibling.set_parent(None);
            walker = sibling.next();
        }

        // The marker has served its purpose.
        fake.unlink();
        xml_free_node(fake);

        if !matches!(ret, XmlParserErrors::XmlErrOK) {
            xml_free_node_list(lst.take());
        }
        ret
    }
}
/// Parse `string` as a well-balanced chunk on behalf of `oldctxt`, using a temporary
/// parser context that borrows the SAX handler, namespaces, options and attribute tables
/// of `oldctxt`.
///
/// The chunk is parsed below a temporary `pseudoroot` element; when `lst` is given and
/// parsing succeeds, the parsed nodes are detached and stored in it. Error state, error
/// and warning counters and the amount of entity data consumed are propagated back to
/// `oldctxt`.
///
/// # Returns
///
/// - `XmlErrEntityLoop` when the nesting depth of `oldctxt` exceeds the limit,
/// - `XmlWarUndeclaredEntity` when the temporary parser context cannot be created,
/// - `XmlErrInternalError` when the working document or the pseudo root cannot be created,
/// - the error recorded by the nested parse if the chunk is not well-formed,
/// - `XmlErrOK` otherwise.
#[doc(alias = "xmlParseBalancedChunkMemoryInternal")]
pub(crate) fn xml_parse_balanced_chunk_memory_internal(
    oldctxt: &mut XmlParserCtxt,
    string: &[u8],
    user_data: Option<GenericErrorContext>,
    mut lst: Option<&mut Option<XmlGenericNodePtr>>,
) -> XmlParserErrors {
    // NOTE(review): the `unsafe` block is kept as in the original; which of the calls below
    // actually require it is not visible from this file.
    unsafe {
        let mut content = None;
        let mut last = None;
        let ret: XmlParserErrors;

        // Guard against runaway entity nesting (stricter unless the "huge" option is set).
        if (oldctxt.depth > 40 && oldctxt.options & XmlParserOption::XmlParseHuge as i32 == 0)
            || oldctxt.depth > 100
        {
            xml_fatal_err_msg(
                oldctxt,
                XmlParserErrors::XmlErrEntityLoop,
                "Maximum entity nesting depth exceeded",
            );
            return XmlParserErrors::XmlErrEntityLoop;
        }

        if let Some(lst) = lst.as_mut() {
            **lst = None;
        }

        let Some(mut ctxt) = XmlParserCtxt::from_memory(string) else {
            return XmlParserErrors::XmlWarUndeclaredEntity;
        };
        ctxt.nb_errors = oldctxt.nb_errors;
        ctxt.nb_warnings = oldctxt.nb_warnings;
        ctxt.user_data = user_data;
        ctxt.input_id = oldctxt.input_id;
        ctxt.str_xml = Some(Cow::Borrowed("xml"));
        ctxt.str_xmlns = Some(Cow::Borrowed("xmlns"));
        ctxt.str_xml_ns = Some(Cow::Borrowed(XML_XML_NAMESPACE));

        // Propagate the namespaces in scope down into the chunk.
        for (pre, loc) in &oldctxt.ns_tab {
            ctxt.ns_push(pre.as_deref(), loc);
        }

        // Lend the SAX handler of the outer context to the nested one; the nested
        // context's own handler is parked in `oldsax` and restored before returning.
        let oldsax = ctxt.sax.take();
        ctxt.sax = oldctxt.sax.take();
        ctxt.detect_sax2();
        ctxt.replace_entities = oldctxt.replace_entities;
        ctxt.options = oldctxt.options;
        ctxt._private = oldctxt._private;

        // Work inside the outer document when there is one (remembering its children so
        // they can be restored), otherwise inside a throw-away document.
        let mut new_doc = None;
        let mut my_doc = if let Some(my_doc) = oldctxt.my_doc {
            ctxt.my_doc = Some(my_doc);
            content = my_doc.children;
            last = my_doc.last;
            my_doc
        } else {
            let Some(mut new) = xml_new_doc(Some("1.0")) else {
                oldctxt.sax = ctxt.sax.take();
                ctxt.sax = oldsax;
                return XmlParserErrors::XmlErrInternalError;
            };
            new_doc = Some(new);
            new.properties = XmlDocProperties::XmlDocInternal as i32;
            ctxt.my_doc = Some(new);
            new
        };

        let Some(new_root) = xml_new_doc_node(ctxt.my_doc, None, "pseudoroot", None) else {
            oldctxt.sax = ctxt.sax.take();
            ctxt.sax = oldsax;
            if let Some(new_doc) = new_doc {
                xml_free_doc(new_doc);
            }
            return XmlParserErrors::XmlErrInternalError;
        };

        // Temporarily make the pseudo root the only child of the document.
        my_doc.children = None;
        my_doc.last = None;
        my_doc.add_child(new_root.into());
        ctxt.node_push(
            my_doc
                .children
                .map(|c| XmlNodePtr::try_from(c).unwrap())
                .unwrap(),
        );
        ctxt.instate = XmlParserInputState::XmlParserContent;
        ctxt.depth = oldctxt.depth;

        ctxt.validate = false;
        ctxt.loadsubset = oldctxt.loadsubset;
        if oldctxt.validate || oldctxt.replace_entities {
            ctxt.loadsubset |= XML_SKIP_IDS as i32;
        }

        // Lend the attribute tables to the nested parse; they are handed back below.
        ctxt.atts_default = take(&mut oldctxt.atts_default);
        ctxt.atts_special = take(&mut oldctxt.atts_special);

        ctxt.parse_content();

        // Whatever is left in the input means the chunk was not well balanced.
        if ctxt.content_bytes().starts_with(b"</") {
            xml_fatal_err(&mut ctxt, XmlParserErrors::XmlErrNotWellBalanced, None);
        } else if ctxt.current_byte() != 0 {
            xml_fatal_err(&mut ctxt, XmlParserErrors::XmlErrExtraContent, None);
        }
        // The parser must be back on the pseudo root once the chunk is consumed.
        if my_doc.children != ctxt.node.map(|node| node.into()) {
            xml_fatal_err(&mut ctxt, XmlParserErrors::XmlErrNotWellBalanced, None);
        }

        if !ctxt.well_formed {
            ret = XmlParserErrors::try_from(ctxt.err_no).unwrap();
            oldctxt.err_no = ctxt.err_no;
            oldctxt.well_formed = false;
            oldctxt.last_error = ctxt.last_error.clone();
        } else {
            ret = XmlParserErrors::XmlErrOK;
        }

        if let Some(lst) = lst {
            if matches!(ret, XmlParserErrors::XmlErrOK) {
                // Return the new node list after unlinking it from the pseudo root.
                let mut cur = my_doc.children().unwrap().children();
                *lst = cur;
                while let Some(mut now) = cur {
                    #[cfg(feature = "libxml_valid")]
                    if oldctxt.validate
                        && oldctxt.well_formed
                        && now.element_type() == XmlElementType::XmlElementNode
                    {
                        if let Some(my_doc) = oldctxt.my_doc.filter(|doc| doc.int_subset.is_some())
                        {
                            oldctxt.valid &= oldctxt.validate_element(my_doc, cur);
                        }
                    }
                    now.set_parent(None);
                    cur = now.next();
                }
                my_doc.children().unwrap().set_children(None);
            }
        }

        // Drop the pseudo root and give the document its original children back.
        if let Some(mut my_doc) = ctxt.my_doc {
            xml_free_node(my_doc.children().unwrap());
            my_doc.children = content;
            my_doc.last = last;
        }

        // Account for the amount of entity data parsed (saturating, never wrapping).
        if ctxt.input().is_some() {
            let mut consumed: u64 = ctxt.input().unwrap().consumed;
            consumed = consumed.saturating_add(ctxt.input().unwrap().offset_from_base() as u64);
            oldctxt.sizeentcopy = oldctxt.sizeentcopy.saturating_add(consumed);
            oldctxt.sizeentcopy = oldctxt.sizeentcopy.saturating_add(ctxt.sizeentcopy);
        }

        oldctxt.nb_errors = ctxt.nb_errors;
        oldctxt.nb_warnings = ctxt.nb_warnings;
        oldctxt.sax = ctxt.sax.take();
        ctxt.sax = oldsax;

        // Hand the attribute tables back to their owner. Clearing them on the temporary
        // context instead would leave the outer parser without its default and special
        // attribute declarations for the rest of the document.
        oldctxt.atts_default = take(&mut ctxt.atts_default);
        oldctxt.atts_special = take(&mut ctxt.atts_special);

        if let Some(new_doc) = new_doc {
            xml_free_doc(new_doc);
        }
        ret
    }
}