use super::preamble::{HtmlPreamble, Rule};
use super::tag_info::{HtmlTagInfo, XmlnsPositions};
use crate::ns::{NsError, NsResult};
use pest::iterators::Pair;
use pest::Parser;
/// Parses the start of an HTML document and describes its `<html>` opening tag.
///
/// The input is run through the `HtmlPreamble` grammar starting at
/// `Rule::document`. The first `html_tag` pair found directly beneath a
/// `document` pair is handed to `extract_html_tag_info`.
///
/// # Errors
///
/// Returns `NsError::ParseError` when the grammar rejects the input (the
/// message starts with `Failed to parse HTML:`) or when no `html_tag` pair is
/// present in the parse result.
pub fn parse_preamble(html: impl AsRef<str>) -> NsResult<HtmlTagInfo> {
    let source = html.as_ref();
    let parsed = HtmlPreamble::parse(Rule::document, source)
        .map_err(|e| NsError::ParseError(format!("Failed to parse HTML: {e}")))?;

    // Only direct children of a `document` pair are considered.
    let html_tag = parsed
        .filter(|node| node.as_rule() == Rule::document)
        .flat_map(|document| document.into_inner())
        .find(|node| node.as_rule() == Rule::html_tag);

    match html_tag {
        Some(tag) => extract_html_tag_info(tag),
        None => Err(NsError::ParseError(
            "No <html> tag found in document".to_string(),
        )),
    }
}
/// Builds an `HtmlTagInfo` from a parsed `html_tag` pair.
///
/// `tag_start` and `tag_end` are the byte offsets of the pair's span. Each
/// `attributes` child is scanned for `xmlns:` declarations, and the start
/// offset of the `tag_close` child is recorded as `tag_close_start`.
///
/// `tag_close_start` stays `0` if no `tag_close` child is encountered; if
/// several are present, the last one wins. This function always returns `Ok`.
#[inline]
fn extract_html_tag_info(html_tag: Pair<Rule>) -> NsResult<HtmlTagInfo> {
    let span = html_tag.as_span();
    let mut info = HtmlTagInfo {
        tag_start: span.start(),
        tag_close_start: 0,
        tag_end: span.end(),
        existing_xmlns: Vec::new(),
    };

    for child in html_tag.into_inner() {
        let rule = child.as_rule();
        if rule == Rule::attributes {
            extract_xmlns_attributes(child, &mut info.existing_xmlns);
        } else if rule == Rule::tag_close {
            info.tag_close_start = child.as_span().start();
        }
    }

    Ok(info)
}
#[inline]
fn extract_xmlns_attributes(attributes: Pair<Rule>, existing_xmlns: &mut Vec<XmlnsPositions>) {
for attr in attributes.into_inner() {
if attr.as_rule() == Rule::attribute {
if let Some(xmlns_positions) = extract_xmlns_from_attribute(attr) {
existing_xmlns.push(xmlns_positions);
}
}
}
}
/// Returns the prefix and value positions of an `xmlns:<prefix>="<uri>"`
/// attribute, or `None` if `attr` is not such a declaration.
///
/// The first tuple covers the prefix (the part of the attribute name after
/// `xmlns:`); the second is whatever `extract_value_positions` reports for the
/// attribute value. `None` is returned when the attribute lacks an `attr_name`
/// or `attr_value` child, or when the name does not start with `xmlns:`.
#[inline]
fn extract_xmlns_from_attribute(attr: Pair<Rule>) -> Option<XmlnsPositions> {
    const XMLNS_PREFIX: &str = "xmlns:";

    let mut name = None;
    let mut value = None;
    // If a rule appears more than once, the last occurrence is the one kept.
    for part in attr.into_inner() {
        let rule = part.as_rule();
        if rule == Rule::attr_name {
            name = Some(part.as_span());
        } else if rule == Rule::attr_value {
            value = Some(extract_value_positions(part));
        }
    }

    let name = name?;
    let value = value?;
    if !name.as_str().starts_with(XMLNS_PREFIX) {
        return None;
    }

    // The prefix begins right after the literal `xmlns:` inside the name span.
    Some(((name.start() + XMLNS_PREFIX.len(), name.end()), value))
}
/// Returns the `(start, end)` byte offsets of an attribute value with its
/// surrounding quotes excluded.
///
/// Quotes are stripped only when the value is wrapped in a *matching* pair of
/// `"` or `'` characters. Anything else (an unquoted value, mismatched quotes,
/// or a lone quote character) is returned with the offsets of the whole span.
/// This guarantees `start <= end`, so the result is always safe to use for
/// slicing the original input.
#[inline]
fn extract_value_positions(value_pair: Pair<Rule>) -> (usize, usize) {
    let span = value_pair.as_span();
    let raw = span.as_str().as_bytes();

    // Both quote characters are single-byte ASCII, so comparing bytes and
    // adjusting the offsets by one is safe on UTF-8 input.
    let is_quoted = match (raw.first(), raw.last()) {
        (Some(&open), Some(&close)) => {
            // `len >= 2` rules out a single quote character serving as both
            // the opening and the closing delimiter.
            raw.len() >= 2 && open == close && matches!(open, b'"' | b'\'')
        }
        _ => false,
    };

    if is_quoted {
        (span.start() + 1, span.end() - 1)
    } else {
        (span.start(), span.end())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An `<html>` tag without namespace declarations yields no xmlns entries
    /// and offsets ordered as start < close start < end.
    #[test]
    fn parse_simple_html() {
        let html = r#"<!DOCTYPE html>
<html lang="en">
<body>Hello</body>
</html>"#;
        let info = parse_preamble(html).unwrap();
        assert!(info.existing_xmlns.is_empty());
        assert!(info.tag_start < info.tag_close_start);
        assert!(info.tag_close_start < info.tag_end);
    }

    /// Recorded positions slice back to the declared prefixes and URIs.
    #[test]
    fn parse_with_xmlns() {
        let html = r#"<!DOCTYPE html>
<html xmlns:custom="http://example.com/ns" xmlns:other="http://other.com" lang="en">
<body>Hello</body>
</html>"#;
        let info = parse_preamble(html).unwrap();
        assert_eq!(info.existing_xmlns.len(), 2);

        let expected = [
            ("custom", "http://example.com/ns"),
            ("other", "http://other.com"),
        ];
        for (index, (prefix, uri)) in expected.iter().enumerate() {
            let ((prefix_start, prefix_end), (uri_start, uri_end)) = info.existing_xmlns[index];
            assert_eq!(&html[prefix_start..prefix_end], *prefix);
            assert_eq!(&html[uri_start..uri_end], *uri);
        }
    }

    /// A leading comment that mentions `<html>` does not break parsing.
    #[test]
    fn parse_with_comment() {
        let html = r#"<!-- This has <html> in it -->
<!DOCTYPE html>
<html>
<body>Hello</body>
</html>"#;
        assert!(parse_preamble(html).is_ok());
    }

    /// A document without an `<html>` tag is reported as an error.
    #[test]
    fn parse_missing_html_tag() {
        let html = r#"<!DOCTYPE html>
<body>Hello</body>"#;
        assert!(parse_preamble(html).is_err());
    }

    /// A leading XML processing instruction is accepted.
    #[test]
    fn parse_with_pi() {
        let html = r#"<?xml version="1.0"?>
<!DOCTYPE html>
<html>
<body>Hello</body>
</html>"#;
        assert!(parse_preamble(html).is_ok());
    }

    /// The `HtmlTagInfo` accessors return each declared prefix and URI.
    #[test]
    fn parse_multiple_xmlns() {
        let html = r#"<html xmlns:svg="http://www.w3.org/2000/svg" xmlns:math="http://www.w3.org/1998/Math/MathML">
<body>Hello</body>
</html>"#;
        let info = parse_preamble(html).unwrap();
        assert_eq!(info.xmlns_count(), 2);

        assert_eq!(info.get_prefix(0, html).unwrap(), "svg");
        assert_eq!(info.get_uri(0, html).unwrap(), "http://www.w3.org/2000/svg");

        assert_eq!(info.get_prefix(1, html).unwrap(), "math");
        assert_eq!(
            info.get_uri(1, html).unwrap(),
            "http://www.w3.org/1998/Math/MathML"
        );
    }

    /// Single-quoted values are handled and the quotes are excluded.
    #[test]
    fn parse_single_quoted_attribute() {
        let html = r#"<html xmlns:custom='http://example.com/single' lang='en'>
<body>Hello</body>
</html>"#;
        let info = parse_preamble(html).unwrap();
        assert_eq!(info.xmlns_count(), 1);

        let uri = info.get_uri(0, html).unwrap();
        assert_eq!(uri, "http://example.com/single");
        assert!(!uri.contains('\''));
    }

    /// An unterminated attribute value surfaces as a `ParseError`.
    #[test]
    fn parse_malformed_html_tag() {
        let html = r#"<!DOCTYPE html>
<html xmlns:broken="unclosed
<body>Hello</body>
</html>"#;
        let outcome = parse_preamble(html);
        assert!(outcome.is_err());
        if let Err(NsError::ParseError(msg)) = outcome {
            assert!(msg.contains("Failed to parse HTML"));
        } else {
            panic!("Expected ParseError");
        }
    }
}