#![allow(
clippy::too_many_arguments,
clippy::too_many_lines,
clippy::trivially_copy_pass_by_ref,
clippy::items_after_statements
)]
use std::borrow::Cow;
use std::collections::{BTreeMap, HashSet};
use crate::converter::dom_context::DomContext;
use crate::converter::main_helpers::{
collapse_excess_blank_lines, extract_head_metadata, format_metadata_frontmatter,
has_custom_element_tags, repair_with_html5ever, trim_line_end_whitespace,
trim_trailing_whitespace,
};
use crate::converter::plain_text::extract_plain_text;
use crate::converter::preprocessing_helpers::{
has_inline_block_misnest, should_drop_for_preprocessing,
};
use crate::converter::utility::caching::build_dom_context;
use crate::converter::utility::content::normalized_tag_name;
use crate::converter::utility::preprocessing::{
normalize_bogus_comment_endings, normalize_split_closing_tags, normalize_unclosed_list_items,
preprocess_html, strip_hidden_elements, strip_script_and_style_tags,
};
use crate::converter::utility::serialization::serialize_tag_to_html;
use crate::options::OutputFormat;
use crate::converter::handlers::{
handle_blockquote, handle_code, handle_graphic, handle_img, handle_link, handle_pre,
};
use crate::error::Result;
use crate::options::ConversionOptions;
use crate::converter::context::{Context, InlineCollectorHandle};
use crate::types::structure_collector::StructureCollectorHandle;
/// HTML text after the tier-2 preprocessing passes have run.
///
/// The text is held in a [`Cow`] so that input which no pass rewrites can stay
/// borrowed from the caller instead of being copied.
struct PreprocessedHtml<'a> {
    /// Preprocessed markup; borrowed or owned depending on what the passes returned.
    html: Cow<'a, str>,
}

impl<'a> PreprocessedHtml<'a> {
    /// Preprocesses caller-provided HTML, starting from a borrowed view of it.
    fn new(html: &'a str) -> Self {
        let html = preprocess_for_tier2(Cow::Borrowed(html));
        Self { html }
    }

    /// Preprocesses an owned HTML string produced by a repair step.
    fn from_repaired(repaired_html: String) -> Self {
        let html = preprocess_for_tier2(Cow::Owned(repaired_html));
        Self { html }
    }

    /// Returns the preprocessed markup as a string slice.
    fn as_str(&self) -> &str {
        &self.html
    }

    /// Returns the length of the preprocessed markup in bytes.
    fn len(&self) -> usize {
        self.as_str().len()
    }
}
/// Runs every tier-2 preprocessing pass over `input`, in a fixed order.
///
/// Each pass is applied through `apply_preprocess`, so the text is only
/// replaced when a pass actually produces a rewrite.
fn preprocess_for_tier2(input: Cow<'_, str>) -> Cow<'_, str> {
    type Pass = for<'s> fn(&'s str) -> Cow<'s, str>;

    // Order matters: later passes see the output of earlier ones.
    let passes: [Pass; 6] = [
        strip_script_and_style_tags,
        strip_hidden_elements,
        normalize_bogus_comment_endings,
        normalize_split_closing_tags,
        normalize_unclosed_list_items,
        preprocess_html,
    ];

    let mut html = input;
    for pass in passes {
        html = apply_preprocess(html, pass);
    }
    html
}
/// Runs a single preprocessing pass over `input`, reusing the existing buffer
/// whenever the pass hands back its argument untouched.
///
/// `f` receives the current text and returns either an owned rewrite or a
/// borrowed view. A borrowed view is honored even when it covers only part of
/// the text (for example a trimmed sub-slice); it is never silently replaced
/// by the full input.
///
/// The result is borrowed only when `input` was borrowed and `f` returned a
/// borrowed view; in every other case it is owned.
fn apply_preprocess<'a, F>(input: Cow<'a, str>, f: F) -> Cow<'a, str>
where
    F: for<'b> FnOnce(&'b str) -> Cow<'b, str>,
{
    match input {
        // The pass's result already carries the caller's lifetime, so it can be
        // returned as-is, whether it is a rewrite, the whole input, or a sub-slice.
        Cow::Borrowed(borrowed) => f(borrowed),
        Cow::Owned(owned) => {
            // A borrowed result points into `owned` and cannot outlive it, so it
            // is either recognised as "unchanged" or copied out.
            let replacement = match f(&owned) {
                Cow::Owned(rewritten) => Some(rewritten),
                // Same address and length: the pass returned its whole argument.
                Cow::Borrowed(view) if std::ptr::eq(view, owned.as_str()) => None,
                Cow::Borrowed(view) => Some(view.to_owned()),
            };
            Cow::Owned(replacement.unwrap_or(owned))
        }
    }
}
/// Converts an HTML string into the configured output format.
///
/// The input is preprocessed, parsed with `tl` (with an `html5ever`-based
/// repair as a fallback), optionally scanned for head metadata, and then
/// walked node by node through [`walk_node`].
///
/// # Returns
///
/// A tuple of the rendered output, the document structure (only when a
/// structure collector was supplied and could be reclaimed), and the tables
/// gathered by that collector.
///
/// # Errors
///
/// Returns `ConversionError::ParseError` when the HTML cannot be parsed, even
/// after one repair attempt. With the `visitor` feature enabled, returns
/// `ConversionError::Visitor` when the visitor recorded an error during the walk.
#[cfg_attr(
    any(
        not(feature = "inline-images"),
        not(feature = "metadata"),
        not(feature = "visitor")
    ),
    allow(unused_variables)
)]
pub fn convert_html_impl(
    html: &str,
    options: &ConversionOptions,
    inline_collector: Option<InlineCollectorHandle>,
    #[cfg(feature = "metadata")] metadata_collector: Option<
        crate::metadata::MetadataCollectorHandle,
    >,
    #[cfg(not(feature = "metadata"))] _metadata_collector: Option<()>,
    #[cfg(feature = "visitor")] visitor: Option<crate::visitor::VisitorHandle>,
    #[cfg(not(feature = "visitor"))] _visitor: Option<()>,
    structure_collector: Option<StructureCollectorHandle>,
) -> Result<(
    String,
    Option<crate::types::DocumentStructure>,
    Vec<crate::types::TableData>,
)> {
    let mut preprocessed = PreprocessedHtml::new(html);
    // Markup flagged by `has_custom_element_tags` is repaired up front.
    if has_custom_element_tags(preprocessed.as_str())
        && let Some(repaired_html) = repair_with_html5ever(preprocessed.as_str())
    {
        preprocessed = PreprocessedHtml::from_repaired(repaired_html);
    }

    let parser_options = tl::ParserOptions::default();
    // On parse failure a repair is attempted exactly once. Without this bound,
    // a repair step that keeps producing unparseable output would make the
    // retry loop spin forever.
    let mut repair_attempted = false;
    let mut dom = loop {
        if let Ok(dom) = tl::parse(preprocessed.as_str(), parser_options) {
            break dom;
        }
        if !repair_attempted
            && let Some(repaired_html) = repair_with_html5ever(preprocessed.as_str())
        {
            repair_attempted = true;
            preprocessed = PreprocessedHtml::from_repaired(repaired_html);
            continue;
        }
        return Err(crate::error::ConversionError::ParseError(
            "Failed to parse HTML".to_string(),
        ));
    };
    let mut parser = dom.parser();
    // Reserve the input length plus a quarter as headroom for the output.
    let mut output =
        String::with_capacity(preprocessed.len().saturating_add(preprocessed.len() / 4));
    let mut dom_ctx = build_dom_context(&dom, parser, preprocessed.len());

    // A detected inline/block mis-nesting triggers a repair and a full rebuild
    // of everything derived from the old DOM.
    if has_inline_block_misnest(&dom_ctx, parser)
        && let Some(repaired_html) = repair_with_html5ever(preprocessed.as_str())
    {
        // The old DOM borrows `preprocessed`, so it must go before reassignment.
        drop(dom);
        preprocessed = PreprocessedHtml::from_repaired(repaired_html);
        dom = tl::parse(preprocessed.as_str(), parser_options).map_err(|_| {
            crate::error::ConversionError::ParseError("Failed to parse repaired HTML".to_string())
        })?;
        parser = dom.parser();
        dom_ctx = build_dom_context(&dom, parser, preprocessed.len());
        output = String::with_capacity(preprocessed.len().saturating_add(preprocessed.len() / 4));
    }

    let is_plain_text = options.output_format == OutputFormat::Plain;
    let wants_frontmatter = options.extract_metadata && !options.convert_as_inline;
    #[cfg(feature = "metadata")]
    let wants_document = metadata_collector
        .as_ref()
        .is_some_and(|collector| collector.borrow().wants_document());
    #[cfg(not(feature = "metadata"))]
    let wants_document = false;

    if wants_frontmatter || wants_document {
        let mut head_metadata: Option<BTreeMap<String, String>> = None;
        #[cfg(feature = "metadata")]
        let mut document_lang: Option<String> = None;
        #[cfg(feature = "metadata")]
        let mut document_dir: Option<String> = None;

        for child_handle in dom.children() {
            // Keep the first non-empty metadata map found among the top-level nodes.
            if head_metadata.is_none() {
                let metadata = extract_head_metadata(child_handle, parser, options);
                if !metadata.is_empty() {
                    head_metadata = Some(metadata);
                }
            }
            // Record the first `lang` / `dir` seen on a top-level <html> or <body>.
            #[cfg(feature = "metadata")]
            if wants_document && let Some(tl::Node::Tag(tag)) = child_handle.get(parser) {
                let tag_name = tag.name().as_utf8_str();
                if tag_name == "html" || tag_name == "body" {
                    if document_lang.is_none()
                        && let Some(Some(lang_bytes)) = tag.attributes().get("lang")
                    {
                        document_lang = Some(lang_bytes.as_utf8_str().to_string());
                    }
                    if document_dir.is_none()
                        && let Some(Some(dir_bytes)) = tag.attributes().get("dir")
                    {
                        document_dir = Some(dir_bytes.as_utf8_str().to_string());
                    }
                }
            }
        }

        if wants_frontmatter
            && let Some(metadata) = head_metadata.as_ref()
            && !metadata.is_empty()
        {
            let metadata_frontmatter = format_metadata_frontmatter(metadata);
            output.push_str(&metadata_frontmatter);
        }

        #[cfg(feature = "metadata")]
        if wants_document && let Some(ref collector) = metadata_collector {
            if let Some(metadata) = head_metadata
                && !metadata.is_empty()
            {
                collector.borrow_mut().set_head_metadata(metadata);
            }
            if let Some(lang) = document_lang {
                collector.borrow_mut().set_language(lang);
            }
            if let Some(dir) = document_dir {
                collector.borrow_mut().set_text_direction(dir);
            }
        }
    }

    // A reference collector exists only for reference-style links.
    let reference_collector = if options.link_style == crate::options::LinkStyle::Reference {
        Some(std::rc::Rc::new(std::cell::RefCell::new(
            crate::converter::reference_collector::ReferenceCollector::new(),
        )))
    } else {
        None
    };

    // The parameter names differ per feature combination, hence one
    // `Context::new` call per combination.
    #[cfg(all(feature = "metadata", feature = "visitor"))]
    let mut ctx = Context::new(
        options,
        inline_collector,
        metadata_collector,
        visitor,
        structure_collector.as_ref().map(std::rc::Rc::clone),
        reference_collector.as_ref().map(std::rc::Rc::clone),
    );
    #[cfg(all(feature = "metadata", not(feature = "visitor")))]
    let mut ctx = Context::new(
        options,
        inline_collector,
        metadata_collector,
        _visitor,
        structure_collector.as_ref().map(std::rc::Rc::clone),
        reference_collector.as_ref().map(std::rc::Rc::clone),
    );
    #[cfg(all(not(feature = "metadata"), feature = "visitor"))]
    let mut ctx = Context::new(
        options,
        inline_collector,
        _metadata_collector,
        visitor,
        structure_collector.as_ref().map(std::rc::Rc::clone),
        reference_collector.as_ref().map(std::rc::Rc::clone),
    );
    #[cfg(all(not(feature = "metadata"), not(feature = "visitor")))]
    let mut ctx = Context::new(
        options,
        inline_collector,
        _metadata_collector,
        _visitor,
        structure_collector.as_ref().map(std::rc::Rc::clone),
        reference_collector.as_ref().map(std::rc::Rc::clone),
    );

    // Resolve exclusion selectors once into a set of node ids for the walk.
    if !options.exclude_selectors.is_empty() {
        let mut excluded: HashSet<u32> = HashSet::new();
        for selector in &options.exclude_selectors {
            if let Some(iter) = dom.query_selector(selector) {
                for handle in iter {
                    excluded.insert(handle.get_inner());
                }
            }
        }
        ctx.set_excluded_node_ids(excluded);
    }

    for child_handle in dom.children() {
        walk_node(
            child_handle,
            parser,
            &mut output,
            options,
            &ctx,
            0,
            &dom_ctx,
        );
    }

    #[cfg(feature = "visitor")]
    if let Some(err) = ctx.visitor_error.borrow().as_ref() {
        return Err(crate::error::ConversionError::Visitor(err.clone()));
    }

    // Dropping the context releases its `Rc` clones so the collectors below
    // can be unwrapped.
    drop(ctx);

    if let Some(rc) = reference_collector
        && let Ok(collector) = std::rc::Rc::try_unwrap(rc)
    {
        let ref_section = collector.into_inner().finish();
        if !ref_section.is_empty() {
            // Separate the reference section from the body by one blank line.
            let trimmed_len = output.trim_end_matches('\n').len();
            output.truncate(trimmed_len);
            output.push_str("\n\n");
            output.push_str(&ref_section);
        }
    }

    // Plain-text output is produced by a separate extraction over the DOM;
    // the walked output is discarded in that case.
    let output = if is_plain_text {
        extract_plain_text(&dom, parser, options)
    } else {
        trim_line_end_whitespace(&mut output);
        collapse_excess_blank_lines(&mut output);
        output
    };

    let (document, tables) = finish_structure_collector(structure_collector);
    Ok((output, document, tables))
}
/// Finalises the optional structure collector into its document and tables.
///
/// Returns `(None, vec![])` when no collector was supplied or when the handle
/// still has other owners and therefore cannot be unwrapped.
fn finish_structure_collector(
    sc: Option<StructureCollectorHandle>,
) -> (
    Option<crate::types::DocumentStructure>,
    Vec<crate::types::TableData>,
) {
    let Some(sole_owner) = sc.and_then(|shared| std::rc::Rc::try_unwrap(shared).ok()) else {
        return (None, Vec::new());
    };
    let (document, tables) = sole_owner.into_inner().finish();
    (Some(document), tables)
}
pub fn walk_node(
node_handle: &tl::NodeHandle,
parser: &crate::tl_types::Parser,
output: &mut String,
options: &ConversionOptions,
ctx: &Context,
depth: usize,
dom_ctx: &DomContext,
) {
let Some(node) = node_handle.get(parser) else {
return;
};
if let Some(max) = options.max_depth
&& depth >= max
{
return;
}
match node {
tl::Node::Raw(bytes) => {
let raw = bytes.as_utf8_str();
crate::converter::text_node::process_text_node(
raw.as_ref(),
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
tl::Node::Tag(tag) => {
let tag_name = match dom_ctx.tag_info(node_handle.get_inner(), parser) {
Some(info) => Cow::Borrowed(info.name.as_str()),
None => normalized_tag_name(tag.name().as_utf8_str()),
};
#[cfg(feature = "visitor")]
if let Some(ref visitor_handle) = ctx.visitor {
use crate::converter::visitor_hooks::{VisitAction, handle_visitor_element_start};
let action = handle_visitor_element_start(
visitor_handle,
tag_name.as_ref(),
node_handle,
tag,
parser,
output,
ctx,
depth,
dom_ctx,
);
match action {
VisitAction::Continue => {}
VisitAction::Skip => return,
VisitAction::Custom => return,
VisitAction::Error => return,
}
}
#[cfg(feature = "visitor")]
let visitor_is_active = ctx.visitor.is_some();
#[cfg(not(feature = "visitor"))]
let visitor_is_active = false;
if !visitor_is_active && should_drop_for_preprocessing(tag_name.as_ref(), tag, options)
{
trim_trailing_whitespace(output);
return;
}
if !ctx.excluded_node_ids.is_empty()
&& ctx.excluded_node_ids.contains(&node_handle.get_inner())
{
trim_trailing_whitespace(output);
return;
}
if ctx.strip_tags.contains(tag_name.as_ref()) {
let children = tag.children();
{
for child_handle in children.top().iter() {
walk_node(
child_handle,
parser,
output,
options,
ctx,
depth + 1,
dom_ctx,
);
}
}
return;
}
if ctx.preserve_tags.contains(tag_name.as_ref()) {
let html = serialize_tag_to_html(node_handle, parser);
output.push_str(&html);
return;
}
#[cfg(feature = "metadata")]
if matches!(tag_name.as_ref(), "html" | "head" | "body")
&& ctx.metadata_wants_document
&& let Some(ref collector) = ctx.metadata_collector
{
let mut c = collector.borrow_mut();
if let Some(lang) = tag.attributes().get("lang").flatten() {
c.set_language(lang.as_utf8_str().to_string());
}
if let Some(dir) = tag.attributes().get("dir").flatten() {
c.set_text_direction(dir.as_utf8_str().to_string());
}
}
#[cfg_attr(not(feature = "visitor"), allow(unused_variables))]
let element_output_start = output.len();
match tag_name.as_ref() {
"h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
crate::converter::block::heading::handle(
&tag_name,
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"p" => {
crate::converter::block::paragraph::handle(
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"strong" | "b" | "em" | "i" | "mark" | "del" | "s" | "ins" | "u" | "small"
| "sub" | "sup" | "kbd" | "samp" | "var" | "dfn" | "abbr" | "ruby" | "rb"
| "rt" | "rp" | "rtc" | "span" => {
crate::converter::inline::dispatch_inline_handler(
&tag_name,
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"a" => handle_link(
node_handle,
tag,
parser,
output,
options,
ctx,
depth,
dom_ctx,
),
"img" => handle_img(
node_handle,
tag,
parser,
output,
options,
ctx,
depth,
dom_ctx,
),
"graphic" => handle_graphic(
node_handle,
tag,
parser,
output,
options,
ctx,
depth,
dom_ctx,
),
"code" => handle_code(
node_handle,
tag,
parser,
output,
options,
ctx,
depth,
dom_ctx,
),
"pre" => handle_pre(
node_handle,
tag,
parser,
output,
options,
ctx,
depth,
dom_ctx,
),
"blockquote" => handle_blockquote(
node_handle,
tag,
parser,
output,
options,
ctx,
depth,
dom_ctx,
),
"time" | "data" => {
crate::converter::block::container::handle_passthrough(
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"wbr" | "thead" | "tbody" | "tfoot" | "tr" | "th" | "td" | "source" => {
crate::converter::block::container::handle_noop(
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"br" => crate::converter::block::line_break::handle(
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
),
"hr" => crate::converter::block::horizontal_rule::handle(
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
),
"div" => {
crate::converter::block::div::handle(
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"caption" => crate::converter::block::table::handle_caption(
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
),
"table" => crate::converter::block::table::handle_table_with_context(
node_handle,
parser,
output,
options,
ctx,
dom_ctx,
depth,
),
"ul" | "ol" | "li" | "dl" | "dt" | "dd" => {
crate::converter::list::dispatch_list_handler(
&tag_name,
node_handle,
tag,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"article" | "section" | "nav" | "aside" | "header" | "footer" | "main" => {
crate::converter::semantic::dispatch_semantic_handler(
&tag_name,
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"q" => {
crate::converter::semantic::dispatch_semantic_handler(
&tag_name,
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"figure" | "figcaption" => {
crate::converter::semantic::dispatch_semantic_handler(
&tag_name,
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"details" | "summary" | "dialog" | "menu" => {
crate::converter::semantic::dispatch_semantic_handler(
&tag_name,
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"audio" | "video" | "picture" | "iframe" | "svg" | "math" => {
crate::converter::media::dispatch_media_handler(
&tag_name,
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"form" | "fieldset" | "legend" | "label" | "input" | "textarea" | "select"
| "option" | "optgroup" | "button" | "progress" | "meter" | "output"
| "datalist" => {
crate::converter::form::dispatch_form_handler(
&tag_name,
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"head" | "script" | "style" => {
crate::converter::metadata::handle(
&tag_name,
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
"body" | "html" => {
crate::converter::block::container::handle_structural_container(
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
_ => {
crate::converter::block::unknown::handle(
node_handle,
parser,
output,
options,
ctx,
depth,
dom_ctx,
);
}
}
#[cfg(feature = "visitor")]
if let Some(ref visitor_handle) = ctx.visitor {
use crate::converter::visitor_hooks::handle_visitor_element_end;
handle_visitor_element_end(
visitor_handle,
tag_name.as_ref(),
node_handle,
tag,
parser,
output,
element_output_start,
ctx,
depth,
dom_ctx,
);
}
}
tl::Node::Comment(_) => {}
}
}
#[cfg(test)]
mod tests {
    use std::borrow::Cow;

    use super::{PreprocessedHtml, preprocess_for_tier2};
    use crate::options::ConversionOptions;

    /// Input that no preprocessing pass rewrites must stay borrowed.
    #[test]
    fn clean_preprocessing_path_stays_borrowed() {
        let html = "<main><h1>Hello</h1><p>World</p></main>";
        let preprocessed = PreprocessedHtml::new(html);
        assert!(matches!(preprocessed.html, Cow::Borrowed(_)));
        assert_eq!(preprocessed.as_str(), html);
    }

    /// A pass that rewrites the input (here: script removal) yields owned text.
    #[test]
    fn rewriting_preprocessing_paths_become_owned() {
        let html = "<p>Before</p><script>ignored()</script><p>After</p>";
        let preprocessed = preprocess_for_tier2(Cow::Borrowed(html));
        assert!(matches!(preprocessed, Cow::Owned(_)));
        assert!(!preprocessed.contains("ignored()"));
        assert!(preprocessed.contains("<p>Before</p>"));
        assert!(preprocessed.contains("<p>After</p>"));
    }

    /// A closing tag split across lines is normalised before parsing.
    #[test]
    fn split_closing_tag_rewrite_still_converts() {
        let html = "<p><a href=\"https://example.com\">Example</a\n></p>";
        // All four collector slots are passed unconditionally. The signature
        // keeps the metadata and visitor parameters (typed `Option<()>`) even
        // when those features are off, so gating an argument behind `cfg`
        // would leave the call one argument short in that configuration.
        let result = super::convert_html_impl(
            html,
            &ConversionOptions::default(),
            None,
            None,
            None,
            None,
        )
        .expect("split closing tag should be normalized before parsing");
        assert!(result.0.contains("[Example](https://example.com)"));
    }
}