use std::borrow::Cow;
use memchr::{memchr, memchr3, memmem};
#[cfg(any(feature = "metadata", feature = "inline-images"))]
use crate::ConversionError;
use crate::error::Result;
use crate::options::{ConversionOptions, WhitespaceMode};
use crate::text;
use crate::types::ConversionResult;
use crate::validation::{Utf16Encoding, detect_utf16_encoding, validate_input};
#[cfg(feature = "metadata")]
use crate::{HtmlMetadata, MetadataConfig};
pub fn convert(
html: &str,
options: impl Into<Option<ConversionOptions>>,
) -> Result<ConversionResult> {
convert_inner(html, options.into().unwrap_or_default())
}
/// Runs the conversion pipeline for `html` with already-resolved `options`.
///
/// Strategies are tried in this order:
/// 1. When wrapping is disabled and the raw input contains no `<`, `&`, `\r`
///    or NUL byte, it is validated and emitted through the plain-text fast path.
/// 2. `options.tier_strategy` selects a backend: `FastDom` and `Mdream` return
///    directly, `Tier2` falls through, `Auto` attempts tier 1 when the router
///    allows it, and `Tier1` (test / `testkit` builds only) always attempts it.
/// 3. Everything else is normalized and handed to `convert_html_impl`, with
///    optional metadata, inline-image and document-structure collectors.
///
/// # Errors
/// Propagates failures from input validation/normalization, from the fast-DOM
/// backend, from building the inline-image collector, from `convert_html_impl`,
/// and from recovering collector state once conversion has finished.
fn convert_inner(html: &str, options: ConversionOptions) -> Result<ConversionResult> {
// `Rc`/`RefCell` are imported only when a collector feature is compiled in.
#[cfg(any(feature = "metadata", feature = "inline-images"))]
use std::cell::RefCell;
#[cfg(any(feature = "metadata", feature = "inline-images"))]
use std::rc::Rc;
// Fast path: input without markup, entities, CR or NUL bytes is validated and
// rendered directly, with entity decoding switched off.
if !options.wrap && can_fast_text_only_before_normalize(html) {
validate_input(html)?;
let markdown = fast_text_only_unchecked(html, &options, false);
return Ok(conversion_result_from_content(markdown));
}
// Filled in when a tier-1 attempt has already normalized the input, so the
// general path below does not normalize a second time.
let mut precomputed_normalized: Option<Cow<'_, str>> = None;
match options.tier_strategy {
// Dedicated fast-DOM backend; its result is returned without any fallback.
crate::options::TierStrategy::FastDom => {
let normalized = normalize_input_for_fast_dom(html)?;
let markdown = crate::converter::fast_dom::convert(normalized.as_ref(), &options)?;
return Ok(conversion_result_from_content(markdown));
}
// Mdream adapter; same lightweight normalization, result returned directly.
crate::options::TierStrategy::Mdream => {
let normalized = normalize_input_for_fast_dom(html)?;
let markdown = crate::mdream_adapter::convert(normalized.as_ref(), &options);
return Ok(conversion_result_from_content(markdown));
}
crate::options::TierStrategy::Tier2 => {
// No early return: continue with the general path below.
}
crate::options::TierStrategy::Auto => {
// Tier 1 is attempted only if the options permit it and the router
// classifies the prescan report as tier-1 material.
if crate::converter::tier1::router::options_allow_tier1(&options) {
let normalized = normalize_input(html)?;
let (cleaned, report) = crate::converter::prescan::run(normalized.as_ref());
let decision = crate::converter::tier1::router::classify(&report, &options);
if decision == crate::converter::tier1::RouterDecision::Tier1 {
match crate::converter::tier1::run(cleaned.as_ref(), &report, &options) {
Ok(markdown) => {
return Ok(crate::types::ConversionResult {
content: Some(markdown),
document: None,
tables: Vec::new(),
warnings: Vec::new(),
#[cfg(feature = "metadata")]
metadata: crate::metadata::HtmlMetadata::default(),
#[cfg(feature = "inline-images")]
images: Vec::new(),
});
}
// Tier 1 bailed out; keep the normalized text for the general path.
Err(_bail) => {
precomputed_normalized = Some(normalized);
}
}
} else {
// The router declined tier 1; the normalized text is still reused.
precomputed_normalized = Some(normalized);
}
}
}
// Forced tier 1, compiled only for tests or the `testkit` feature. A bail
// still falls back to the general path with the normalized text.
#[cfg(any(test, feature = "testkit"))]
crate::options::TierStrategy::Tier1 => {
let normalized = normalize_input(html)?;
let (cleaned, report) = crate::converter::prescan::run(normalized.as_ref());
match crate::converter::tier1::run(cleaned.as_ref(), &report, &options) {
Ok(markdown) => {
return Ok(crate::types::ConversionResult {
content: Some(markdown),
document: None,
tables: Vec::new(),
warnings: Vec::new(),
#[cfg(feature = "metadata")]
metadata: crate::metadata::HtmlMetadata::default(),
#[cfg(feature = "inline-images")]
images: Vec::new(),
});
}
Err(_bail) => {
precomputed_normalized = Some(normalized);
}
}
}
}
// The visitor is cloned out of the options so it can be passed by value to
// the converter while `options` is still borrowed.
#[cfg(feature = "visitor")]
let visitor = options.visitor.clone();
// Reuse the normalization done by a tier-1 attempt, otherwise normalize now.
let normalized_html = match precomputed_normalized {
Some(n) => n,
None => normalize_input(html)?,
};
// Second chance at the text-only path, this time on normalized input and
// with entity decoding enabled.
if !options.wrap
&& let Some(markdown) = fast_text_only(normalized_html.as_ref(), &options)
{
return Ok(conversion_result_from_content(markdown));
}
// Collector switches; each is forced to `false` when its feature is compiled out.
#[cfg(feature = "metadata")]
let wants_metadata = options.extract_metadata;
#[cfg(not(feature = "metadata"))]
let wants_metadata = false;
#[cfg(feature = "inline-images")]
let wants_images = options.extract_images;
#[cfg(not(feature = "inline-images"))]
let wants_images = false;
// Collectors are shared through `Rc<RefCell<_>>`: a cloned handle goes to the
// converter and the original handle stays here to be unwrapped afterwards.
#[cfg(feature = "metadata")]
let metadata_collector = if wants_metadata {
Some(Rc::new(RefCell::new(
crate::metadata::MetadataCollector::new(MetadataConfig::default()),
)))
} else {
None
};
#[cfg(feature = "inline-images")]
let image_collector = if wants_images {
use crate::inline_images::{DEFAULT_INLINE_IMAGE_LIMIT, InlineImageConfig as IIC};
Some(Rc::new(RefCell::new(
crate::inline_images::InlineImageCollector::new(IIC::new(DEFAULT_INLINE_IMAGE_LIMIT))?,
)))
} else {
None
};
// Fully qualified paths are used here because the `Rc`/`RefCell` imports at
// the top of the function are feature-gated.
let structure_collector: Option<
std::rc::Rc<std::cell::RefCell<crate::types::StructureCollector>>,
> = if options.include_document_structure {
Some(std::rc::Rc::new(std::cell::RefCell::new(
crate::types::StructureCollector::new(),
)))
} else {
None
};
// Placeholder binding so the calls below have a `visitor` argument even when
// the `visitor` feature is disabled.
#[cfg(not(feature = "visitor"))]
let visitor: Option<()> = None;
// Exactly one of the four cfg-gated blocks below is compiled, depending on
// which of the `metadata` / `inline-images` features are enabled; a disabled
// collector is passed as `None`.
let (markdown, document, tables) = {
#[cfg(all(feature = "metadata", feature = "inline-images"))]
{
crate::converter::convert_html_impl(
normalized_html.as_ref(),
&options,
image_collector.as_ref().map(Rc::clone),
metadata_collector.as_ref().map(Rc::clone),
visitor,
structure_collector,
)?
}
#[cfg(all(feature = "metadata", not(feature = "inline-images")))]
{
crate::converter::convert_html_impl(
normalized_html.as_ref(),
&options,
None,
metadata_collector.as_ref().map(Rc::clone),
visitor,
structure_collector,
)?
}
#[cfg(all(not(feature = "metadata"), feature = "inline-images"))]
{
crate::converter::convert_html_impl(
normalized_html.as_ref(),
&options,
image_collector.as_ref().map(Rc::clone),
None,
visitor,
structure_collector,
)?
}
#[cfg(all(not(feature = "metadata"), not(feature = "inline-images")))]
{
crate::converter::convert_html_impl(
normalized_html.as_ref(),
&options,
None,
None,
visitor,
structure_collector,
)?
}
};
// Wrapping, when requested, is applied to the finished markdown.
let markdown = if options.wrap {
crate::wrapper::wrap_markdown(&markdown, &options)
} else {
markdown
};
// Reclaim sole ownership of each collector. `Rc::try_unwrap` fails, and is
// reported as a conversion error, if another handle is still alive.
#[cfg(feature = "metadata")]
let metadata = if let Some(collector) = metadata_collector {
Rc::try_unwrap(collector)
.map_err(|_| ConversionError::Other("failed to recover metadata state".to_string()))?
.into_inner()
.finish()
} else {
HtmlMetadata::default()
};
#[cfg(feature = "inline-images")]
let (images, image_warnings) = if let Some(collector) = image_collector {
let c = Rc::try_unwrap(collector)
.map_err(|_| {
ConversionError::Other("failed to recover inline image state".to_string())
})?
.into_inner();
c.finish()
} else {
(Vec::new(), Vec::new())
};
// Inline-image warnings are re-labelled as `ImageExtractionFailed` processing
// warnings; without the feature the warning list is always empty.
#[cfg(feature = "inline-images")]
let warnings: Vec<crate::types::ProcessingWarning> = image_warnings
.into_iter()
.map(|w| crate::types::ProcessingWarning {
kind: crate::types::WarningKind::ImageExtractionFailed,
message: w.message,
})
.collect();
#[cfg(not(feature = "inline-images"))]
let warnings: Vec<crate::types::ProcessingWarning> = Vec::new();
// Touch both flags so feature combinations that never read them still compile
// without unused-variable warnings.
let _ = wants_metadata;
let _ = wants_images;
Ok(ConversionResult {
content: Some(markdown),
document,
#[cfg(feature = "metadata")]
metadata,
tables,
#[cfg(feature = "inline-images")]
images,
warnings,
})
}
/// Wraps finished markdown in a `ConversionResult` whose remaining fields keep
/// their `Default` values.
fn conversion_result_from_content(markdown: String) -> ConversionResult {
    let content = Some(markdown);
    ConversionResult {
        content,
        ..Default::default()
    }
}
/// Produces the canonical text that is handed to the parsers.
///
/// The steps, in order:
/// 1. UTF-16 input is decoded when detected (`decode_utf16_if_needed`).
/// 2. The decoded text is validated (`validate_input`).
/// 3. NUL characters are removed.
/// 4. `\r\n` and lone `\r` are rewritten to `\n`.
/// 5. Compact self-closing tags are rewritten by `fix_xhtml_self_closing`.
///
/// The input is borrowed for as long as no step has to modify it.
///
/// # Errors
/// Returns the error reported by `validate_input`.
fn normalize_input(html: &str) -> Result<Cow<'_, str>> {
    // Rewrites CR LF and lone CR to LF. A buffer without any CR is handed back
    // untouched instead of being copied.
    fn unify_owned_line_endings(text: String) -> String {
        if text.contains('\r') {
            text.replace("\r\n", "\n").replace('\r', "\n")
        } else {
            text
        }
    }

    let line_normalized = match decode_utf16_if_needed(html) {
        Cow::Borrowed(borrowed) => {
            validate_input(borrowed)?;
            match strip_nul_bytes(borrowed) {
                // Nothing was stripped: stay borrowed unless CRs force a copy.
                Cow::Borrowed(clean) => normalize_line_endings(clean),
                // Already owned: reuse that buffer rather than cloning it again.
                Cow::Owned(clean) => Cow::Owned(unify_owned_line_endings(clean)),
            }
        }
        Cow::Owned(mut owned) => {
            validate_input(&owned)?;
            if owned.contains('\0') {
                // In-place removal; no second buffer is needed.
                owned.retain(|ch| ch != '\0');
            }
            Cow::Owned(unify_owned_line_endings(owned))
        }
    };
    Ok(fix_xhtml_self_closing(line_normalized))
}
/// Lightweight normalization used by the `FastDom` and `Mdream` strategies.
///
/// Input that contains a NUL or carriage-return byte goes through the full
/// `normalize_input` pipeline. Any other input is only validated and then
/// borrowed unchanged, which means the XHTML self-closing rewrite performed by
/// `normalize_input` is skipped on that path.
///
/// # Errors
/// Returns the error reported by `validate_input` or `normalize_input`.
fn normalize_input_for_fast_dom(html: &str) -> Result<Cow<'_, str>> {
    if fast_dom_needs_full_normalize(html.as_bytes()) {
        return normalize_input(html);
    }
    validate_input(html)?;
    Ok(Cow::Borrowed(html))
}
/// SIMD build: asks `simd_scan::contains_any2` about the NUL and
/// carriage-return bytes (presumably true when either occurs in `bytes`).
#[cfg(all(feature = "simd", nightly))]
#[inline]
fn fast_dom_needs_full_normalize(bytes: &[u8]) -> bool {
    const NUL: u8 = 0;
    const CARRIAGE_RETURN: u8 = b'\r';
    crate::simd_scan::contains_any2(bytes, NUL, CARRIAGE_RETURN)
}
/// Portable build: true when `bytes` contain a NUL or a carriage-return byte,
/// i.e. when the full normalization pipeline has work to do.
#[cfg(not(all(feature = "simd", nightly)))]
#[inline]
fn fast_dom_needs_full_normalize(bytes: &[u8]) -> bool {
    const NUL: u8 = 0;
    const CARRIAGE_RETURN: u8 = b'\r';
    let first_hit = memchr::memchr2(NUL, CARRIAGE_RETURN, bytes);
    first_hit.is_some()
}
/// Inserts a space before `/>` in compact self-closing tags, turning `<br/>`
/// into `<br />`.
///
/// Only the exact shape `<` + ASCII letter + tag-name bytes + `/>` is
/// rewritten; tags with attributes or with whitespace before the slash are
/// left alone. The input `Cow` is returned unchanged when nothing matches.
fn fix_xhtml_self_closing(html: Cow<'_, str>) -> Cow<'_, str> {
    let rewritten: Option<String> = {
        let source: &str = html.as_ref();
        let raw = source.as_bytes();
        if bytes_contain_pair(raw, b'/', b'>') {
            // Allocated lazily, on the first tag that actually needs a rewrite.
            let mut patched: Option<String> = None;
            let mut copied_upto = 0usize;
            let mut cursor = 0usize;
            // The shortest candidate (`<a/`) needs three bytes from `cursor`.
            while cursor + 3 <= raw.len() {
                let opens_tag = raw[cursor] == b'<' && raw[cursor + 1].is_ascii_alphabetic();
                if !opens_tag {
                    cursor += 1;
                    continue;
                }
                let name_tail = raw[cursor + 2..]
                    .iter()
                    .take_while(|&&byte| is_html_tag_name_byte(byte))
                    .count();
                let name_end = cursor + 2 + name_tail;
                if raw[name_end..].starts_with(b"/>") {
                    let buffer =
                        patched.get_or_insert_with(|| String::with_capacity(source.len() + 4));
                    // `name_end` sits on the ASCII `/`, so it is a char boundary.
                    buffer.push_str(&source[copied_upto..name_end]);
                    buffer.push_str(" />");
                    cursor = name_end + 2;
                    copied_upto = cursor;
                } else {
                    cursor += 1;
                }
            }
            patched.map(|mut buffer| {
                buffer.push_str(&source[copied_upto..]);
                buffer
            })
        } else {
            None
        }
    };
    match rewritten {
        Some(text) => Cow::Owned(text),
        None => html,
    }
}
/// Returns true for bytes accepted inside a tag name by the self-closing
/// rewrite: ASCII letters and digits plus `_`, `:`, `.` and `-`.
const fn is_html_tag_name_byte(byte: u8) -> bool {
    byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'.' | b':' | b'_')
}
/// Decodes `html` from UTF-16 when its bytes look like UTF-16, otherwise
/// returns it borrowed.
///
/// Detection is attempted only when the input contains a NUL byte. The
/// original text is also kept when no encoding is detected or when decoding
/// yields an empty string.
fn decode_utf16_if_needed(html: &str) -> Cow<'_, str> {
    let raw = html.as_bytes();
    if bytes_contain_byte(raw, 0) {
        if let Some(encoding) = detect_utf16_encoding(raw) {
            let text = decode_utf16_bytes(raw, encoding);
            if !text.is_empty() {
                return Cow::Owned(text);
            }
        }
    }
    Cow::Borrowed(html)
}
/// Decodes `bytes` as UTF-16 in the byte order given by `encoding`.
///
/// * Bytes are consumed in pairs; a trailing odd byte is ignored.
/// * Unpaired surrogates become U+FFFD, as with `String::from_utf16_lossy`.
/// * For the BOM variants every leading U+FEFF code unit is dropped.
///
/// Code units are decoded straight from the byte pairs, so no intermediate
/// `Vec<u16>` is built and the BOM is skipped before decoding instead of
/// trimming and re-copying the decoded string afterwards.
fn decode_utf16_bytes(bytes: &[u8], encoding: Utf16Encoding) -> String {
    const BYTE_ORDER_MARK: u16 = 0xFEFF;

    let (is_little_endian, skip_bom) = match encoding {
        Utf16Encoding::BomLe => (true, true),
        Utf16Encoding::BomBe => (false, true),
        Utf16Encoding::NoBomLe => (true, false),
        Utf16Encoding::NoBomBe => (false, false),
    };
    let mut units = bytes
        .chunks_exact(2)
        .map(move |pair| {
            let raw = [pair[0], pair[1]];
            if is_little_endian {
                u16::from_le_bytes(raw)
            } else {
                u16::from_be_bytes(raw)
            }
        })
        .peekable();
    if skip_bom {
        // A BOM is a single BMP code unit, so skipping units here is the same
        // as trimming leading U+FEFF characters from the decoded text.
        while units.next_if_eq(&BYTE_ORDER_MARK).is_some() {}
    }
    char::decode_utf16(units)
        .map(|unit| unit.unwrap_or(char::REPLACEMENT_CHARACTER))
        .collect()
}
/// Removes every NUL character from `html`, borrowing the input when it
/// contains none.
fn strip_nul_bytes(html: &str) -> Cow<'_, str> {
    if !bytes_contain_byte(html.as_bytes(), 0) {
        return Cow::Borrowed(html);
    }
    let without_nuls = html.replace('\0', "");
    Cow::Owned(without_nuls)
}
/// Rewrites `\r\n` and lone `\r` to `\n`, borrowing the input when it contains
/// no carriage return.
fn normalize_line_endings(html: &str) -> Cow<'_, str> {
    if !bytes_contain_byte(html.as_bytes(), b'\r') {
        return Cow::Borrowed(html);
    }
    // CR LF pairs first, so a pair collapses to one newline rather than two.
    let without_crlf = html.replace("\r\n", "\n");
    Cow::Owned(without_crlf.replace('\r', "\n"))
}
/// True when the raw input can take the text-only path without any
/// normalization: it contains no `<`, `&`, carriage return or NUL byte.
fn can_fast_text_only_before_normalize(html: &str) -> bool {
    let has_blocker = bytes_contain_any4(html.as_bytes(), b'<', b'&', b'\r', 0);
    !has_blocker
}
/// Renders `html` through the text-only path, with entity decoding enabled,
/// when it contains no `<` byte; returns `None` otherwise.
fn fast_text_only(html: &str, options: &ConversionOptions) -> Option<String> {
    let has_markup = bytes_contain_byte(html.as_bytes(), b'<');
    (!has_markup).then(|| fast_text_only_unchecked(html, options, true))
}
/// Renders markup-free text as the final output string.
///
/// The caller is responsible for ensuring `html` contains no tags. Steps:
/// 1. Decode HTML entities when `decode_entities` is set.
/// 2. With `options.strip_newlines`, replace every CR and LF with a space.
/// 3. Drop trailing newlines; an empty remainder yields an empty string.
/// 4. Normalize whitespace when `options.whitespace_mode` is `Normalized`.
/// 5. Escape the text unless the output format is `Plain` or no escape flag
///    is enabled.
/// 6. Trim trailing spaces and tabs, then append a single `\n`.
fn fast_text_only_unchecked(
    html: &str,
    options: &ConversionOptions,
    decode_entities: bool,
) -> String {
    let source = if decode_entities {
        text::decode_html_entities_cow(html)
    } else {
        Cow::Borrowed(html)
    };

    let line_breaks: &[char] = &['\r', '\n'];
    let flattened = if options.strip_newlines && source.contains(line_breaks) {
        Cow::Owned(source.replace(line_breaks, " "))
    } else {
        source
    };

    let body = flattened.trim_end_matches('\n');
    if body.is_empty() {
        return String::new();
    }

    let collapsed = if options.whitespace_mode == WhitespaceMode::Normalized {
        text::normalize_whitespace_cow(body)
    } else {
        Cow::Borrowed(body)
    };

    let plain_output = options.output_format == crate::options::OutputFormat::Plain;
    let any_escape_enabled = options.escape_misc
        || options.escape_asterisks
        || options.escape_underscores
        || options.escape_ascii;
    let rendered = if !plain_output && any_escape_enabled {
        text::escape(
            collapsed.as_ref(),
            options.escape_misc,
            options.escape_asterisks,
            options.escape_underscores,
            options.escape_ascii,
        )
        .into_owned()
    } else {
        collapsed.into_owned()
    };

    // Trailing spaces and tabs are dropped before the final newline is added.
    let kept = rendered.trim_end_matches(|ch: char| ch == ' ' || ch == '\t');
    let mut output = String::with_capacity(kept.len() + 1);
    output.push_str(kept);
    output.push('\n');
    output
}
/// True when `needle` occurs anywhere in `bytes`.
#[inline]
fn bytes_contain_byte(bytes: &[u8], needle: u8) -> bool {
    let first_hit = memchr(needle, bytes);
    first_hit.is_some()
}
/// True when any of the four bytes `a`, `b`, `c`, `d` occurs in `bytes`.
///
/// The first three are searched together with `memchr3`; `d` is searched in a
/// second pass only if that first search finds nothing.
#[inline]
fn bytes_contain_any4(bytes: &[u8], a: u8, b: u8, c: u8, d: u8) -> bool {
    if memchr3(a, b, c, bytes).is_some() {
        return true;
    }
    memchr(d, bytes).is_some()
}
/// True when `first` immediately followed by `second` occurs in `bytes`.
#[inline]
fn bytes_contain_pair(bytes: &[u8], first: u8, second: u8) -> bool {
    let needle = [first, second];
    memmem::find(bytes, &needle).is_some()
}