#[cfg(feature = "visitor")]
use crate::converter::utility::content::collect_tag_attributes;
use crate::converter::utility::content::{
collect_link_label_text, escape_link_label, normalize_link_label,
};
use crate::converter::utility::preprocessing::sanitize_markdown_url;
use crate::options::ConversionOptions;
use crate::tl_types::Parser;
#[cfg(any(feature = "metadata", feature = "visitor"))]
use std::collections::BTreeMap;
use tl::NodeHandle;
// Local shorthand for the converter's per-walk state type.
type Context = crate::converter::Context;
// Local shorthand for the converter's DOM lookup type.
type DomContext = crate::converter::DomContext;
/// Renders a link element as Markdown and appends the result to `output`.
///
/// The node is reported to visitors with the tag name `a` and `NodeType::Link`.
/// The function returns without writing anything when `node_handle` cannot be
/// resolved through `parser` or the resolved node is not a tag.
///
/// Rendering paths, tried in this order:
///
/// 1. No `href` value: the children are walked straight into `output` with the
///    unchanged `ctx`; no link syntax is produced.
/// 2. `ctx.in_link` is set: the children are walked straight into `output`; no
///    link syntax is produced.
/// 3. Autolink: `<target>` is emitted when `options.autolinks` is on,
///    `options.default_title` is off, the href is non-empty and has a URI
///    scheme, and the element text equals the href (or the href without its
///    `mailto:` prefix).
/// 4. Single heading child (as reported by `find_single_heading_child`): the
///    heading content becomes the link label and the finished link is handed
///    to `push_heading`.
/// 5. General case: a label is built from the children, optionally offered to
///    the visitor (feature `visitor`), written to `output`, and recorded in
///    the metadata collector (feature `metadata`).
///
/// Paths 3 and 4 return before the visitor and metadata code is reached.
///
/// # Parameters
/// - `node_handle`: handle of the element to convert.
/// - `parser`: parser used to resolve node handles.
/// - `output`: buffer the Markdown is appended to.
/// - `options`: conversion options (`autolinks`, `default_title`, and whatever
///   `append_markdown_link` reads).
/// - `ctx`: current conversion context; cloned with adjusted flags for nested
///   walks.
/// - `depth`: current depth; children are walked with `depth + 1`.
/// - `dom_ctx`: DOM lookup helper passed through to the text and walk helpers.
pub fn handle(
node_handle: &NodeHandle,
parser: &Parser,
output: &mut String,
options: &ConversionOptions,
ctx: &Context,
depth: usize,
dom_ctx: &DomContext,
) {
use crate::converter::block::heading::{heading_allows_inline_images, push_heading};
use crate::converter::utility::content::normalized_tag_name;
// `serialize_node` is only referenced inside the `visitor`-gated branch below,
// so the import is unused when that feature is disabled.
#[allow(unused_imports)]
use crate::converter::utility::serialization::serialize_node;
use crate::converter::{find_single_heading_child, get_text_content, walk_node};
let Some(node) = node_handle.get(parser) else {
return;
};
let tl::Node::Tag(tag) = node else {
return;
};
// The href is entity-decoded and then passed through `sanitize_markdown_url`.
// An `href` attribute without a value yields `None` here.
let href_attr = tag.attributes().get("href").flatten().map(|v| {
let decoded = crate::text::decode_html_entities(&v.as_utf8_str());
sanitize_markdown_url(&decoded).into_owned()
});
// The title is copied as-is (no entity decoding is applied in this function).
let title = tag
.attributes()
.get("title")
.flatten()
.map(|v| v.as_utf8_str().to_string());
if let Some(href) = href_attr {
// Whitespace-normalized, trimmed text content of the whole element; used for
// autolink detection, as the `raw_text` argument on the heading path, and as
// a label fallback.
let raw_text =
crate::text::normalize_whitespace(&get_text_content(node_handle, parser, dom_ctx))
.trim()
.to_string();
// Already inside a link: emit only the children so no nested link syntax is
// produced.
if ctx.in_link {
let children = tag.children();
for child_handle in children.top().iter() {
walk_node(
child_handle,
parser,
output,
options,
ctx,
depth + 1,
dom_ctx,
);
}
return;
}
// `href[7..]` skips the 7-byte `mailto:` prefix; the slice is only evaluated
// after `starts_with("mailto:")` succeeded, so the index is in bounds.
let is_autolink = options.autolinks
&& !options.default_title
&& !href.is_empty()
&& has_uri_scheme(href.as_str())
&& (raw_text == href || (href.starts_with("mailto:") && raw_text == href[7..]));
if is_autolink {
output.push('<');
// For `mailto:` links whose text is the bare address, the text (without
// the scheme) is what gets written between the angle brackets.
if href.starts_with("mailto:") && raw_text == href[7..] {
output.push_str(&raw_text);
} else {
output.push_str(&href);
}
output.push('>');
return;
}
// Heading path: render the heading's content into a scratch buffer, wrap it in
// a link, and let `push_heading` write the heading with that link as its text.
if let Some((heading_level, heading_handle)) =
find_single_heading_child(*node_handle, parser)
&& let Some(heading_node) = heading_handle.get(parser)
&& let tl::Node::Tag(heading_tag) = heading_node
{
let heading_name = normalized_tag_name(heading_tag.name().as_utf8_str()).into_owned();
let mut heading_text = String::new();
let heading_ctx = Context {
in_heading: true,
convert_as_inline: true,
heading_allow_inline_images: heading_allows_inline_images(
&heading_name,
&ctx.keep_inline_images_in,
),
..ctx.clone()
};
walk_node(
&heading_handle,
parser,
&mut heading_text,
options,
&heading_ctx,
depth + 1,
dom_ctx,
);
let trimmed_heading = heading_text.trim();
// An empty heading falls through to the general label-building path below.
if !trimmed_heading.is_empty() {
let escaped_label = escape_link_label(trimmed_heading);
let mut link_buffer = String::new();
append_markdown_link(
&mut link_buffer,
&escaped_label,
href.as_str(),
title.as_deref(),
raw_text.as_str(),
options,
ctx.reference_collector.as_ref(),
);
push_heading(output, ctx, options, heading_level, link_buffer.as_str());
return;
}
}
let children: Vec<_> = tag.children().top().iter().copied().collect();
// NOTE(review): `saw_block` presumably reports that block-level content was
// found among the children — confirm against `collect_link_label_text`.
let (inline_label, _block_nodes, saw_block) =
collect_link_label_text(&children, parser, dom_ctx);
let mut label = if saw_block {
let mut content = String::new();
// Children are rendered with `convert_as_inline` forced on and `in_link` set.
let link_ctx = Context {
inline_depth: ctx.inline_depth + 1,
convert_as_inline: true,
in_link: true,
..ctx.clone()
};
for child_handle in &children {
// Each child gets its own buffer so the boundary between two children can
// be inspected before the pieces are joined.
let mut child_buf = String::new();
walk_node(
child_handle,
parser,
&mut child_buf,
options,
&link_ctx,
depth + 1,
dom_ctx,
);
// Insert a single separating space only when the child produced non-blank
// text, something has been collected already, and neither side of the
// boundary already has whitespace.
if !child_buf.trim().is_empty()
&& !content.is_empty()
&& !content.chars().last().is_none_or(char::is_whitespace)
&& !child_buf.chars().next().is_none_or(char::is_whitespace)
{
content.push(' ');
}
content.push_str(&child_buf);
}
// Fall back to the label text reported by `collect_link_label_text` when
// walking the children produced only whitespace.
if content.trim().is_empty() {
normalize_link_label(&inline_label)
} else {
normalize_link_label(&content)
}
} else {
let mut content = String::new();
// Same as above except that `convert_as_inline` is inherited from `ctx`.
let link_ctx = Context {
inline_depth: ctx.inline_depth + 1,
in_link: true,
..ctx.clone()
};
for child_handle in &children {
walk_node(
child_handle,
parser,
&mut content,
options,
&link_ctx,
depth + 1,
dom_ctx,
);
}
normalize_link_label(&content)
};
// Label fallbacks, applied in order while the label is still empty:
// the element's full text content (only when `saw_block`), then `raw_text`,
// then the href itself (only when the element has at least one child).
if label.is_empty() && saw_block {
let fallback =
crate::text::normalize_whitespace(&get_text_content(node_handle, parser, dom_ctx));
label = normalize_link_label(&fallback);
}
if label.is_empty() && !raw_text.is_empty() {
label = normalize_link_label(&raw_text);
}
if label.is_empty() && !href.is_empty() && !children.is_empty() {
label = href.clone();
}
let escaped_label = escape_link_label(&label);
// With the `visitor` feature, a registered visitor decides what is emitted;
// `None` means nothing is written for this link.
#[cfg(feature = "visitor")]
let link_output = if let Some(ref visitor_handle) = ctx.visitor {
use crate::visitor::{NodeContext, NodeType, VisitResult};
let attributes = collect_tag_attributes(tag);
let node_id = node_handle.get_inner();
let parent_tag = dom_ctx.parent_tag_name(node_id, parser);
let index_in_parent = dom_ctx.get_sibling_index(node_id).unwrap_or(0);
let node_ctx = NodeContext {
node_type: NodeType::Link,
tag_name: "a".into(),
attributes,
depth,
index_in_parent,
parent_tag,
is_inline: true,
};
// The lock guard lives only inside this block, so it is released before the
// result is acted on. The visitor receives the unescaped `label`.
let visit_result = {
let mut visitor = visitor_handle.lock().expect("visitor mutex poisoned");
visitor.visit_link(&node_ctx, &href, &label, title.as_deref())
};
match visit_result {
VisitResult::Continue => {
let mut buf = String::new();
append_markdown_link(
&mut buf,
&escaped_label,
href.as_str(),
title.as_deref(),
label.as_str(),
options,
ctx.reference_collector.as_ref(),
);
Some(buf)
}
VisitResult::Custom(custom) => Some(custom),
VisitResult::Skip => None,
VisitResult::Error(err) => {
// Only the first visitor error is kept; later ones are dropped.
if ctx.visitor_error.borrow().is_none() {
*ctx.visitor_error.borrow_mut() = Some(err);
}
None
}
VisitResult::PreserveHtml => Some(serialize_node(node_handle, parser)),
}
} else {
let mut buf = String::new();
append_markdown_link(
&mut buf,
&escaped_label,
href.as_str(),
title.as_deref(),
label.as_str(),
options,
ctx.reference_collector.as_ref(),
);
Some(buf)
};
// Without the `visitor` feature the default rendering is always used.
#[cfg(not(feature = "visitor"))]
let link_output = {
let mut buf = String::new();
append_markdown_link(
&mut buf,
&escaped_label,
href.as_str(),
title.as_deref(),
label.as_str(),
options,
ctx.reference_collector.as_ref(),
);
Some(buf)
};
if let Some(link_text) = link_output {
output.push_str(&link_text);
}
// Metadata is recorded regardless of what `link_output` turned out to be.
#[cfg(feature = "metadata")]
if ctx.metadata_wants_links
&& let Some(ref collector) = ctx.metadata_collector
{
let rel_attr = tag
.attributes()
.get("rel")
.flatten()
.map(|v| v.as_utf8_str().to_string());
// Every attribute except `href` is copied; attributes without a value are
// stored with an empty string.
let mut attributes_map = BTreeMap::new();
for (key, value_opt) in tag.attributes().iter() {
let key_str = key.to_string();
if key_str == "href" {
continue;
}
let value = value_opt.map(|v| v.to_string()).unwrap_or_default();
attributes_map.insert(key_str, value);
}
collector.borrow_mut().add_link(
href.clone(),
label,
title.clone(),
rel_attr,
attributes_map,
);
}
} else {
// No usable href: render the children as if the element were transparent.
let children = tag.children();
for child_handle in children.top().iter() {
walk_node(
child_handle,
parser,
output,
options,
ctx,
depth + 1,
dom_ctx,
);
}
}
}
/// Reports whether `href` begins with a URI scheme followed by `:`.
///
/// The accepted shape mirrors the RFC 3986 scheme grammar: one ASCII letter,
/// then any number of ASCII letters, digits, `+`, `-` or `.`, terminated by the
/// first `:` in the string. Strings without a colon, with an empty prefix, or
/// with any other byte before the first colon are rejected.
#[must_use]
pub fn has_uri_scheme(href: &str) -> bool {
    // Everything before the first colon is the scheme candidate.
    let Some((candidate, _)) = href.split_once(':') else {
        return false;
    };
    let mut scheme_bytes = candidate.bytes();
    let starts_with_letter = scheme_bytes
        .next()
        .is_some_and(|first| first.is_ascii_alphabetic());
    starts_with_letter
        && scheme_bytes
            .all(|byte| byte.is_ascii_alphanumeric() || matches!(byte, b'+' | b'-' | b'.'))
}
/// Percent-encodes the bytes of `url` that are unsafe in an inline Markdown
/// link destination, while leaving the URL's own structure intact.
///
/// Passed through unchanged:
/// - RFC 3986 unreserved characters (`A-Z a-z 0-9 - _ . ~`) and `/`;
/// - RFC 3986 delimiters that carry URL structure and are harmless in a
///   Markdown destination: `: ? # [ ] @ ! $ & ' * + , ; =`;
/// - a `%` that already starts a valid `%HH` escape, so pre-encoded URLs are
///   not double-encoded.
///
/// Everything else is emitted as an uppercase `%HH` triplet per UTF-8 byte.
/// That includes spaces, parentheses, angle brackets, quotes, backslashes,
/// control bytes, non-ASCII bytes and a stray `%`.
///
/// Returns the encoded URL as a new `String`.
#[must_use]
pub fn percent_encode_url(url: &str) -> String {
    const HEX: &[u8; 16] = b"0123456789ABCDEF";

    let bytes = url.as_bytes();
    let mut encoded = String::with_capacity(url.len() * 2);
    for (idx, &byte) in bytes.iter().enumerate() {
        let keep = match byte {
            // Unreserved characters plus the path separator.
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' | b'/' => true,
            // Reserved delimiters: encoding these would change the URL's meaning
            // (e.g. `https:` would stop being recognised as a scheme). Parentheses
            // are deliberately absent because they delimit the Markdown destination.
            b':' | b'?' | b'#' | b'[' | b']' | b'@' | b'!' | b'$' | b'&' | b'\'' | b'*'
            | b'+' | b',' | b';' | b'=' => true,
            // Keep an existing escape as-is; a lone `%` is encoded as `%25`.
            b'%' => matches!(
                bytes.get(idx + 1..idx + 3),
                Some([hi, lo]) if hi.is_ascii_hexdigit() && lo.is_ascii_hexdigit()
            ),
            _ => false,
        };
        if keep {
            encoded.push(char::from(byte));
        } else {
            encoded.push('%');
            encoded.push(char::from(HEX[usize::from(byte >> 4)]));
            encoded.push(char::from(HEX[usize::from(byte & 0x0f)]));
        }
    }
    encoded
}
/// Appends one Markdown link for `href` with the given `label` to `output`.
///
/// `label` is written verbatim, so callers are expected to pass it already
/// escaped.
///
/// Reference form: when `options.link_style` is `Reference`, `href` is
/// non-empty and a `reference_collector` is supplied, the link is written as
/// `[label][id]`, where `id` is whatever the collector's `get_or_insert`
/// returns for `(href, title)`. No destination or title is written inline in
/// that case.
///
/// Inline form (`[label](destination "title")`) otherwise. The destination is:
/// - `<>` for an empty `href`;
/// - the result of [`percent_encode_url`] when `options.url_escape_style` is
///   `Percent`;
/// - `<href>` when `href` contains a space or a newline;
/// - `href` unchanged when its `(` and `)` counts match, otherwise `href` with
///   every parenthesis backslash-escaped.
///
/// The quoted title is `title` when present; otherwise `href` itself when
/// `options.default_title` is set and `raw_text == href`. Double quotes inside
/// the title are backslash-escaped.
pub fn append_markdown_link(
    output: &mut String,
    label: &str,
    href: &str,
    title: Option<&str>,
    raw_text: &str,
    options: &ConversionOptions,
    reference_collector: Option<&crate::converter::reference_collector::ReferenceCollectorHandle>,
) {
    output.push('[');
    output.push_str(label);

    // Reference-style links end here; everything below is the inline form.
    match reference_collector {
        Some(collector)
            if options.link_style == crate::options::validation::LinkStyle::Reference
                && !href.is_empty() =>
        {
            let ref_num = collector.borrow_mut().get_or_insert(href, title);
            output.push_str("][");
            output.push_str(&ref_num.to_string());
            output.push(']');
            return;
        }
        _ => {}
    }

    output.push_str("](");

    // Destination.
    if href.is_empty() {
        output.push_str("<>");
    } else if options.url_escape_style == crate::options::validation::UrlEscapeStyle::Percent {
        output.push_str(&percent_encode_url(href));
    } else if href.contains([' ', '\n']) {
        output.push('<');
        output.push_str(href);
        output.push('>');
    } else {
        let (opening, closing) =
            href.chars()
                .fold((0usize, 0usize), |(open, close), ch| match ch {
                    '(' => (open + 1, close),
                    ')' => (open, close + 1),
                    _ => (open, close),
                });
        if opening == closing {
            output.push_str(href);
        } else {
            // Unbalanced parentheses would end the destination early, so each
            // one is escaped.
            for ch in href.chars() {
                if matches!(ch, '(' | ')') {
                    output.push('\\');
                }
                output.push(ch);
            }
        }
    }

    // Optional title: the explicit one wins, then the href as default title.
    let quoted_title = match title {
        Some(explicit) => Some(explicit),
        None if options.default_title && raw_text == href => Some(href),
        None => None,
    };
    if let Some(text) = quoted_title {
        output.push_str(" \"");
        for ch in text.chars() {
            if ch == '"' {
                output.push('\\');
            }
            output.push(ch);
        }
        output.push('"');
    }

    output.push(')');
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::options::validation::UrlEscapeStyle;

    /// Builds options that set only the URL escape style on the builder.
    fn opts_with_style(style: UrlEscapeStyle) -> ConversionOptions {
        ConversionOptions::builder().url_escape_style(style).build()
    }

    /// Renders a single link with `append_markdown_link`, using the label as
    /// the raw text and no reference collector.
    fn render_link(
        style: UrlEscapeStyle,
        label: &str,
        href: &str,
        title: Option<&str>,
    ) -> String {
        let options = opts_with_style(style);
        let mut rendered = String::new();
        append_markdown_link(&mut rendered, label, href, title, label, &options, None);
        rendered
    }

    #[test]
    fn has_uri_scheme_accepts_http() {
        for url in ["http://example.com", "https://example.com/path"] {
            assert!(has_uri_scheme(url), "{url} should have a scheme");
        }
    }

    #[test]
    fn has_uri_scheme_accepts_mailto() {
        assert!(has_uri_scheme("mailto:a@b.com"));
    }

    #[test]
    fn has_uri_scheme_accepts_uncommon_schemes() {
        for url in [
            "ftp://host",
            "ssh://host",
            "data:text/plain,foo",
            "file:///etc/hosts",
        ] {
            assert!(has_uri_scheme(url), "{url} should have a scheme");
        }
    }

    #[test]
    fn has_uri_scheme_rejects_bare_paths() {
        for path in ["foobar.png", "/relative/path", "../up.html", "#fragment"] {
            assert!(!has_uri_scheme(path), "{path} should not have a scheme");
        }
    }

    #[test]
    fn has_uri_scheme_rejects_leading_digit_or_punct() {
        for input in ["9scheme:foo", ":no-scheme", ""] {
            assert!(!has_uri_scheme(input), "{input:?} should not have a scheme");
        }
    }

    #[test]
    fn issue_397_filename_with_extension_is_not_autolinked() {
        assert!(!has_uri_scheme("foobar.png"));
    }

    #[test]
    fn percent_encode_url_leaves_unreserved_chars_unchanged() {
        let input = "/path-to_file.html~";
        assert_eq!(percent_encode_url(input), input);
    }

    #[test]
    fn percent_encode_url_encodes_spaces() {
        let encoded = percent_encode_url("/file (1).pdf");
        assert_eq!(encoded, "/file%20%281%29.pdf");
    }

    #[test]
    fn percent_encode_url_encodes_angle_brackets() {
        let encoded = percent_encode_url("/file <draft>.pdf");
        assert_eq!(encoded, "/file%20%3Cdraft%3E.pdf");
    }

    #[test]
    fn percent_encode_url_full_issue_example() {
        let encoded = percent_encode_url("/file (1) <draft>.pdf");
        assert_eq!(encoded, "/file%20%281%29%20%3Cdraft%3E.pdf");
    }

    #[test]
    fn append_markdown_link_angle_plain_url_unchanged() {
        let out = render_link(UrlEscapeStyle::Angle, "text", "/file.pdf", None);
        assert_eq!(out, "[text](/file.pdf)");
    }

    #[test]
    fn append_markdown_link_angle_wraps_space_in_angle_brackets() {
        let out = render_link(UrlEscapeStyle::Angle, "file", "/file (1).pdf", None);
        assert_eq!(out, "[file](</file (1).pdf>)");
    }

    #[test]
    fn append_markdown_link_percent_encodes_spaces() {
        let out = render_link(UrlEscapeStyle::Percent, "file", "/file (1).pdf", None);
        assert_eq!(out, "[file](/file%20%281%29.pdf)");
    }

    #[test]
    fn append_markdown_link_percent_encodes_angle_brackets() {
        let out = render_link(UrlEscapeStyle::Percent, "file", "/file <draft>.pdf", None);
        assert_eq!(out, "[file](/file%20%3Cdraft%3E.pdf)");
    }

    #[test]
    fn append_markdown_link_percent_full_issue_example() {
        let out = render_link(
            UrlEscapeStyle::Percent,
            "file",
            "/file (1) <draft>.pdf",
            None,
        );
        assert_eq!(out, "[file](/file%20%281%29%20%3Cdraft%3E.pdf)");
    }

    #[test]
    fn append_markdown_link_percent_preserves_title() {
        let out = render_link(
            UrlEscapeStyle::Percent,
            "link",
            "/path with spaces",
            Some("My Title"),
        );
        assert_eq!(out, "[link](/path%20with%20spaces \"My Title\")");
    }
}