use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use ego_tree::NodeId;
use scraper::node::Node;
use scraper::{ElementRef, Html};
use crate::context::Context;
use crate::escape;
use crate::options::Options;
use crate::whitespace;
/// Decision returned by [`Rule::apply`] telling the converter what to do with an element.
///
/// By the time a rule runs, the element's children have already been rendered into the
/// output buffer; each variant describes what happens to that rendered text.
#[derive(Debug)]
#[non_exhaustive]
pub enum Action {
/// Discard the text rendered for the element's children and emit this string instead.
Replace(String),
/// Decline to handle the element. The converter tries the next rule registered for the
/// tag; if every rule skips, the rendered children are left in the output unchanged.
Skip,
/// Discard the text rendered for the element's children and emit nothing in its place.
Remove,
}
/// A conversion rule attached to one or more HTML tag names.
///
/// Rules are stored as shared `Arc<dyn Rule>` values, one registration per tag returned by
/// [`Rule::tags`]. When several rules are registered for the same tag, the converter tries
/// them from the most recently added to the oldest and uses the first result that is not
/// [`Action::Skip`].
pub trait Rule: Send + Sync {
/// Tag names this rule is registered under; compared against the element's tag name.
fn tags(&self) -> &'static [&'static str];
/// Decides what to emit for `element`.
///
/// `content` is the output already rendered for the element's children, `element` is the
/// element being converted, and `ctx` is the mutable conversion state shared across the
/// whole document.
fn apply(&self, content: &str, element: &ElementRef<'_>, ctx: &mut Context<'_>) -> Action;
}
/// An extension that configures a [`ConverterBuilder`] when passed to
/// [`ConverterBuilder::use_plugin`].
pub trait Plugin {
/// Called by `use_plugin` with the builder under construction; the plugin mutates it in
/// place (for example by calling `add_rule`).
fn register(&self, builder: &mut ConverterBuilder);
}
/// Builder that accumulates configuration and produces a [`Converter`] via `build`.
#[derive(Default)]
pub struct ConverterBuilder {
// Conversion options, handed unchanged to the built converter.
options: Options,
// Registered rules grouped by tag name; each bucket is in registration order.
rules: HashMap<&'static str, Vec<Arc<dyn Rule>>>,
// Tag names whose elements are emitted as raw HTML instead of being converted.
keep_tags: HashSet<String>,
// Tag names whose elements are dropped together with everything inside them.
remove_tags: HashSet<String>,
// Optional domain forwarded to the conversion context.
// NOTE(review): presumably used to resolve relative URLs — confirm in `Context`.
domain: Option<String>,
}
impl std::fmt::Debug for ConverterBuilder {
    /// Writes a diagnostic view of the builder.
    ///
    /// `Rule` has no `Debug` bound, so the rule table is summarised as `rule_count`: the total
    /// number of registrations across all tags (a rule registered for several tags is counted
    /// once per tag).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let rule_count: usize = self.rules.values().map(|bucket| bucket.len()).sum();

        let mut view = f.debug_struct("ConverterBuilder");
        view.field("options", &self.options);
        view.field("rule_count", &rule_count);
        view.field("keep_tags", &self.keep_tags);
        view.field("remove_tags", &self.remove_tags);
        view.field("domain", &self.domain);
        view.finish()
    }
}
impl ConverterBuilder {
    /// Creates a builder with default options, no rules, and no kept or removed tags.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Replaces the conversion options wholesale and returns the builder for chaining.
    #[must_use]
    pub const fn options(mut self, opts: Options) -> Self {
        self.options = opts;
        self
    }

    /// Registers `rule` under every tag name it reports via [`Rule::tags`].
    ///
    /// The rule is wrapped in a single `Arc` that all of its tag buckets share. Rules added
    /// later for the same tag are consulted first during conversion.
    pub fn add_rule(&mut self, rule: impl Rule + 'static) {
        let shared: Arc<dyn Rule> = Arc::new(rule);
        shared.tags().iter().copied().for_each(|tag| {
            self.rules.entry(tag).or_default().push(Arc::clone(&shared));
        });
    }

    /// Lets `plugin` configure this builder, then returns the builder for chaining.
    #[must_use]
    pub fn use_plugin(mut self, plugin: &impl Plugin) -> Self {
        plugin.register(&mut self);
        self
    }

    /// Adds `tags` to the set of elements that are emitted as raw HTML.
    #[must_use]
    pub fn keep(mut self, tags: &[&str]) -> Self {
        for tag in tags {
            self.keep_tags.insert((*tag).to_owned());
        }
        self
    }

    /// Adds `tags` to the set of elements that are dropped along with their content.
    #[must_use]
    pub fn remove(mut self, tags: &[&str]) -> Self {
        for tag in tags {
            self.remove_tags.insert((*tag).to_owned());
        }
        self
    }

    /// Sets the domain handed to the conversion context, replacing any previous value.
    #[must_use]
    pub fn domain(mut self, domain: impl Into<String>) -> Self {
        let owned: String = domain.into();
        self.domain = Some(owned);
        self
    }

    /// Consumes the builder and produces a [`Converter`] carrying the same configuration.
    #[must_use]
    pub fn build(self) -> Converter {
        let Self {
            options,
            rules,
            keep_tags,
            remove_tags,
            domain,
        } = self;
        Converter {
            options,
            rules,
            keep_tags,
            remove_tags,
            domain,
        }
    }
}
/// Immutable HTML-to-Markdown converter produced by [`ConverterBuilder::build`].
///
/// Cloning copies the tag sets and domain but shares the rules, which are held behind `Arc`.
#[derive(Clone)]
pub struct Converter {
// Conversion options passed to every per-document context.
options: Options,
// Registered rules grouped by tag name; each bucket is in registration order.
rules: HashMap<&'static str, Vec<Arc<dyn Rule>>>,
// Tag names whose elements are emitted as raw HTML instead of being converted.
keep_tags: HashSet<String>,
// Tag names whose elements are dropped together with everything inside them.
remove_tags: HashSet<String>,
// Optional domain forwarded to the conversion context.
// NOTE(review): presumably used to resolve relative URLs — confirm in `Context`.
domain: Option<String>,
}
impl std::fmt::Debug for Converter {
    /// Writes a diagnostic view of the converter.
    ///
    /// `Rule` has no `Debug` bound, so the rule table is summarised as `rule_count`: the total
    /// number of registrations across all tags (a rule registered for several tags is counted
    /// once per tag).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let rule_count: usize = self.rules.values().map(|bucket| bucket.len()).sum();

        let mut view = f.debug_struct("Converter");
        view.field("options", &self.options);
        view.field("rule_count", &rule_count);
        view.field("keep_tags", &self.keep_tags);
        view.field("remove_tags", &self.remove_tags);
        view.field("domain", &self.domain);
        view.finish()
    }
}
// Compile-time guarantee that `Converter` is `Send + Sync`: the build fails here if a field
// ever stops satisfying either bound.
const _: () = {
// Has no body; it exists only so that instantiating it forces the bound check on `T`.
const fn assert_send_sync<T: Send + Sync>() {}
assert_send_sync::<Converter>();
};
impl Converter {
    /// Returns a fresh [`ConverterBuilder`] with default settings.
    #[must_use]
    pub fn builder() -> ConverterBuilder {
        ConverterBuilder::new()
    }

    /// Converts an HTML document to Markdown.
    ///
    /// `html` is parsed as a full document. A per-call [`Context`] is created from the
    /// converter's options and domain, lists are annotated up front, and the tree is rendered
    /// starting at the root element. If the context collected any references they are appended
    /// after a blank line. The result is passed through `whitespace::clean_output` before being
    /// returned.
    #[must_use]
    pub fn convert(&self, html: &str) -> String {
        let document = Html::parse_document(html);
        let root_id = document.root_element().id();

        let mut ctx = Context::new(self.options, self.domain.as_deref());
        ctx.annotate_lists(root_id, &document);

        // The input length is only a starting capacity; the buffer grows as needed.
        let mut output = String::with_capacity(html.len());
        self.write_node(root_id, &document, &mut ctx, &mut output);

        if ctx.has_references() {
            output.push_str("\n\n");
            output.push_str(&ctx.take_references());
        }
        whitespace::clean_output(&output)
    }

    /// Reads all of `reader` into memory and converts it with [`Converter::convert`].
    ///
    /// # Errors
    ///
    /// Returns the error from `read_to_string` (an I/O failure, or input that is not valid
    /// UTF-8) converted into the crate's error type.
    pub fn convert_reader(&self, mut reader: impl std::io::Read) -> crate::Result<String> {
        let mut html = String::new();
        reader.read_to_string(&mut html)?;
        Ok(self.convert(&html))
    }

    /// Renders the node identified by `node_id` into `buf`.
    ///
    /// Text nodes go to `write_text`, elements to `write_element`, and a document node is
    /// rendered by recursing into its children. Every other node kind produces no output, as
    /// does an id that is not present in `document`.
    fn write_node(
        &self,
        node_id: NodeId,
        document: &Html,
        ctx: &mut Context<'_>,
        buf: &mut String,
    ) {
        let Some(node_ref) = document.tree.get(node_id) else {
            return;
        };
        match node_ref.value() {
            Node::Text(text) => Self::write_text(text, ctx, buf),
            Node::Element(_) => {
                if let Some(element_ref) = ElementRef::wrap(node_ref) {
                    self.write_element(&element_ref, document, ctx, buf);
                }
            }
            Node::Document => {
                for child in node_ref.children() {
                    self.write_node(child.id(), document, ctx, buf);
                }
            }
            _ => {}
        }
    }

    /// Appends a text node to `buf`.
    ///
    /// While the context is in preformatted mode the text is copied verbatim. Otherwise its
    /// whitespace is collapsed and the result is escaped according to the options' escape mode.
    fn write_text(text: &scraper::node::Text, ctx: &Context<'_>, buf: &mut String) {
        let raw: &str = text;
        if ctx.in_pre() {
            buf.push_str(raw);
            return;
        }
        let collapsed = whitespace::collapse_whitespace(raw);
        let escaped = escape::escape_markdown(&collapsed, ctx.options().escape_mode());
        buf.push_str(&escaped);
    }

    /// Renders `element` into `buf`.
    ///
    /// Order of decisions:
    /// 1. Tags in `remove_tags`, plus `script`, `style`, `noscript` and `head`, emit nothing.
    /// 2. Tags in `keep_tags` are emitted as their original HTML without visiting children.
    /// 3. Otherwise the children are rendered first (with preformatted mode switched on inside
    ///    `pre`, `code`, `kbd`, `samp` and `tt`), then the rules registered for the tag are
    ///    tried from newest to oldest. The first non-`Skip` [`Action`] decides whether the
    ///    rendered children are replaced, removed, or (if every rule skips) kept as they are.
    fn write_element(
        &self,
        element: &ElementRef<'_>,
        document: &Html,
        ctx: &mut Context<'_>,
        buf: &mut String,
    ) {
        let tag = element.value().name();
        if self.remove_tags.contains(tag) || matches!(tag, "script" | "style" | "noscript" | "head")
        {
            return;
        }

        // Resolve the node before touching any conversion state, so that bailing out here can
        // never leave the preformatted flag switched on.
        let Some(node_ref) = document.tree.get(element.id()) else {
            return;
        };

        // Kept elements are emitted verbatim. Deciding this before descending avoids rendering
        // a subtree whose output would be thrown away, and keeps child rules from mutating
        // `ctx` for content that never reaches the Markdown output.
        if self.keep_tags.contains(tag) {
            buf.push_str(&element.html());
            return;
        }

        let was_in_pre = ctx.in_pre();
        if matches!(tag, "pre" | "code" | "kbd" | "samp" | "tt") {
            ctx.set_in_pre(true);
        }
        // Everything from this offset onward is the output of this element's children.
        let child_start = buf.len();
        for child in node_ref.children() {
            self.write_node(child.id(), document, ctx, buf);
        }
        ctx.set_in_pre(was_in_pre);

        let Some(rules) = self.rules.get(tag) else {
            return;
        };

        let action = {
            // Borrow the children's output instead of copying it: the returned `Action` owns
            // its data, so this borrow ends before `buf` is modified below.
            let content = &buf[child_start..];
            rules
                .iter()
                .rev()
                .find_map(|rule| match rule.apply(content, element, ctx) {
                    Action::Skip => None,
                    decided => Some(decided),
                })
        };

        match action {
            Some(Action::Replace(md)) => {
                buf.truncate(child_start);
                buf.push_str(&md);
            }
            Some(Action::Remove) => buf.truncate(child_start),
            // `Skip` is filtered out above; `None` means every rule declined.
            Some(Action::Skip) | None => {}
        }
    }
}