use crate::html2xml::convert_html_to_xml;
use aho_corasick::AhoCorasick;
use html2md;
use phf::phf_set;
use regex::Regex;
use serde::{Deserialize, Deserializer};
use spider::auto_encoder::is_binary_file;
use spider::lazy_static::lazy_static;
use spider::page::Page;
use spider::url::Url;
use spider::utils::clean_html;
// Lazily-initialised, process-wide matchers and constants shared by the helpers in this file.
lazy_static! {
// Multi-pattern matcher for newline runs; used by `aho_clean_markdown` for short inputs.
static ref AHO: AhoCorasick = AhoCorasick::new(["\n\n\n", "\n \n ", "\n\n\n\n\n"]).unwrap();
// Replacement strings passed alongside `AHO`; every entry is "\n\n", one per pattern.
static ref AHO_REPLACEMENTS: [&'static str; 3] = [
"\n\n", "\n\n", "\n\n", ];
// Whitespace matcher used by `aho_clean_markdown` for longer inputs. Alternatives, in order:
// leading spaces/tabs on a line, trailing spaces/tabs on a line, any other run of
// spaces/tabs, and a gap containing at least two newlines with optional surrounding whitespace.
static ref CLEAN_MARKDOWN_REGEX: Regex = {
Regex::new(
r"(?m)^[ \t]+|[ \t]+$|[ \t]+|\s*\n\s*\n\s*"
).unwrap()
};
// Placeholder URL used by the transform functions when no page URL is available.
static ref EXAMPLE_URL: Url = Url::parse("https://example.net").expect("invalid url");
}
/// The output format a transform call should produce.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ReturnFormat {
    /// The (optionally cleaned) HTML as-is. This is the default.
    #[default]
    Raw,
    /// Handled the same way as [`ReturnFormat::Raw`] by the string-returning transforms.
    Bytes,
    /// Plain text extracted from the HTML.
    Text,
    /// Text rendered through the `html2text` module.
    Html2Text,
    /// Base64 (URL-safe) encoded screenshot bytes, when available.
    Screenshot,
    /// Markdown produced by `html2md` with the CommonMark flag off.
    Markdown,
    /// Markdown produced by `html2md` with the CommonMark flag on.
    CommonMark,
    /// XML produced by `convert_html_to_xml`.
    XML,
    /// Always an empty string.
    Empty,
}

impl ReturnFormat {
    /// Maps a format name to its [`ReturnFormat`].
    ///
    /// Only the exact spellings listed below are recognised; anything else
    /// (including other capitalisations) falls back to [`ReturnFormat::Raw`].
    pub fn from_str(s: &str) -> ReturnFormat {
        // Each format paired with every spelling that selects it.
        const ALIASES: &[(ReturnFormat, &[&str])] = &[
            (ReturnFormat::Text, &["text", "Text", "TEXT"]),
            (
                ReturnFormat::Html2Text,
                &[
                    "html2text",
                    "Html2text",
                    "HTML2TEXT",
                    "html_2_text",
                    "HTML_2_TEXT",
                ],
            ),
            (ReturnFormat::Markdown, &["markdown", "Markdown", "MARKDOWN"]),
            (ReturnFormat::Raw, &["raw", "RAW", "Raw"]),
            (ReturnFormat::Bytes, &["bytes", "Bytes", "BYTES"]),
            (
                ReturnFormat::CommonMark,
                &["commonmark", "CommonMark", "COMMONMARK"],
            ),
            (ReturnFormat::XML, &["xml", "XML", "XmL", "Xml"]),
            (
                ReturnFormat::Screenshot,
                &[
                    "screenshot",
                    "screenshots",
                    "SCREENSHOT",
                    "SCREENSHOTS",
                    "Screenshot",
                    "Screenshots",
                ],
            ),
            (ReturnFormat::Empty, &["empty", "Empty", "EMPTY"]),
        ];

        ALIASES
            .iter()
            .find(|(_, names)| names.contains(&s))
            .map_or(ReturnFormat::Raw, |(format, _)| *format)
    }
}
/// Deserializes a [`ReturnFormat`] from its string name.
///
/// The name is resolved with [`ReturnFormat::from_str`], so unrecognised
/// names deserialize to [`ReturnFormat::Raw`] rather than producing an error.
impl<'de> Deserialize<'de> for ReturnFormat {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        String::deserialize(deserializer).map(|name| ReturnFormat::from_str(&name))
    }
}
/// Switches controlling how page content is cleaned and which output format is produced.
#[derive(Debug, Default, Clone, Copy)]
pub struct TransformConfig {
/// Run the `llm_readability` extractor over the HTML before formatting.
pub readability: bool,
/// The output format to produce.
pub return_format: ReturnFormat,
/// Remove `img` and `picture` elements.
pub filter_images: bool,
/// Pass the HTML through `spider::utils::clean_html` before formatting.
pub clean_html: bool,
/// Remove `svg` elements.
pub filter_svg: bool,
/// Remove `nav`, the first `header`, `footer`, and non-first `body > aside` elements.
pub main_content: bool,
}
/// CSS selectors used to narrow the HTML before it is transformed.
#[derive(Debug, Default, Clone)]
pub struct SelectorConfiguration {
/// Selector whose first match becomes the new root of the content.
pub root_selector: Option<String>,
/// Selector whose matches are removed from the content.
pub exclude_selector: Option<String>,
}
/// Borrowed inputs for the byte-based transform entry points
/// (`transform_content_input` and `transform_content_send_from_url_and_bytes`).
pub struct TransformInput<'a> {
/// URL of the content, if known; a placeholder URL is used where one is required and this is `None`.
pub url: Option<&'a url::Url>,
/// Raw content bytes to transform.
pub content: &'a [u8],
/// Screenshot bytes, used only for `ReturnFormat::Screenshot`.
pub screenshot_bytes: Option<&'a [u8]>,
/// Encoding label used to decode `content`; auto-detection is used when `None`.
pub encoding: Option<&'a str>,
/// Optional root/exclude selectors applied after decoding.
pub selector_config: Option<&'a SelectorConfiguration>,
/// Extra selectors to remove from the HTML; also forwarded to the text/markdown converters.
pub ignore_tags: Option<&'a [&'a str]>,
}
// Byte markers that identify a payload as HTML; `is_html_content` searches for them
// (ASCII case-insensitively) in the first 1024 bytes of the content.
static HTML_TAGS: phf::Set<&'static [u8]> = phf_set! {
b"<!doctype html",
b"<html",
b"<document",
};
/// File extensions treated as HTML-producing pages. Every entry includes the leading dot.
pub static VALID_EXTENSIONS: phf::Set<&'static str> = phf_set! {
".html",
".htm",
".shtml",
".asp",
".aspx",
".php",
// NOTE(review): ".jps" / ".jpsx" look like typos of ".jsp" / ".jspx" — confirm before changing.
".jps",
".jpsx",
".jsp",
".cfm",
".xhtml",
".rhtml",
".phtml",
".erb",
};
/// Returns `true` when the content looks like HTML.
///
/// Two checks are made, in order:
/// 1. The first 1024 bytes of `bytes` are searched (ASCII case-insensitively)
///    for any marker in `HTML_TAGS`.
/// 2. Otherwise the extension of the last path segment of `url` is looked up
///    in [`VALID_EXTENSIONS`].
///
/// The extension is compared *with* its leading dot and lowercased, because
/// every entry in [`VALID_EXTENSIONS`] is stored as `".ext"`. Comparing the
/// bare extension (without the dot) can never match.
pub fn is_html_content(bytes: &[u8], url: &Url) -> bool {
    // Only sniff the head of the payload; markers appear at the start of a document.
    let check_bytes = &bytes[..bytes.len().min(1024)];

    let has_html_marker = HTML_TAGS.iter().any(|tag| {
        check_bytes
            .windows(tag.len())
            .any(|window| window.eq_ignore_ascii_case(tag))
    });
    if has_html_marker {
        return true;
    }

    // Fall back to the file extension of the last URL path segment, dot included.
    url.path_segments()
        .and_then(|segments| segments.last())
        .and_then(|segment| segment.rfind('.').map(|dot| &segment[dot..]))
        .map_or(false, |extension| {
            VALID_EXTENSIONS.contains(extension.to_ascii_lowercase().as_str())
        })
}
pub fn aho_clean_markdown(html: &str) -> String {
if html.len() <= 40 {
match AHO.try_replace_all(html, &*AHO_REPLACEMENTS) {
Ok(r) => r,
_ => html.into(),
}
} else {
let cleaned_html = CLEAN_MARKDOWN_REGEX.replace_all(html, |caps: ®ex::Captures| {
let matched = match caps.get(0) {
Some(m) => m.as_str(),
_ => Default::default(),
};
if matched.contains('\n') && matched.chars().filter(|&c| c == '\n').count() >= 3 {
"\n\n"
} else if matched.contains('\n') {
"\n"
} else {
" "
}
});
cleaned_html.into()
}
}
pub fn clean_html_elements(html: &str, tags: Vec<&str>) -> String {
use lol_html::{element, rewrite_str, RewriteStrSettings};
match rewrite_str(
html,
RewriteStrSettings {
element_content_handlers: tags
.iter()
.map(|tag| {
element!(tag, |el| {
el.remove();
Ok(())
})
})
.collect::<Vec<_>>(),
..RewriteStrSettings::default()
},
) {
Ok(r) => r,
_ => html.into(),
}
}
/// Builds the list of built-in CSS selectors to strip, based on the config flags.
///
/// Order is: image selectors, `svg`, then the page-chrome selectors.
pub(crate) fn build_static_vector(config: &TransformConfig) -> Vec<&'static str> {
    const IMAGE_SELECTORS: [&str; 2] = ["img", "picture"];
    const CHROME_SELECTORS: [&str; 4] = [
        "nav",
        "header:first-of-type",
        "footer",
        "body > aside:not(:first-of-type)",
    ];

    let mut selectors: Vec<&'static str> = Vec::new();

    if config.filter_images {
        selectors.extend_from_slice(&IMAGE_SELECTORS);
    }
    if config.filter_svg {
        selectors.push("svg");
    }
    if config.main_content {
        selectors.extend_from_slice(&CHROME_SELECTORS);
    }

    selectors
}
/// Copies the ignore list into an owned set (duplicates collapse).
fn build_ignore_set(ignore: &[String]) -> std::collections::HashSet<String> {
    let mut set = std::collections::HashSet::with_capacity(ignore.len());
    set.extend(ignore.iter().cloned());
    set
}
/// Converts a borrowed ignore list into an owned set (duplicates collapse).
fn build_ignore_set_from_strs(ignore: &[&str]) -> std::collections::HashSet<String> {
    let mut set = std::collections::HashSet::with_capacity(ignore.len());
    for name in ignore {
        set.insert(String::from(*name));
    }
    set
}
/// Converts `html` to markdown with `html2md`, using no custom ignore set and no base URL.
/// `commonmark` is forwarded to the converter unchanged.
pub fn transform_markdown(html: &str, commonmark: bool) -> String {
html2md::rewrite_html_custom_with_url(html, &None, commonmark, &None)
}
/// Async counterpart of `transform_markdown`: converts `html` to markdown with the streaming
/// `html2md` converter, using no custom ignore set and no base URL.
pub async fn transform_markdown_send(html: &str, commonmark: bool) -> String {
html2md::rewrite_html_custom_with_url_streaming(html, &None, commonmark, &None).await
}
/// Extracts plain text from `html` via `text_extract::extract_text` with the default
/// (empty) custom-ignore argument.
pub fn transform_text(html: &str) -> String {
super::text_extract::extract_text(html, &Default::default())
}
/// Extracts plain text from `html` via `text_extract::extract_text`, forwarding
/// `custom_ignore` as the set of additional names to ignore.
pub fn transform_text_ignore(
html: &str,
custom_ignore: &Option<std::collections::HashSet<String>>,
) -> String {
super::text_extract::extract_text(html, custom_ignore)
}
/// Returns the page HTML, decoded with `encoding` when one is supplied.
fn get_html(res: &Page, encoding: &Option<String>) -> String {
    if let Some(label) = encoding {
        res.get_html_encoded(label)
    } else {
        res.get_html()
    }
}
/// Returns the page's HTML bytes, borrowing them when they are held in memory.
///
/// With the `balance` feature enabled, a page whose HTML is on disk is loaded through
/// `Page::get_html` and returned as owned bytes. Otherwise the (empty) in-memory slice
/// is returned.
#[inline]
fn get_html_bytes_safe(page: &Page) -> std::borrow::Cow<'_, [u8]> {
let mem = page.get_html_bytes_u8();
// Fast path: bytes are already in memory, so no copy is needed.
if !mem.is_empty() {
return std::borrow::Cow::Borrowed(mem);
}
#[cfg(feature = "balance")]
if page.is_html_on_disk() {
return std::borrow::Cow::Owned(page.get_html().into_bytes());
}
// Nothing in memory (and nothing loaded from disk): return the empty slice.
std::borrow::Cow::Borrowed(mem)
}
/// Returns the page screenshot as URL-safe base64, or an empty string when there is none.
#[cfg(feature = "screenshot")]
fn get_screenshot(res: &Page) -> String {
    use base64::{engine::general_purpose, Engine as _};

    res.screenshot_bytes
        .as_ref()
        .map(|shot| general_purpose::URL_SAFE.encode(shot))
        .unwrap_or_default()
}
/// Screenshot support is compiled out without the `screenshot` feature: always returns an
/// empty string.
#[cfg(not(feature = "screenshot"))]
fn get_screenshot(_res: &Page) -> String {
Default::default()
}
/// Returns the page HTML (decoded with `encoding` when given), narrowed by
/// `selector_config` via `get_html_with_selector_impl`.
fn get_html_with_selector(
    res: &Page,
    encoding: &Option<String>,
    selector_config: &Option<SelectorConfiguration>,
) -> String {
    // The selector types are only needed by the impl function, so nothing is imported here.
    get_html_with_selector_impl(get_html(res, encoding), selector_config)
}
/// Applies the root/exclude selectors in `selector_config` to `html`.
///
/// * No config, or a config with neither selector set: `html` is returned
///   unchanged. It is not re-parsed, so the document is not rewritten as a
///   fragment; this matches `get_html_with_selector_bytes`.
/// * Root selector only: returns the HTML of its first match, if any.
/// * Exclude selector (with or without root): matching nodes are removed from
///   the root match if there is one, otherwise from the whole fragment, and
///   the fragment is serialised.
///
/// Selectors that fail to parse are ignored.
fn get_html_with_selector_impl(
    html: String,
    selector_config: &Option<SelectorConfiguration>,
) -> String {
    use scraper::{Html, Selector};

    let Some(cfg) = selector_config.as_ref() else {
        return html;
    };
    // Nothing to select or exclude: avoid a parse/serialise round trip that would alter the HTML.
    if cfg.root_selector.is_none() && cfg.exclude_selector.is_none() {
        return html;
    }

    let mut fragment = Html::parse_fragment(&html);

    if let Some(selector) = cfg.root_selector.as_ref() {
        if let Ok(parsed_selector) = Selector::parse(selector) {
            if let Some(root_node) = fragment.select(&parsed_selector).next() {
                if cfg.exclude_selector.is_some() {
                    // Re-root the working fragment so exclusion only sees the subtree.
                    fragment = Html::parse_fragment(&root_node.html());
                } else {
                    return root_node.html();
                }
            }
        }
    }

    if let Some(exclude_selector) = cfg.exclude_selector.as_ref() {
        if let Ok(exclude_sel) = Selector::parse(exclude_selector) {
            // Collect ids first: nodes cannot be removed while the selection borrows the tree.
            let mut elements_to_remove = vec![];
            for elem in fragment.root_element().select(&exclude_sel) {
                elements_to_remove.push(elem.id());
            }
            for id in elements_to_remove {
                fragment.remove_node(id);
            }
        }
    }

    fragment.root_element().html()
}
/// Decodes `content` to a string and applies the optional root/exclude selectors.
///
/// Decoding uses `encoding` when given, otherwise auto-detection. With no
/// config, or a config with neither selector set, the decoded HTML is returned
/// as-is. Selectors that fail to parse are ignored.
#[inline]
fn get_html_with_selector_bytes(
    content: &[u8],
    encoding: Option<&str>,
    selector_config: Option<&SelectorConfiguration>,
) -> String {
    use scraper::{Html, Selector};

    let decoded = if let Some(label) = encoding {
        auto_encoder::encode_bytes(content, label)
    } else {
        auto_encoder::auto_encode_bytes(content)
    };

    let cfg = match selector_config {
        Some(cfg) if cfg.root_selector.is_some() || cfg.exclude_selector.is_some() => cfg,
        _ => return decoded,
    };

    let mut fragment = Html::parse_fragment(&decoded);

    let parsed_root = cfg
        .root_selector
        .as_deref()
        .and_then(|raw| Selector::parse(raw).ok());
    if let Some(root_sel) = parsed_root {
        let root_html = fragment.select(&root_sel).next().map(|node| node.html());
        if let Some(root_html) = root_html {
            if cfg.exclude_selector.is_none() {
                return root_html;
            }
            // Re-root the working fragment so exclusion only sees the subtree.
            fragment = Html::parse_fragment(&root_html);
        }
    }

    let parsed_exclude = cfg
        .exclude_selector
        .as_deref()
        .and_then(|raw| Selector::parse(raw).ok());
    if let Some(exclude_sel) = parsed_exclude {
        // Gather ids up front; removal needs mutable access to the tree.
        let doomed: Vec<_> = fragment
            .root_element()
            .select(&exclude_sel)
            .map(|elem| elem.id())
            .collect();
        for id in doomed {
            fragment.remove_node(id);
        }
    }

    fragment.root_element().html()
}
/// Async variant of `get_html_with_selector_bytes` built on the streaming rewriter helpers.
///
/// Decodes `content`, then (when configured) extracts the root subtree and
/// removes excluded elements. Each step falls back to its input when the
/// helper returns `None`. The final bytes are converted to a string lossily
/// if they are not valid UTF-8.
async fn get_html_with_selector_bytes_async(
    content: &[u8],
    encoding: Option<&str>,
    selector_config: Option<&SelectorConfiguration>,
) -> String {
    const CHUNK_SIZE: usize = 8192;

    let decoded = if let Some(label) = encoding {
        auto_encoder::encode_bytes(content, label)
    } else {
        auto_encoder::auto_encode_bytes(content)
    };

    let cfg = match selector_config {
        Some(cfg) if cfg.root_selector.is_some() || cfg.exclude_selector.is_some() => cfg,
        _ => return decoded,
    };

    // Step 1: narrow to the root subtree, keeping the whole document on failure.
    let rooted: Vec<u8> = match cfg.root_selector.as_deref() {
        Some(root_sel) => {
            match extract_root_subtree_async(decoded.as_bytes(), root_sel, CHUNK_SIZE).await {
                Some(subtree) => subtree,
                None => decoded.into_bytes(),
            }
        }
        None => decoded.into_bytes(),
    };

    // Step 2: drop excluded elements, keeping step 1's output on failure.
    let pruned: Vec<u8> = match cfg.exclude_selector.as_deref() {
        Some(exclude_sel) => {
            match remove_excludes_async(&rooted, exclude_sel, CHUNK_SIZE).await {
                Some(out) => out,
                None => rooted,
            }
        }
        None => rooted,
    };

    String::from_utf8(pruned)
        .unwrap_or_else(|err| String::from_utf8_lossy(err.as_bytes()).into_owned())
}
/// Extracts the first element matching `selector` from `html` using the streaming rewriter.
///
/// The first match is wrapped in two marker comments while `html` is fed to
/// the rewriter in `chunk_size` pieces, yielding to the runtime between
/// chunks. The bytes between the markers are then returned.
///
/// Returns `None` when:
/// * `selector` is not a valid `lol_html` selector,
/// * the rewriter reports an error,
/// * nothing matched, or
/// * either marker is missing from the output (the closing marker is only
///   added when the element exposes end-tag handlers).
async fn extract_root_subtree_async(
    html: &[u8],
    selector: &str,
    chunk_size: usize,
) -> Option<Vec<u8>> {
    use lol_html::{element, html_content::ContentType};
    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::{Arc, Mutex};

    const OPEN: &str = "<!--__SPIDER_SEL_ROOT_OPEN__-->";
    const CLOSE: &str = "<!--__SPIDER_SEL_ROOT_CLOSE__-->";

    // `element!` unwraps the selector parse and would panic on an invalid
    // (caller-supplied) selector; treat that as "no match" instead.
    if selector.parse::<lol_html::Selector>().is_err() {
        return None;
    }

    let collected: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::with_capacity(html.len())));
    let sink = collected.clone();
    let matched = Arc::new(AtomicBool::new(false));
    let matched_el = matched.clone();

    let element_content_handlers = vec![element!(selector, move |el| {
        // Only the first match is marked; `swap` returns the previous value.
        if !matched_el.swap(true, Ordering::SeqCst) {
            el.before(OPEN, ContentType::Html);
            if let Some(handlers) = el.end_tag_handlers() {
                let h: lol_html::send::EndTagHandler<'static> = Box::new(|end| {
                    end.after(CLOSE, ContentType::Html);
                    Ok(())
                });
                handlers.push(h);
            }
        }
        Ok(())
    })];

    let settings = lol_html::send::RewriteStrSettings {
        element_content_handlers,
        ..lol_html::send::RewriteStrSettings::new_send()
    };

    // Scope the rewriter so its clone of the sink is dropped before `try_unwrap` below.
    {
        let mut rewriter = lol_html::send::HtmlRewriter::new(settings.into(), move |c: &[u8]| {
            if let Ok(mut g) = sink.lock() {
                g.extend_from_slice(c);
            }
        });
        let mut wrote_error = false;
        for chunk in html.chunks(chunk_size) {
            if rewriter.write(chunk).is_err() {
                wrote_error = true;
                break;
            }
            spider::tokio::task::yield_now().await;
        }
        if wrote_error {
            return None;
        }
        if rewriter.end().is_err() {
            return None;
        }
    }

    if !matched.load(Ordering::SeqCst) {
        return None;
    }

    let buf = Arc::try_unwrap(collected).ok()?.into_inner().ok()?;
    let bytes = buf.as_slice();
    let open_at = find_subseq(bytes, OPEN.as_bytes())?;
    let after_open = open_at + OPEN.len();
    // The close marker is searched for after the open marker, so `close_at >= after_open`.
    let close_at = find_subseq(&bytes[after_open..], CLOSE.as_bytes())? + after_open;
    Some(bytes[after_open..close_at].to_vec())
}
/// Removes every element matching `selector` from `html` using the streaming rewriter.
///
/// `html` is fed to the rewriter in `chunk_size` pieces, yielding to the
/// runtime between chunks. Returns the rewritten bytes, or `None` when
/// `selector` is not a valid `lol_html` selector or the rewriter reports an
/// error.
async fn remove_excludes_async(
    html: &[u8],
    selector: &str,
    chunk_size: usize,
) -> Option<Vec<u8>> {
    use lol_html::element;
    use std::sync::{Arc, Mutex};

    // `element!` unwraps the selector parse and would panic on an invalid
    // (caller-supplied) selector; report failure so the caller keeps its input.
    if selector.parse::<lol_html::Selector>().is_err() {
        return None;
    }

    let collected: Arc<Mutex<Vec<u8>>> = Arc::new(Mutex::new(Vec::with_capacity(html.len())));
    let sink = collected.clone();

    let element_content_handlers = vec![element!(selector, |el| {
        el.remove();
        Ok(())
    })];

    let settings = lol_html::send::RewriteStrSettings {
        element_content_handlers,
        ..lol_html::send::RewriteStrSettings::new_send()
    };

    // Scope the rewriter so its clone of the sink is dropped before `try_unwrap` below.
    {
        let mut rewriter = lol_html::send::HtmlRewriter::new(settings.into(), move |c: &[u8]| {
            if let Ok(mut g) = sink.lock() {
                g.extend_from_slice(c);
            }
        });
        let mut wrote_error = false;
        for chunk in html.chunks(chunk_size) {
            if rewriter.write(chunk).is_err() {
                wrote_error = true;
                break;
            }
            spider::tokio::task::yield_now().await;
        }
        if wrote_error {
            return None;
        }
        if rewriter.end().is_err() {
            return None;
        }
    }

    Arc::try_unwrap(collected).ok()?.into_inner().ok()
}
/// Returns the index of the first occurrence of `needle` in `haystack`.
///
/// An empty needle, or one longer than the haystack, yields `None`.
#[inline]
fn find_subseq(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    let width = needle.len();
    if width == 0 || haystack.len() < width {
        return None;
    }
    let last_start = haystack.len() - width;
    (0..=last_start).find(|&start| &haystack[start..start + width] == needle)
}
/// Transforms a crawled page into the format requested by `c.return_format`.
///
/// Pipeline: decode the HTML and apply `selector_config`; short-circuit for binary content;
/// strip the configured and caller-supplied selectors; optionally run readability extraction
/// and `clean_html`; finally convert to the requested format.
///
/// * `encoding` - optional encoding label used when decoding the page HTML.
/// * `selector_config` - optional root/exclude selectors.
/// * `ignore_tags` - extra selectors to remove; also forwarded to the markdown/text converters.
pub fn transform_content(
res: &Page,
c: &TransformConfig,
encoding: &Option<String>,
selector_config: &Option<SelectorConfiguration>,
ignore_tags: &Option<Vec<String>>,
) -> String {
let base_html = get_html_with_selector(res, encoding, selector_config);
let html_bytes = get_html_bytes_safe(res);
// Binary payloads: try the feature-gated document/audio converters, otherwise return the
// decoded content untouched.
if is_binary_file(&*html_bytes) {
#[cfg(feature = "document")]
{
if let Some(md) = crate::transformation::document::try_convert_document(&*html_bytes) {
return md;
}
}
#[cfg(feature = "audio")]
{
if let Some(md) = crate::transformation::audio::try_convert_audio(&*html_bytes) {
return md;
}
}
return base_html;
}
// Release the (possibly owned) byte buffer before the heavier processing below.
drop(html_bytes);
let url_parsed = res.get_url_parsed_ref();
// Remove the built-in selectors enabled by the config plus any caller-supplied ones.
let base_html = {
let mut ignore_list = build_static_vector(c);
if let Some(ignore) = ignore_tags {
ignore_list.extend(ignore.iter().map(|s| s.as_str()));
}
if ignore_list.is_empty() {
base_html
} else {
clean_html_elements(&base_html, ignore_list)
}
};
// Readability extraction; on failure the unextracted HTML is kept.
let base_html = if c.readability {
match llm_readability::extractor::extract(
&mut base_html.as_bytes(),
match url_parsed {
Some(u) => u,
_ => &EXAMPLE_URL,
},
) {
Ok(product) => product.content,
_ => base_html,
}
} else {
base_html
};
let base_html = if c.clean_html {
clean_html(&base_html)
} else {
base_html
};
// Owned set of the caller-supplied ignore tags for the markdown/text converters.
let tag_factory = ignore_tags.as_ref().map(|v| build_ignore_set(v));
match c.return_format {
ReturnFormat::Empty => Default::default(),
ReturnFormat::Screenshot => get_screenshot(&res),
ReturnFormat::Raw | ReturnFormat::Bytes => base_html,
ReturnFormat::CommonMark => {
html2md::rewrite_html_custom_with_url(&base_html, &tag_factory, true, url_parsed)
}
ReturnFormat::Markdown => {
html2md::rewrite_html_custom_with_url(&base_html, &tag_factory, false, url_parsed)
}
ReturnFormat::Html2Text => {
if !base_html.is_empty() {
crate::html2text::from_read(base_html.as_bytes(), base_html.len())
} else {
base_html
}
}
ReturnFormat::Text => super::text_extract::extract_text(&base_html, &tag_factory),
// XML conversion needs a URL string; fall back to the placeholder when the page has none.
ReturnFormat::XML => convert_html_to_xml(
base_html.trim(),
url_parsed
.as_ref()
.map(|u| u.as_str())
.unwrap_or(EXAMPLE_URL.as_str()),
encoding,
)
.unwrap_or_default(),
}
}
/// Async transform of a crawled page into the format requested by `c.return_format`.
///
/// Pages flagged as binary go through the feature-gated document/audio
/// converters and yield an empty string when neither applies. Everything else
/// is packed into a [`TransformInput`] and handed to
/// `transform_content_send_from_url_and_bytes`. HTML that lives on disk is
/// loaded asynchronously; in-memory HTML is borrowed.
pub async fn transform_content_send(
    res: &Page,
    c: &TransformConfig,
    encoding: &Option<String>,
    selector_config: &Option<SelectorConfiguration>,
    ignore_tags: &Option<Vec<String>>,
) -> String {
    use std::borrow::Cow;

    if res.binary_file {
        let bytes: Cow<'_, [u8]> = if res.is_html_on_disk() {
            Cow::Owned(res.get_html_async().await.into_bytes())
        } else {
            get_html_bytes_safe(res)
        };
        #[cfg(feature = "document")]
        if let Some(md) = crate::transformation::document::try_convert_document(&bytes) {
            return md;
        }
        #[cfg(feature = "audio")]
        if let Some(md) = crate::transformation::audio::try_convert_audio(&bytes) {
            return md;
        }
        return String::new();
    }

    // One buffer for both storage modes: owned when read from disk, borrowed otherwise.
    let content: Cow<'_, [u8]> = if res.is_html_on_disk() {
        Cow::Owned(res.get_html_async().await.into_bytes())
    } else {
        get_html_bytes_safe(res)
    };

    let tag_refs: Option<Vec<&str>> = ignore_tags
        .as_ref()
        .map(|tags| tags.iter().map(|tag| tag.as_str()).collect());

    let screenshot_bytes: Option<&[u8]> = {
        #[cfg(feature = "screenshot")]
        {
            res.screenshot_bytes.as_deref()
        }
        #[cfg(not(feature = "screenshot"))]
        {
            None
        }
    };

    let input = TransformInput {
        url: res.get_url_parsed_ref().as_ref(),
        content: &content,
        screenshot_bytes,
        encoding: encoding.as_deref(),
        selector_config: selector_config.as_ref(),
        ignore_tags: tag_refs.as_deref(),
    };

    transform_content_send_from_url_and_bytes(input, c).await
}
/// Async transform of raw content bytes into the format requested by `c.return_format`.
///
/// Pipeline: decode `input.content` and apply the selectors; short-circuit for binary content;
/// strip the configured and caller-supplied selectors; optionally run readability extraction
/// and `clean_html`; finally convert to the requested format using the streaming converters
/// where they are used below.
pub async fn transform_content_send_from_url_and_bytes(
input: TransformInput<'_>,
c: &TransformConfig,
) -> String {
use std::collections::HashSet;
let base_html =
get_html_with_selector_bytes_async(input.content, input.encoding, input.selector_config)
.await;
// Binary payloads: try the feature-gated document/audio converters, otherwise return the
// decoded content untouched.
if is_binary_file(input.content) {
#[cfg(feature = "document")]
{
if let Some(md) = crate::transformation::document::try_convert_document(input.content) {
return md;
}
}
#[cfg(feature = "audio")]
{
if let Some(md) = crate::transformation::audio::try_convert_audio(input.content) {
return md;
}
}
return base_html;
}
// Remove the built-in selectors enabled by the config plus any caller-supplied ones.
let base_html = {
let mut ignore_list = build_static_vector(c);
if let Some(ignore) = input.ignore_tags {
ignore_list.extend(ignore.iter().copied());
}
if ignore_list.is_empty() {
base_html
} else {
clean_html_elements(&base_html, ignore_list)
}
};
// Readability extraction works on owned copies of the bytes and URL; on failure the
// unextracted HTML is kept.
let base_html = if c.readability {
let url = input.url.unwrap_or(&EXAMPLE_URL).clone();
let bytes = base_html.as_bytes().to_vec();
match llm_readability::extractor::extract_async(bytes, url).await {
Ok(product) => product.content,
Err(_) => base_html,
}
} else {
base_html
};
let base_html = if c.clean_html {
clean_html(&base_html)
} else {
base_html
};
// Owned set of the caller-supplied ignore tags for the markdown/text converters.
let tag_factory: Option<HashSet<String>> = input
.ignore_tags
.map(|ignore| build_ignore_set_from_strs(ignore));
match c.return_format {
ReturnFormat::Empty => String::new(),
// Without the `screenshot` feature this format always yields an empty string.
ReturnFormat::Screenshot => {
#[cfg(feature = "screenshot")]
{
screenshot_base64_urlsafe(input.screenshot_bytes)
}
#[cfg(not(feature = "screenshot"))]
{
String::new()
}
}
ReturnFormat::Raw | ReturnFormat::Bytes => base_html,
ReturnFormat::CommonMark => {
html2md::rewrite_html_custom_with_url_streaming(
&base_html,
&tag_factory,
true,
&input.url.cloned(),
)
.await
}
ReturnFormat::Markdown => {
html2md::rewrite_html_custom_with_url_streaming(
&base_html,
&tag_factory,
false,
&input.url.cloned(),
)
.await
}
ReturnFormat::Html2Text => {
if !base_html.is_empty() {
crate::html2text::from_read(base_html.as_bytes(), base_html.len())
} else {
base_html
}
}
ReturnFormat::Text => {
super::text_extract::extract_text_streaming(&base_html, &tag_factory).await
}
// XML conversion needs a URL string; fall back to the placeholder when none was given.
ReturnFormat::XML => convert_html_to_xml(
base_html.trim(),
input
.url
.map(|u| u.as_str())
.unwrap_or(EXAMPLE_URL.as_str()),
&input.encoding.map(|s| s.to_string()),
)
.unwrap_or_default(),
}
}
/// Synchronous transform of raw content bytes into the format requested by `c.return_format`.
///
/// Pipeline: decode `input.content` and apply the selectors; short-circuit for binary content;
/// strip the configured and caller-supplied selectors; optionally run readability extraction
/// and `clean_html`; finally convert to the requested format.
#[inline]
pub fn transform_content_input(input: TransformInput<'_>, c: &TransformConfig) -> String {
use std::collections::HashSet;
let base_html =
get_html_with_selector_bytes(input.content, input.encoding, input.selector_config);
// Binary payloads: try the feature-gated document/audio converters, otherwise return the
// decoded content untouched.
if is_binary_file(input.content) {
#[cfg(feature = "document")]
{
if let Some(md) = crate::transformation::document::try_convert_document(input.content) {
return md;
}
}
#[cfg(feature = "audio")]
{
if let Some(md) = crate::transformation::audio::try_convert_audio(input.content) {
return md;
}
}
return base_html;
}
// Remove the built-in selectors enabled by the config plus any caller-supplied ones.
let base_html = {
let mut ignore_list = build_static_vector(c);
if let Some(ignore) = input.ignore_tags {
ignore_list.extend(ignore.iter().copied());
}
if ignore_list.is_empty() {
base_html
} else {
clean_html_elements(&base_html, ignore_list)
}
};
// Readability extraction; on failure the unextracted HTML is kept.
let base_html = if c.readability {
match llm_readability::extractor::extract(
&mut base_html.as_bytes(),
input.url.unwrap_or(&EXAMPLE_URL),
) {
Ok(product) => product.content,
Err(_) => base_html,
}
} else {
base_html
};
let base_html = if c.clean_html {
clean_html(&base_html)
} else {
base_html
};
// Owned set of the caller-supplied ignore tags for the markdown/text converters.
let tag_factory: Option<HashSet<String>> = input
.ignore_tags
.map(|ignore| build_ignore_set_from_strs(ignore));
match c.return_format {
ReturnFormat::Empty => String::new(),
// Without the `screenshot` feature this format always yields an empty string.
ReturnFormat::Screenshot => {
#[cfg(feature = "screenshot")]
{
screenshot_base64_urlsafe(input.screenshot_bytes)
}
#[cfg(not(feature = "screenshot"))]
{
String::new()
}
}
ReturnFormat::Raw | ReturnFormat::Bytes => base_html,
ReturnFormat::CommonMark => html2md::rewrite_html_custom_with_url(
&base_html,
&tag_factory,
true,
&input.url.cloned(),
),
ReturnFormat::Markdown => html2md::rewrite_html_custom_with_url(
&base_html,
&tag_factory,
false,
&input.url.cloned(),
),
ReturnFormat::Html2Text => {
if !base_html.is_empty() {
crate::html2text::from_read(base_html.as_bytes(), base_html.len())
} else {
base_html
}
}
ReturnFormat::Text => super::text_extract::extract_text(&base_html, &tag_factory),
// XML conversion needs a URL string; fall back to the placeholder when none was given.
ReturnFormat::XML => convert_html_to_xml(
base_html.trim(),
input
.url
.map(|u| u.as_str())
.unwrap_or(EXAMPLE_URL.as_str()),
&input.encoding.map(|s| s.to_string()),
)
.unwrap_or_default(),
}
}
/// Encodes optional screenshot bytes as URL-safe base64.
///
/// Returns an empty string when there is no screenshot. The output buffer is
/// pre-sized from `base64::encoded_len` (padding included).
#[cfg(feature = "screenshot")]
#[inline]
fn screenshot_base64_urlsafe(screenshot_bytes: Option<&[u8]>) -> String {
    use base64::{engine::general_purpose, Engine as _};

    match screenshot_bytes {
        None => String::new(),
        Some(raw) => {
            let capacity = base64::encoded_len(raw.len(), true).unwrap_or(0);
            let mut encoded = String::with_capacity(capacity);
            general_purpose::URL_SAFE.encode_string(raw, &mut encoded);
            encoded
        }
    }
}
/// Byte-returning wrapper around `transform_content`.
///
/// Non-binary pages are transformed by `transform_content` and returned as
/// UTF-8 bytes. Binary pages (flagged on the page or detected from the bytes)
/// first try the feature-gated document/audio converters, then fall back to
/// the page's raw bytes, or an empty vector when there are none.
pub fn transform_content_to_bytes(
    res: &Page,
    c: &TransformConfig,
    encoding: &Option<String>,
    selector_config: &Option<SelectorConfiguration>,
    ignore_tags: &Option<Vec<String>>,
) -> Vec<u8> {
    let html_raw = get_html_bytes_safe(res);

    if !res.binary_file && !is_binary_file(&html_raw) {
        return transform_content(res, c, encoding, selector_config, ignore_tags).into_bytes();
    }

    #[cfg(feature = "document")]
    {
        if let Some(md) = crate::transformation::document::try_convert_document(&html_raw) {
            return md.into_bytes();
        }
    }
    #[cfg(feature = "audio")]
    {
        if let Some(md) = crate::transformation::audio::try_convert_audio(&html_raw) {
            return md.into_bytes();
        }
    }

    res.get_bytes().map(|raw| raw.to_vec()).unwrap_or_default()
}