html2md/lib.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
use extended::sifter::WhitespaceSifterBytes;
use lazy_static::lazy_static;
pub use markup5ever_rcdom::{Handle, NodeData, RcDom};
use regex::Regex;
use std::collections::HashSet;
use url::Url;
// we want to just use the rewriter instead for v0.1.
pub mod extended;
pub mod rewriter;
pub mod scraper;
use extended::sifter::WhitespaceSifter;
pub use scraper::ignore;
pub use scraper::{
parse_html, parse_html_custom, parse_html_custom_base, parse_html_custom_with_url,
parse_html_extended,
};
lazy_static! {
    /// Matches a single character that needs a backslash escape in Markdown text:
    /// `<`, `>`, `*`, `\`, `_` or `~`.
    static ref MARKDOWN_MIDDLE_KEYCHARS: Regex =
        Regex::new(r"[<>*\\_~]").expect("valid regex pattern");
    /// Matches text that contains anything requiring Markdown clean-up: either one of the
    /// key characters above or the literal `&nbsp;` HTML entity.
    static ref MARKDOWN_MIDDLE_KEYCHARS_SET: regex::RegexSet = regex::RegexSet::new(&[
        r"[<>*\\_~]", // any single Markdown key character
        r"&nbsp;",    // the entire `&nbsp;` entity, not a bare blank
    ])
    .expect("valid regex set");
}
/// Main entry point of the rewriter-based conversion: turns HTML into Markdown.
///
/// Runs the rewriter with no custom tag set and no base URL. If the conversion does not
/// produce a value, an empty string is returned. Incomplete work in progress for major
/// performance increases.
/// # Arguments
/// `html` is the source HTML as `&str`
/// `commonmark` is for adjusting markdown output to commonmark
pub fn rewrite_html(html: &str, commonmark: bool) -> String {
    let converted = rewriter::writer::convert_html_to_markdown(html, &None, commonmark, &None);
    converted.unwrap_or_default()
}
/// Async streaming entry point of the rewriter-based conversion: turns HTML into Markdown.
///
/// Awaits the `_send` variant of the rewriter with no custom tag set and no base URL. If the
/// conversion does not produce a value, an empty string is returned. Incomplete work in
/// progress for major performance increases.
/// # Arguments
/// `html` is the source HTML as `&str`
/// `commonmark` is for adjusting markdown output to commonmark
#[cfg(feature = "tokio")]
pub async fn rewrite_html_streaming(html: &str, commonmark: bool) -> String {
    let converted =
        rewriter::writer::convert_html_to_markdown_send(html, &None, commonmark, &None).await;
    converted.unwrap_or_default()
}
/// Custom variant of the rewrite function.
///
/// You can also override standard tag handlers this way. If the conversion does not produce
/// a value, an empty string is returned.
/// # Arguments
/// `html` is the source HTML as `&str`
/// `custom` is custom tag handler producers for tags you want, can be empty
/// `commonmark` is for adjusting markdown output to commonmark
/// `url` is used to provide absolute url handling
// NOTE(review): this function is synchronous and calls the same converter as `rewrite_html`,
// yet it is gated on the `tokio` feature - confirm the gate is intended.
#[cfg(feature = "tokio")]
pub fn rewrite_html_custom_with_url(
    html: &str,
    custom: &Option<HashSet<String>>,
    commonmark: bool,
    url: &Option<Url>,
) -> String {
    // `custom` is already a reference; forward it as-is instead of borrowing it again.
    rewriter::writer::convert_html_to_markdown(html, custom, commonmark, url).unwrap_or_default()
}
/// Custom async variant of the rewrite function with a caller-chosen chunk size.
///
/// You can also override standard tag handlers this way. If the conversion does not produce
/// a value, an empty string is returned.
/// # Arguments
/// `html` is the source HTML as `&str`
/// `custom` is custom tag handler producers for tags you want, can be empty
/// `commonmark` is for adjusting markdown output to commonmark
/// `url` is used to provide absolute url handling
/// `chunk_size` is the chunk size forwarded to the rewriter
#[cfg(feature = "tokio")]
pub async fn rewrite_html_custom_with_url_and_chunk(
    html: &str,
    custom: &Option<HashSet<String>>,
    commonmark: bool,
    url: &Option<Url>,
    chunk_size: usize,
) -> String {
    // `custom` is already a reference; forward it as-is instead of borrowing it again.
    rewriter::writer::convert_html_to_markdown_send_with_size(
        html, custom, commonmark, url, chunk_size,
    )
    .await
    .unwrap_or_default()
}
/// Custom variant of the rewrite function, streaming async.
///
/// You can also override standard tag handlers this way. If the conversion does not produce
/// a value, an empty string is returned.
/// # Arguments
/// `html` is the source HTML as `&str`
/// `custom` is custom tag handler producers for tags you want, can be empty
/// `commonmark` is for adjusting markdown output to commonmark
/// `url` is used to provide absolute url handling
#[cfg(feature = "tokio")]
pub async fn rewrite_html_custom_with_url_streaming(
    html: &str,
    custom: &Option<HashSet<String>>,
    commonmark: bool,
    url: &Option<Url>,
) -> String {
    // `custom` is already a reference; forward it as-is instead of borrowing it again.
    rewriter::writer::convert_html_to_markdown_send(html, custom, commonmark, url)
        .await
        .unwrap_or_default()
}
/// Called after all processing has been finished
///
/// Clears excessive punctuation that would be trimmed by renderer anyway
///
/// # Arguments
/// `input` is the converted Markdown text
///
/// # Returns
/// The `String` produced by calling `sift()` on `input`.
// NOTE(review): `sift()` presumably comes from the imported `WhitespaceSifter` trait, which by
// its name collapses whitespace rather than punctuation - confirm against `extended::sifter`.
pub fn clean_markdown(input: &str) -> String {
input.sift()
}
/// Called after all processing has been finished
///
/// Byte-buffer counterpart of `clean_markdown`.
/// Clears excessive punctuation that would be trimmed by renderer anyway
///
/// # Arguments
/// `input` is the converted Markdown as raw bytes
///
/// # Returns
/// The `String` produced by calling `sift_bytes()` on `input`.
// NOTE(review): `sift_bytes()` presumably comes from the imported `WhitespaceSifterBytes`
// trait - confirm. If that trait is also implemented for slices, taking `&[u8]` here would
// be the more general signature.
pub fn clean_markdown_bytes(input: &Vec<u8>) -> String {
input.sift_bytes()
}
/// Replace the markdown chars cleanly.
///
/// Every Markdown key character (`<`, `>`, `*`, `\`, `_`, `~`) is prefixed with a backslash
/// so it renders literally, and every literal `&nbsp;` entity is removed. All other text,
/// including other HTML entities such as `&amp;`, is copied through exactly once.
///
/// # Arguments
/// `input` is the text to escape
///
/// # Returns
/// A new `String`; equal to `input` when it contains neither a key character nor `&nbsp;`.
pub fn replace_markdown_chars(input: &str) -> String {
    const NBSP_ENTITY: &str = "&nbsp;";
    const KEYCHARS: &[char] = &['<', '>', '*', '\\', '_', '~'];

    // Fast path: nothing to escape or strip, so hand back a plain copy.
    if !input.contains(KEYCHARS) && !input.contains(NBSP_ENTITY) {
        return input.to_string();
    }

    // A little headroom for the backslashes that are about to be inserted.
    let mut output = String::with_capacity(input.len() + 8);
    let mut rest = input;

    while let Some(ch) = rest.chars().next() {
        // Drop the whole entity in one step; nothing of it reaches the output.
        if let Some(after_entity) = rest.strip_prefix(NBSP_ENTITY) {
            rest = after_entity;
            continue;
        }

        if KEYCHARS.contains(&ch) {
            output.push('\\');
        }
        output.push(ch);

        // Advance by the full UTF-8 width of `ch` so `rest` stays on a char boundary.
        rest = &rest[ch.len_utf8()..];
    }

    output
}