use crate::{error::LemmyResult, settings::SETTINGS, LemmyErrorType};
use markdown_it::{
parser::linkfmt::LinkFormatter,
plugins::cmark::{
block::fence,
inline::{image, image::Image},
},
MarkdownIt,
};
use regex::RegexSet;
use std::sync::LazyLock;
use url::Url;
use urlencoding::encode;
mod link_rule;
mod spoiler_rule;
/// Shared markdown parser used for rendering posts/comments to HTML:
/// CommonMark plus the `extra` plugin set, extended with Lemmy's custom
/// spoiler-container and link rules. Built once on first use.
static MARKDOWN_PARSER: LazyLock<MarkdownIt> = LazyLock::new(|| {
  let mut md = MarkdownIt::new();
  // Registration order is kept as-is; rule priority can depend on it.
  markdown_it::plugins::cmark::add(&mut md);
  markdown_it::plugins::extra::add(&mut md);
  spoiler_rule::add(&mut md);
  link_rule::add(&mut md);
  md
});
/// Escapes HTML special characters so `text` can be embedded in HTML safely.
///
/// `&` must be replaced first so the entities inserted by the later
/// replacements are not themselves double-escaped.
/// NOTE: `>` is intentionally not touched here — only `&`, `<`, `"` and `'`
/// are escaped, matching the expectations in `test_sanitize_html`.
pub fn sanitize_html(text: &str) -> String {
  text
    .replace('&', "&amp;")
    .replace('<', "&lt;")
    .replace('\"', "&quot;")
    .replace('\'', "&#x27;")
}
/// Renders markdown `text` to an HTML string using the shared
/// [`MARKDOWN_PARSER`] (CommonMark + extra + Lemmy's custom rules).
pub fn markdown_to_html(text: &str) -> String {
  let ast = MARKDOWN_PARSER.parse(text);
  ast.xrender()
}
/// Rewrites every image link in `src` markdown to go through this instance's
/// image proxy (`/api/v3/image_proxy?url=...`), returning the rewritten
/// markdown plus every image URL that parsed successfully.
///
/// Images whose domain equals `SETTINGS.hostname` are collected but left
/// unproxied; image URLs that fail to parse are removed (leaving `![label]()`).
pub fn markdown_rewrite_image_links(mut src: String) -> (String, Vec<Url>) {
// Minimal parser: only images, fenced code blocks and the custom link rule are
// registered, so nothing else can produce Image nodes or shift srcmap offsets.
static PARSER: LazyLock<MarkdownIt> = LazyLock::new(|| {
let mut p = MarkdownIt::new();
// Keep URLs byte-for-byte as written in the source text; the offset
// arithmetic below assumes no normalization happened.
p.link_formatter = Box::new(NoopLinkFormatter {});
image::add(&mut p);
fence::add(&mut p);
link_rule::add(&mut p);
p
});
let ast = PARSER.parse(&src);
// Collect (start, end) byte ranges covering `url` (and the optional ` "title"`)
// inside each `![label](url "title")` occurrence.
let mut links_offsets = vec![];
ast.walk(|node, _depth| {
if let Some(image) = node.cast::<Image>() {
// srcmap is expected to exist for parsed inline nodes; panics otherwise.
let node_offsets = node.srcmap.expect("srcmap is none").get_byte_offsets();
// Work backwards from the node's end offset: skip the closing `)` (the `- 1`)
// and, when a title is present, its text plus 3 bytes (space + two quotes).
let start_offset = node_offsets.1
- image.url.len()
- 1
- image
.title
.as_ref()
.map(|t| t.len() + 3)
.unwrap_or_default();
let end_offset = node_offsets.1 - 1;
links_offsets.push((start_offset, end_offset));
}
});
let mut links = vec![];
// Process ranges back-to-front so earlier offsets stay valid while splicing
// replacement text into `src`.
while let Some((start, end)) = links_offsets.pop() {
let content = src.get(start..end).unwrap_or_default();
// Split `url "title"` at the first space into the URL and the trailing title.
let (url, extra) = if content.contains(' ') {
let split = content.split_once(' ').expect("split is valid");
(split.0, Some(split.1))
} else {
(content, None)
};
match Url::parse(url) {
Ok(parsed) => {
links.push(parsed.clone());
// Local images are left as-is; everything else is routed via the proxy.
if parsed.domain() != Some(&SETTINGS.hostname) {
let mut proxied = format!(
"{}/api/v3/image_proxy?url={}",
SETTINGS.get_protocol_and_hostname(),
encode(url),
);
// Re-attach the title so the markdown stays `![label](proxied "title")`.
if let Some(extra) = extra {
proxied = format!("{proxied} {extra}");
}
src.replace_range(start..end, &proxied);
}
}
Err(_) => {
// Unparseable URL: strip it entirely, leaving an empty link target.
src.replace_range(start..end, "");
}
}
}
(src, links)
}
/// Returns `Err(LemmyErrorType::BlockedUrl)` when `text` matches any pattern
/// in the URL `blocklist`, `Ok(())` otherwise.
pub fn markdown_check_for_blocked_urls(text: &str, blocklist: &RegexSet) -> LemmyResult<()> {
  match blocklist.is_match(text) {
    // `?` performs the usual conversion into the LemmyResult error type.
    true => Err(LemmyErrorType::BlockedUrl)?,
    false => Ok(()),
  }
}
/// A [`LinkFormatter`] that performs no validation or normalization at all,
/// leaving URLs and link text exactly as they appear in the source markdown.
#[derive(Debug)]
struct NoopLinkFormatter;

impl LinkFormatter for NoopLinkFormatter {
  /// Accept every link unconditionally.
  fn validate_link(&self, _url: &str) -> Option<()> {
    Some(())
  }

  /// Return the URL unchanged.
  fn normalize_link(&self, url: &str) -> String {
    url.to_string()
  }

  /// Return the link text unchanged.
  fn normalize_link_text(&self, url: &str) -> String {
    url.to_string()
  }
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
#[allow(clippy::indexing_slicing)]
mod tests {
use super::*;
use crate::utils::validation::check_urls_are_valid;
use pretty_assertions::assert_eq;
use regex::escape;
#[test]
fn test_basic_markdown() {
  // (case name, markdown input, expected rendered HTML)
  let tests: Vec<_> = vec![
    (
      "links",
      "[Lemmy](https://join-lemmy.org/ \"Join Lemmy!\")",
      "<p><a href=\"https://join-lemmy.org/\" rel=\"nofollow\" title=\"Join Lemmy!\">Lemmy</a></p>\n"
    ),
    (
      "images",
      "![My linked image](https://example.com/image.png \"image alt text\")",
      "<p><img src=\"https://example.com/image.png\" alt=\"My linked image\" title=\"image alt text\" /></p>\n"
    ),
    (
      "images",
      "![My linked image](https://lemmy-alpha/image.png \"image alt text\")",
      "<p><img src=\"https://lemmy-alpha/image.png\" alt=\"My linked image\" title=\"image alt text\" /></p>\n"
    ),
    (
      "basic spoiler",
      "::: spoiler click to see more\nhow spicy!\n:::\n",
      "<details><summary>click to see more</summary><p>how spicy!\n</p></details>\n"
    ),
    (
      "escape html special chars",
      "<script>alert('xss');</script> hello &\"",
      // Smartquotes turns ' and " into curly quotes; < and & are entity-escaped.
      "<p>&lt;script&gt;alert(‘xss’);&lt;/script&gt; hello &amp;”</p>\n"
    )
  ];
  tests.iter().for_each(|&(msg, input, expected)| {
    let result = markdown_to_html(input);
    assert_eq!(
      result, expected,
      "Testing {}, with original input '{}'",
      msg, input
    );
  });
}
#[test]
fn test_markdown_proxy_images() {
  // (case name, input markdown, expected markdown after image-proxy rewrite)
  let tests: Vec<_> =
    vec![
      (
        "remote image proxied",
        "![link](http://example.com/image.jpg)",
        "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
      ),
      (
        "local image unproxied",
        "![link](http://lemmy-alpha/image.jpg)",
        "![link](http://lemmy-alpha/image.jpg)",
      ),
      (
        "multiple image links",
        "![link](http://example.com/image1.jpg) ![link](http://example.com/image2.jpg)",
        "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage1.jpg) ![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage2.jpg)",
      ),
      (
        "empty link handled",
        "![image]()",
        "![image]()"
      ),
      (
        "empty label handled",
        "![](http://example.com/image.jpg)",
        "![](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)"
      ),
      (
        "invalid image link removed",
        "![image](http-not-a-link)",
        "![image]()"
      ),
      (
        "label with nested markdown handled",
        "![a *b* c](http://example.com/image.jpg)",
        "![a *b* c](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)"
      ),
      (
        "custom emoji support",
        r#"![party-blob](https://www.hexbear.net/pictrs/image/83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#,
        r#"![party-blob](https://lemmy-alpha/api/v3/image_proxy?url=https%3A%2F%2Fwww.hexbear.net%2Fpictrs%2Fimage%2F83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#
      ),
      // Multibyte text before the image exercises the byte-offset arithmetic.
      (
        "image with special chars",
        "ითხოვს ![image](http://example.com/image.jpg)",
        "ითხოვს ![image](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
      ),
    ];
  tests.iter().for_each(|&(msg, input, expected)| {
    let result = markdown_rewrite_image_links(input.to_string());
    assert_eq!(
      result.0, expected,
      "Testing {}, with original input '{}'",
      msg, input
    );
  });
}
/// Builds a `RegexSet` from blocklist `patterns`: each pattern is validated as
/// a URL, then matched as a word-bounded literal (`\b<escaped>\b`).
fn create_url_blocklist_test_regex_set(patterns: Vec<&str>) -> LemmyResult<RegexSet> {
  let url_blocklist: Vec<String> = patterns.iter().map(|&s| String::from(s)).collect();
  let valid_urls = check_urls_are_valid(&url_blocklist)?;
  let set = RegexSet::new(
    valid_urls
      .iter()
      .map(|url| format!(r"\b{}\b", escape(url))),
  )?;
  Ok(set)
}
#[test]
fn test_url_blocking() -> LemmyResult<()> {
  let set = create_url_blocklist_test_regex_set(vec!["example.com/"])?;
  assert!(
    markdown_check_for_blocked_urls(&String::from("[](https://example.com)"), &set).is_err()
  );
  assert!(markdown_check_for_blocked_urls(
    &String::from("Go to https://example.com to get free Robux"),
    &set
  )
  .is_err());
  assert!(
    markdown_check_for_blocked_urls(&String::from("[](https://example.blog)"), &set).is_ok()
  );
  assert!(markdown_check_for_blocked_urls(&String::from("example.com"), &set).is_err());
  assert!(markdown_check_for_blocked_urls(
    "Odio exercitationem culpa sed sunt
    et. Sit et similique tempora deserunt doloremque. Cupiditate iusto
    repellat et quis qui. Cum veritatis facere quasi repellendus sunt
    eveniet nemo sint. Cumque sit unde est. https://example.com Alias
    repellendus at quos.",
    &set
  )
  .is_err());
  // Blocking a specific path: matches the exact file, not longer word runs.
  let set = create_url_blocklist_test_regex_set(vec!["example.com/spam.jpg"])?;
  assert!(markdown_check_for_blocked_urls("![](https://example.com/spam.jpg)", &set).is_err());
  assert!(markdown_check_for_blocked_urls("![](https://example.com/spam.jpg1)", &set).is_ok());
  assert!(
    markdown_check_for_blocked_urls("![](https://example.com/spam.jpg.html)", &set).is_err()
  );
  let set = create_url_blocklist_test_regex_set(vec![
    r"quo.example.com/",
    r"foo.example.com/",
    r"bar.example.com/",
  ])?;
  assert!(markdown_check_for_blocked_urls("https://baz.example.com", &set).is_ok());
  assert!(markdown_check_for_blocked_urls("https://bar.example.com", &set).is_err());
  let set = create_url_blocklist_test_regex_set(vec!["example.com/banned_page"])?;
  assert!(markdown_check_for_blocked_urls("https://example.com/page", &set).is_ok());
  // Dots in patterns are escaped: "ex.mple" must not match "example".
  let set = create_url_blocklist_test_regex_set(vec!["ex.mple.com/"])?;
  assert!(markdown_check_for_blocked_urls("example.com", &set).is_ok());
  // Word boundaries: "rt.com" must not match inside "deviantart.com" etc.
  let set = create_url_blocklist_test_regex_set(vec!["rt.com/"])?;
  assert!(markdown_check_for_blocked_urls("deviantart.com", &set).is_ok());
  assert!(markdown_check_for_blocked_urls("art.com.example.com", &set).is_ok());
  assert!(markdown_check_for_blocked_urls("https://rt.com/abc", &set).is_err());
  assert!(markdown_check_for_blocked_urls("go to rt.com.", &set).is_err());
  assert!(markdown_check_for_blocked_urls("check out rt.computer", &set).is_ok());
  // TODO: known false positive — subdomain-like suffix still matches.
  assert!(markdown_check_for_blocked_urls("rt.com.example.com", &set).is_err());
  Ok(())
}
#[test]
fn test_sanitize_html() {
  // HTML-special characters must come back entity-escaped (`>` stays as-is).
  let sanitized = sanitize_html("<script>alert('xss');</script> hello &\"'");
  let expected = "&lt;script>alert(&#x27;xss&#x27;);&lt;/script> hello &amp;&quot;&#x27;";
  assert_eq!(expected, sanitized);
  // Ordinary text: only the apostrophe is escaped.
  let sanitized =
    sanitize_html("Polling the group: what do y'all know about the Orion browser from Kagi?");
  let expected = "Polling the group: what do y&#x27;all know about the Orion browser from Kagi?";
  assert_eq!(expected, sanitized);
}
}