//! lemmy_utils 0.19.17
//!
//! Markdown utilities for Lemmy, a link aggregator for the fediverse.
use crate::{error::LemmyResult, settings::SETTINGS, LemmyErrorType};
use markdown_it::{
  parser::linkfmt::LinkFormatter,
  plugins::cmark::{
    block::fence,
    inline::{image, image::Image},
  },
  MarkdownIt,
};
use regex::RegexSet;
use std::sync::LazyLock;
use url::Url;
use urlencoding::encode;

mod link_rule;
mod spoiler_rule;

/// Shared, lazily-initialized parser used for rendering user markdown to HTML.
static MARKDOWN_PARSER: LazyLock<MarkdownIt> = LazyLock::new(|| {
  let mut p = MarkdownIt::new();
  // Base CommonMark syntax plus the crate's "extra" plugin bundle.
  markdown_it::plugins::cmark::add(&mut p);
  markdown_it::plugins::extra::add(&mut p);
  // Lemmy-specific extensions: `::: spoiler` blocks and the custom link rule.
  spoiler_rule::add(&mut p);
  link_rule::add(&mut p);
  p
});

/// Replace special HTML characters in API parameters to prevent XSS attacks.
///
/// Taken from https://github.com/OWASP/CheatSheetSeries/blob/master/cheatsheets/Cross_Site_Scripting_Prevention_Cheat_Sheet.md#output-encoding-for-html-contexts
///
/// `>` is left in place because it is interpreted as markdown quote.
pub fn sanitize_html(text: &str) -> String {
  // Single pass over the input; equivalent to chaining `str::replace` calls
  // because no replacement string contains a character that is escaped later.
  let mut escaped = String::with_capacity(text.len());
  for ch in text.chars() {
    match ch {
      '&' => escaped.push_str("&amp;"),
      '<' => escaped.push_str("&lt;"),
      '"' => escaped.push_str("&quot;"),
      '\'' => escaped.push_str("&#x27;"),
      other => escaped.push(other),
    }
  }
  escaped
}

/// Render user-supplied markdown to an HTML string using the shared parser.
pub fn markdown_to_html(text: &str) -> String {
  let ast = MARKDOWN_PARSER.parse(text);
  ast.xrender()
}

/// Rewrites all links to remote domains in markdown, so they go through `/api/v3/image_proxy`.
///
/// Returns the rewritten markdown together with every image URL that parsed successfully
/// (both proxied and local ones). Image links that are not valid URLs are removed from
/// the markdown entirely.
pub fn markdown_rewrite_image_links(mut src: String) -> (String, Vec<Url>) {
  // Use separate markdown parser here, with most features disabled for faster parsing,
  // and a dummy link formatter which doesnt normalize links.
  static PARSER: LazyLock<MarkdownIt> = LazyLock::new(|| {
    let mut p = MarkdownIt::new();
    p.link_formatter = Box::new(NoopLinkFormatter {});
    image::add(&mut p);
    // NOTE(review): fence is presumably enabled so image-like text inside code
    // blocks is parsed as code rather than as an image — confirm intent.
    fence::add(&mut p);
    link_rule::add(&mut p);
    p
  });

  let ast = PARSER.parse(&src);
  // Byte ranges (start, end) of the `url` / `url "title"` portion of each image node.
  let mut links_offsets = vec![];

  // Walk the syntax tree to find positions of image links
  ast.walk(|node, _depth| {
    if let Some(image) = node.cast::<Image>() {
      // srcmap is always present for image
      // https://github.com/markdown-it-rust/markdown-it/issues/36#issuecomment-1777844387
      let node_offsets = node.srcmap.expect("srcmap is none").get_byte_offsets();
      // Work backwards from the node's end offset: skip the closing `)` (the `- 1`),
      // the url itself, and — when a title exists — ` "title"` (title length + 3 for
      // the space and the two quotes).
      // necessary for custom emojis which look like `![name](url "title")`
      let start_offset = node_offsets.1
        - image.url.len()
        - 1
        - image
          .title
          .as_ref()
          .map(|t| t.len() + 3)
          .unwrap_or_default();
      let end_offset = node_offsets.1 - 1;

      links_offsets.push((start_offset, end_offset));
    }
  });

  let mut links = vec![];
  // Go through the collected links in reverse order
  // (popping from the back keeps earlier byte offsets valid while `src` is edited in place).
  while let Some((start, end)) = links_offsets.pop() {
    let content = src.get(start..end).unwrap_or_default();
    // necessary for custom emojis which look like `![name](url "title")`
    let (url, extra) = if content.contains(' ') {
      let split = content.split_once(' ').expect("split is valid");
      (split.0, Some(split.1))
    } else {
      (content, None)
    };
    match Url::parse(url) {
      Ok(parsed) => {
        links.push(parsed.clone());
        // If link points to remote domain, replace with proxied link
        if parsed.domain() != Some(&SETTINGS.hostname) {
          let mut proxied = format!(
            "{}/api/v3/image_proxy?url={}",
            SETTINGS.get_protocol_and_hostname(),
            encode(url),
          );
          // restore custom emoji format
          if let Some(extra) = extra {
            proxied = format!("{proxied} {extra}");
          }
          src.replace_range(start..end, &proxied);
        }
      }
      Err(_) => {
        // If its not a valid url, replace with empty text
        src.replace_range(start..end, "");
      }
    }
  }

  (src, links)
}

/// Return `Err(LemmyErrorType::BlockedUrl)` if `text` matches any pattern in the blocklist.
pub fn markdown_check_for_blocked_urls(text: &str, blocklist: &RegexSet) -> LemmyResult<()> {
  if blocklist.is_match(text) {
    return Err(LemmyErrorType::BlockedUrl.into());
  }
  Ok(())
}

/// markdown-it normalizes links by default, which breaks the link rewriting. So we use a dummy
/// formatter here which does nothing. Note this isnt actually used to render the markdown.
#[derive(Debug)]
struct NoopLinkFormatter;

impl LinkFormatter for NoopLinkFormatter {
  /// Accept every link without validation.
  fn validate_link(&self, _url: &str) -> Option<()> {
    Some(())
  }

  /// Pass the URL through unchanged (no normalization or percent-encoding).
  fn normalize_link(&self, url: &str) -> String {
    String::from(url)
  }

  /// Pass the link text through unchanged.
  fn normalize_link_text(&self, url: &str) -> String {
    String::from(url)
  }
}

#[cfg(test)]
#[allow(clippy::unwrap_used)]
#[allow(clippy::indexing_slicing)]
mod tests {

  use super::*;
  use crate::utils::validation::check_urls_are_valid;
  use pretty_assertions::assert_eq;
  use regex::escape;

  /// Table-driven check of `markdown_to_html`: each case is
  /// (label, markdown input, exact expected HTML output).
  #[test]
  fn test_basic_markdown() {
    let tests: Vec<_> = vec![
      (
        "headings",
        "# h1\n## h2\n### h3\n#### h4\n##### h5\n###### h6",
        "<h1>h1</h1>\n<h2>h2</h2>\n<h3>h3</h3>\n<h4>h4</h4>\n<h5>h5</h5>\n<h6>h6</h6>\n"
      ),
      (
        "line breaks",
        "First\rSecond",
        "<p>First\nSecond</p>\n"),
      (
        "emphasis",
        "__bold__ **bold** *italic* ***bold+italic***",
        "<p><strong>bold</strong> <strong>bold</strong> <em>italic</em> <em><strong>bold+italic</strong></em></p>\n"
      ),
      (
        "blockquotes",
        "> #### Hello\n > \n > - Hola\n > - 안영 \n>> Goodbye\n",
        "<blockquote>\n<h4>Hello</h4>\n<ul>\n<li>Hola</li>\n<li>안영</li>\n</ul>\n<blockquote>\n<p>Goodbye</p>\n</blockquote>\n</blockquote>\n"
      ),
      (
        "lists (ordered, unordered)",
        "1. pen\n2. apple\n3. apple pen\n- pen\n- pineapple\n- pineapple pen",
        "<ol>\n<li>pen</li>\n<li>apple</li>\n<li>apple pen</li>\n</ol>\n<ul>\n<li>pen</li>\n<li>pineapple</li>\n<li>pineapple pen</li>\n</ul>\n"
      ),
      (
        "code and code blocks",
        "this is my amazing `code snippet` and my amazing ```code block```",
        "<p>this is my amazing <code>code snippet</code> and my amazing <code>code block</code></p>\n"
      ),
      // Links with added nofollow attribute
      (
        "links",
        "[Lemmy](https://join-lemmy.org/ \"Join Lemmy!\")",
        "<p><a href=\"https://join-lemmy.org/\" rel=\"nofollow\" title=\"Join Lemmy!\">Lemmy</a></p>\n"
      ),
      // Remote images with proxy
      (
        "images",
        "![My linked image](https://example.com/image.png \"image alt text\")",
        "<p><img src=\"https://example.com/image.png\" alt=\"My linked image\" title=\"image alt text\" /></p>\n"
      ),
      // Local images without proxy
      (
        "images",
        "![My linked image](https://lemmy-alpha/image.png \"image alt text\")",
        "<p><img src=\"https://lemmy-alpha/image.png\" alt=\"My linked image\" title=\"image alt text\" /></p>\n"
      ),
      // Ensure spoiler plugin is added
      (
        "basic spoiler",
        "::: spoiler click to see more\nhow spicy!\n:::\n",
        "<details><summary>click to see more</summary><p>how spicy!\n</p></details>\n"
      ),
      (
        "escape html special chars",
        "<script>alert('xss');</script> hello &\"",
        "<p>&lt;script&gt;alert(‘xss’);&lt;/script&gt; hello &amp;&quot;</p>\n"
      )
    ];

    tests.iter().for_each(|&(msg, input, expected)| {
      let result = markdown_to_html(input);

      assert_eq!(
        result, expected,
        "Testing {}, with original input '{}'",
        msg, input
      );
    });
  }

  /// Table-driven check of `markdown_rewrite_image_links`: remote image URLs must be
  /// rewritten to the proxy endpoint, local ones left alone, invalid ones removed.
  #[test]
  fn test_markdown_proxy_images() {
    let tests: Vec<_> =
      vec![
        (
          "remote image proxied",
          "![link](http://example.com/image.jpg)",
          "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)",
        ),
        (
          "local image unproxied",
          "![link](http://lemmy-alpha/image.jpg)",
          "![link](http://lemmy-alpha/image.jpg)",
        ),
        (
          "multiple image links",
          "![link](http://example.com/image1.jpg) ![link](http://example.com/image2.jpg)",
          "![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage1.jpg) ![link](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage2.jpg)",
        ),
        (
          "empty link handled",
          "![image]()",
          "![image]()"
        ),
        (
          "empty label handled",
          "![](http://example.com/image.jpg)",
          "![](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)"
        ),
        (
          "invalid image link removed",
          "![image](http-not-a-link)",
          "![image]()"
        ),
        (
          "label with nested markdown handled",
          "![a *b* c](http://example.com/image.jpg)",
          "![a *b* c](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2Fimage.jpg)"
        ),
        (
          "custom emoji support",
          r#"![party-blob](https://www.hexbear.net/pictrs/image/83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#,
          r#"![party-blob](https://lemmy-alpha/api/v3/image_proxy?url=https%3A%2F%2Fwww.hexbear.net%2Fpictrs%2Fimage%2F83405746-0620-4728-9358-5f51b040ffee.gif "emoji party-blob")"#
        ),
        (
          "image with special chars",
          "ითხოვს ![ითხოვს](http://example.com/ითხოვს%C3%A4.jpg)",
          "ითხოვს ![ითხოვს](https://lemmy-alpha/api/v3/image_proxy?url=http%3A%2F%2Fexample.com%2F%E1%83%98%E1%83%97%E1%83%AE%E1%83%9D%E1%83%95%E1%83%A1%25C3%25A4.jpg)",
        ),
      ];

    tests.iter().for_each(|&(msg, input, expected)| {
      let result = markdown_rewrite_image_links(input.to_string());

      assert_eq!(
        result.0, expected,
        "Testing {}, with original input '{}'",
        msg, input
      );
    });
  }

  // This replicates the logic when saving url blocklist patterns and querying them.
  // Refer to lemmy_api_crud::site::update::update_site and
  // lemmy_api_common::utils::get_url_blocklist().
  fn create_url_blocklist_test_regex_set(patterns: Vec<&str>) -> LemmyResult<RegexSet> {
    let url_blocklist = patterns.iter().map(|&s| s.to_string()).collect();
    let valid_urls = check_urls_are_valid(&url_blocklist)?;
    // Word boundaries around each escaped pattern so e.g. `rt.com` does not
    // match inside `deviantart.com`.
    let regexes = valid_urls.iter().map(|p| format!(r"\b{}\b", escape(p)));
    let set = RegexSet::new(regexes)?;
    Ok(set)
  }

  /// Exercises `markdown_check_for_blocked_urls` against various blocklist shapes:
  /// plain domains, specific paths, subdomains, and near-miss substrings.
  #[test]
  fn test_url_blocking() -> LemmyResult<()> {
    let set = create_url_blocklist_test_regex_set(vec!["example.com/"])?;

    assert!(
      markdown_check_for_blocked_urls(&String::from("[](https://example.com)"), &set).is_err()
    );

    assert!(markdown_check_for_blocked_urls(
      &String::from("Go to https://example.com to get free Robux"),
      &set
    )
    .is_err());

    assert!(
      markdown_check_for_blocked_urls(&String::from("[](https://example.blog)"), &set).is_ok()
    );

    assert!(markdown_check_for_blocked_urls(&String::from("example.com"), &set).is_err());

    assert!(markdown_check_for_blocked_urls(
      "Odio exercitationem culpa sed sunt
      et. Sit et similique tempora deserunt doloremque. Cupiditate iusto
      repellat et quis qui. Cum veritatis facere quasi repellendus sunt
      eveniet nemo sint. Cumque sit unde est. https://example.com Alias
      repellendus at quos.",
      &set
    )
    .is_err());

    let set = create_url_blocklist_test_regex_set(vec!["example.com/spam.jpg"])?;
    assert!(markdown_check_for_blocked_urls("![](https://example.com/spam.jpg)", &set).is_err());
    assert!(markdown_check_for_blocked_urls("![](https://example.com/spam.jpg1)", &set).is_ok());
    // TODO: the following should not be matched, scunthorpe problem.
    assert!(
      markdown_check_for_blocked_urls("![](https://example.com/spam.jpg.html)", &set).is_err()
    );

    let set = create_url_blocklist_test_regex_set(vec![
      r"quo.example.com/",
      r"foo.example.com/",
      r"bar.example.com/",
    ])?;

    assert!(markdown_check_for_blocked_urls("https://baz.example.com", &set).is_ok());

    assert!(markdown_check_for_blocked_urls("https://bar.example.com", &set).is_err());

    let set = create_url_blocklist_test_regex_set(vec!["example.com/banned_page"])?;

    assert!(markdown_check_for_blocked_urls("https://example.com/page", &set).is_ok());

    let set = create_url_blocklist_test_regex_set(vec!["ex.mple.com/"])?;

    assert!(markdown_check_for_blocked_urls("example.com", &set).is_ok());

    let set = create_url_blocklist_test_regex_set(vec!["rt.com/"])?;

    assert!(markdown_check_for_blocked_urls("deviantart.com", &set).is_ok());
    assert!(markdown_check_for_blocked_urls("art.com.example.com", &set).is_ok());
    assert!(markdown_check_for_blocked_urls("https://rt.com/abc", &set).is_err());
    assert!(markdown_check_for_blocked_urls("go to rt.com.", &set).is_err());
    assert!(markdown_check_for_blocked_urls("check out rt.computer", &set).is_ok());
    // TODO: the following should not be matched, scunthorpe problem.
    assert!(markdown_check_for_blocked_urls("rt.com.example.com", &set).is_err());

    Ok(())
  }

  /// Checks that `sanitize_html` escapes `&`, `<`, `"`, `'` but leaves `>` intact.
  #[test]
  fn test_sanitize_html() {
    let sanitized = sanitize_html("<script>alert('xss');</script> hello &\"'");
    let expected = "&lt;script>alert(&#x27;xss&#x27;);&lt;/script> hello &amp;&quot;&#x27;";
    assert_eq!(expected, sanitized);

    let sanitized =
      sanitize_html("Polling the group: what do y'all know about the Orion browser from Kagi?");
    let expected = "Polling the group: what do y&#x27;all know about the Orion browser from Kagi?";
    assert_eq!(expected, sanitized);
  }
}