news-flash 3.0.1

Base library for a modern feed reader
Documentation
use super::*;
use test_log::test;

/// A hardwareluxx.de article opens with an `<img>` tag; the plain-text summary
/// must start directly with the first sentence of the paragraph.
#[test]
pub fn hardwareluxx() {
    let article = "<p><img src=\"https://www.hardwareluxx.de/images/stories/2017/stadia.jpg\" alt=\"stadia\">Am vergangenen Dienstag präsentierte Google im Rahmen der Game Developers Conference in San Francisco seinen neuen  <a href=\"https://www.hardwareluxx.de/index.php/news/software/spiele/48994-googles-cloud-gaming-plattform-stadia-geht-an-den-start.html\" rel=\"noopener noreferrer\" target=\"_blank\" referrerpolicy=\"no-referrer\">Spiele-Streaming-Dienst Stadia</a> , der noch im Sommer dieses Jahres an den Start gehen soll. Auch einen eigenen  <a href=\"https://www.hardwareluxx.de/index.php/news/hardware/eingabegeraete/49001-googles-stadia-controller-mit-zwei-sondertasten-zum-heimlichen-star.html\" rel=\"noopener noreferrer\" target=\"_blank\" referrerpolicy=\"no-referrer\">Controller mit vielen interessanten Features</a>  hatte der Konzern den anwesenden Journalisten gezeigt.</p><p>Die Vorteile von Stadia liegen klar auf der Hand: Die Hardware im Rechenzentrum ist dank skalierbarer Infrastruktur schneller als jede Heimkonsole und jeder Spiele-PC zu Hause und erlaubt damit theoretisch die höchste Bildqualität. Hinzu kommt, dass langwierige Downloads und Installations-Prozesse entfallen und teure Hardware für die Nutzung des Dienstes nicht benötigt wird. Ein leistungsschwaches Notebook oder gar ein herkömmliches Smartphone sollen laut Google genügen.</p>";
    let needle = "Am vergangenen Dienstag";

    let summary = html2text(article);

    // `starts_with` instead of slicing `summary[..needle.len()]`: a summary that is
    // too short (or cut inside a multi-byte character) then fails this assertion
    // and shows the actual output, rather than panicking with a slice-index error.
    assert!(
        summary.starts_with(needle),
        "summary should start with {needle:?}, got {summary:?}"
    );
}

/// A golem.de teaser ends with two inline links and a tracking pixel; the link
/// texts must stay inline and the summary must start with the teaser sentence.
#[test]
pub fn golem() {
    let article = "Was früher eine Auszeichnung war, ist vielen Twitter-Nutzern nun eher peinlich. (<a href=\"https://www.golem.de/specials/twitter/\">Twitter</a>, <a href=\"https://www.golem.de/specials/api/\">API</a>) <img src=\"https://cpx.golem.de/cpx.php?class=17&amp;aid=176395&amp;page=1&amp;ts=1690993502\" alt=\"\" width=\"1\" height=\"1\" />";
    let needle = "Was früher eine Auszeichnung war, ist vielen Twitter-Nutzern nun eher peinlich. (Twitter, API)";

    let summary = html2text(article);

    println!("{summary}");

    // `starts_with` instead of slicing `summary[..needle.len()]`: the needle contains
    // multi-byte characters, so slicing a differing summary could panic on a char
    // boundary or on length before any useful comparison is reported.
    assert!(
        summary.starts_with(needle),
        "summary should start with {needle:?}, got {summary:?}"
    );
}

/// A War and Peas post made of several `<p>` paragraphs; the summary must start
/// with the first two paragraphs separated by a blank line (`\r\n\r\n`).
#[test]
pub fn warandpeas() {
    let article = "<p>In an alternate universe, we’re stuck doing another job, doodling on spreadsheets instead of releasing comics. It’s truly the saddest universe we could think of and we’re determined to build a machine that gets in contact with these sad slobs to let them know their true calling.</p><p>Help us to save THIS reality by joining our Patreon! We’ll keep making comics while you’ll get loads of benefits, and a virtual high-five for saving us all!</p><p><a href=\"https://www.patreon.com/warandpeas\" target=\"_blank\" rel=\"nofollow noopener\" translate=\"no\"><span class=\"invisible\">https://www.</span><span class=\"\">patreon.com/warandpeas</span><span class=\"invisible\"></span></a></p>";
    let needle = "In an alternate universe, we’re stuck doing another job, doodling on spreadsheets instead of releasing comics. It’s truly the saddest universe we could think of and we’re determined to build a machine that gets in contact with these sad slobs to let them know their true calling.\r\n\r\nHelp us to save THIS reality by joining our Patreon! We’ll keep making comics while you’ll get loads of benefits, and a virtual high-five for saving us all!";

    let summary = html2text(article);

    println!("{summary}");

    // `starts_with` instead of slicing `summary[..needle.len()]`: the needle contains
    // multi-byte characters, so slicing a differing summary could panic on a char
    // boundary or on length before any useful comparison is reported.
    assert!(
        summary.starts_with(needle),
        "summary should start with {needle:?}, got {summary:?}"
    );
}

/// Generates `#[test]` functions asserting that `html2text(input)` equals an
/// expected string.
///
/// Two invocation forms are accepted:
///
/// * `test!(name, "input", "expected")` defines a single test function `name`;
///   a trailing comma is allowed.
/// * `test! { name: "input" to "expected", ... }` defines one test per row; every
///   row, including the last one, must be terminated by a comma.
macro_rules! test {
    // Single case: emit one test function named after the case.
    ($case:ident, $html:literal, $expected:literal $(,)?) => {
        #[test]
        fn $case() {
            assert_eq!(html2text($html), $expected);
        }
    };
    // Table of cases: forward every row to the single-case arm above.
    ($($case:ident: $html:literal to $expected:literal,)*) => {
        $(
            test!($case, $html, $expected);
        )*
    };
}

// Table of conversion cases: each `name: input to expected` row expands to a
// `#[test]` function `name` asserting `html2text(input) == expected`.
// Only `//` comments may be used in here; `///` would become tokens of the
// macro input and break matching.
test! {
    plaintext: "blah" to "blah",
    emoji: "<div>abc 😇 def</div>" to "abc def",
    tag: "<div></div>" to "",
    tag_contents: "<div>simple text</div>" to "simple text",
    // Links
    link: "click <a href=\"test\">here</a>" to "click here",
    link_ignore_content_tags: "click <a href=\"test\"><span>here</span> or here</a>" to "click here or here",
    link_whitespace: "(<a href=\"https://www.golem.de/specials/twitter/\">Twitter</a>, <a href=\"https://www.golem.de/specials/api/\">API</a>)" to "(Twitter, API)",
    // Inline elements
    ignore_inline: "strong <strong>text</strong>" to "strong text",
    ignore_inline_attributes: "some <div id=\"a\" class=\"b\">div</div>" to "some div",
    // Line breaks and spaces
    // The input needs a run of consecutive spaces, otherwise this case checks
    // nothing beyond `plaintext`.
    collapse_spaces: "should    ignore more spaces" to "should ignore more spaces",
    collapse_linebreaks: "a\nb\nc" to "a b c",
    collapse_mixed: "should \nignore \r\nnew lines" to "should ignore new lines",
    br_tag: "two<br>line<br/>breaks" to "two\r\nline\r\nbreaks",
    paragraph: "<p>two</p><p>paragraphs</p>" to "two\r\n\r\nparagraphs",
    // Headers
    h1: "<h1>First</h1>main text" to "First\r\n\r\nmain text",
    h2_inline: "First<h2>Second</h2>next section" to "First\r\n\r\nSecond\r\n\r\nnext section",
    h2: "<h2>Second</h2>next section" to "Second\r\n\r\nnext section",
    h3_inline: "Second<h3>Third</h3>next section" to "Second\r\n\r\nThird\r\n\r\nnext section",
    h3: "<h3>Third</h3>next section" to "Third\r\n\r\nnext section",
    h4_inline: "Third<h4>Fourth</h4>next section" to "Third\r\n\r\nFourth\r\n\r\nnext section",
    h4: "<h4>Fourth</h4>next section" to "Fourth\r\n\r\nnext section",
    h5_inline: "Fourth<h5>Fifth</h5>next section" to "Fourth\r\n\r\nFifth\r\n\r\nnext section",
    h5: "<h5>Fifth</h5>next section" to "Fifth\r\n\r\nnext section",
    h6_inline: "Fifth<h6>Sixth</h6>next section" to "Fifth\r\n\r\nSixth\r\n\r\nnext section",
    h6: "<h6>Sixth</h6>next section" to "Sixth\r\n\r\nnext section",
    no_h7: "<h7>Not Header</h7>next section" to "Not Headernext section",
    // HTML entities
    entity_nbsp: "two&nbsp;&nbsp;spaces" to "two\u{a0}\u{a0}spaces",
    entity_copy: "&copy; 2017 K3A" to "© 2017 K3A",
    entity_tag: "&lt;printtag&gt;" to "<printtag>",
    entity_currencies: "would you pay in &cent;, &pound;, &yen; or &euro;?" to "would you pay in ¢, £, ¥ or €?",
    ampersand_not_entity: "Tom & Jerry is not an entity" to "Tom & Jerry is not an entity",
    entity_unknown: "this &neither; as you see" to "this &neither; as you see",
    entity_amp: "fish &amp; chips" to "fish & chips",
    entity_quot: "&quot;I'm sorry, Dave. I'm afraid I can't do that.&quot; – HAL, 2001: A Space Odyssey" to "\"I'm sorry, Dave. I'm afraid I can't do that.\" – HAL, 2001: A Space Odyssey",
    entity_reg: "Google &reg;" to "Google ®",
    invalid_html_entity_without_semicolon: "&hellip" to "…",
    // Large entity
    entity_large_unknown: "&abcdefghij;" to "&abcdefghij;",
    // Numeric HTML entities
    entity_numeric: "&#8268; decimal and hex entities supported &#x204D;" to "⁌ decimal and hex entities supported ⁍",
    entity_numeric_2: "&#39;single quotes&#39; and &#52765;" to "'single quotes' and 츝",
    // Unordered list
    unordered_list: "list of items<ul><li>One</li><li>Two</li><li>Three</li></ul>" to "list of items\r\nOne\r\nTwo\r\nThree\r\n",
    // Full HTML structure
    empty: "" to "",
    full_html: "<html><head><title>Good</title></head><body>x</body>" to "x",
    ignore_script: "we are not <script type=\"javascript\"></script>interested in scripts" to "we are not interested in scripts",
    // Custom HTML tags
    ignore_unknown_tag: "<aa>hello</aa>" to "hello",
    ignore_unknown_tag_whitespace: "<aa >hello</aa>" to "hello",
    ignore_unknown_tag_attributes: "<aa x=\"1\">hello</aa>" to "hello",
}