use std::path::PathBuf;
use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc};
use color_eyre::eyre::eyre;
use html2md::parse_html;
use regex::Regex;
use roxmltree::Node;
use slug::slugify;
use tracing::error;
use crate::core::{feed::feedentry::FeedEntry, library::feeditem::FeedItem};
/// Downloads the feed document at `url` and turns it into a `FeedItem`.
///
/// # Errors
///
/// Fails when the HTTP request cannot be performed, when the server answers with a
/// non-success status code, when the body cannot be read as text, or when `parse`
/// rejects the document.
pub fn get_feed(url: &str) -> color_eyre::Result<FeedItem> {
    let response = reqwest::blocking::get(url)?;
    let status = response.status();
    if status.is_success() {
        let body = response.text()?;
        return parse(&body, url);
    }
    Err(eyre!(
        "Request to \"{}\" returned status code {:?}",
        url,
        status
    ))
}
/// Builds a `FeedItem` from the text of an RSS or Atom document.
///
/// * `doc` - the raw XML of the feed.
/// * `feed_url` - the address the document was fetched from; stored as `feed_url` and used
///   as the site URL when the document carries no `<link>`.
///
/// Fallbacks: a missing title becomes the empty string, a missing description or author
/// becomes the title. Lookups use the first matching descendant in document order, so a
/// feed without its own `<author>` picks up the first author found further down the tree.
///
/// # Errors
///
/// Returns an error when `doc` is not well-formed XML.
fn parse(doc: &str, feed_url: &str) -> color_eyre::Result<FeedItem> {
    let doc = roxmltree::Document::parse(doc)?;

    // `Document::root()` is the document node, whose tag name is always empty, so the
    // `rss` check has to be made against the root *element*.
    let root = doc.root_element();
    let feed_tag = if root.tag_name().name() == "rss" {
        // An `<rss>` document without a `<channel>` is malformed; keep searching from the
        // root element instead of panicking on it.
        root.descendants()
            .find(|t| t.tag_name().name() == "channel")
            .unwrap_or(root)
    } else {
        root
    };

    let mut feed = FeedItem::default();
    feed.title = feed_tag
        .descendants()
        .find(|t| t.tag_name().name() == "title")
        .and_then(|t| t.text())
        .map(str::trim)
        .unwrap_or_default()
        .to_string();
    feed.description = feed_tag
        .descendants()
        .find(|t| matches!(t.tag_name().name(), "description" | "subtitle"))
        .and_then(|t| t.text())
        .unwrap_or(&feed.title)
        .to_string();
    // RSS stores the link as element text, Atom as an `href` attribute.
    feed.url = feed_tag
        .descendants()
        .find(|t| t.tag_name().name() == "link")
        .and_then(|t| t.text().or_else(|| t.attribute("href")))
        .unwrap_or(feed_url)
        .to_string();
    feed.feed_url = feed_url.to_string();
    // Atom nests the author in `<author><name>`, RSS puts it directly in `<author>`.
    feed.author = feed_tag
        .descendants()
        .find(|t| t.tag_name().name() == "author")
        .and_then(|author_tag| {
            author_tag
                .descendants()
                .find(|t| t.tag_name().name() == "name")
                .and_then(|t| t.text())
                .or_else(|| author_tag.text())
        })
        .unwrap_or(&feed.title)
        .to_string();
    feed.slug = slugify(&feed.title);
    feed.lastupdated = Utc::now();
    Ok(feed)
}
/// Downloads the document behind `feed.feed_url` and extracts its entries.
///
/// Entries without an author of their own inherit `feed.author`.
///
/// # Errors
///
/// Fails when the HTTP request cannot be performed, when the server answers with a
/// non-success status code, when the body cannot be read as text, or when the document
/// cannot be parsed by `get_feed_entries_doc`.
pub fn get_feed_entries(feed: &FeedItem) -> color_eyre::Result<Vec<FeedEntry>> {
    let source_url = &feed.feed_url;
    let response = reqwest::blocking::get(source_url)?;
    let status = response.status();
    if status.is_success() {
        let body = response.text()?;
        return get_feed_entries_doc(&body, &feed.author);
    }
    Err(eyre!(
        "Request to \"{}\" returned status code {:?}",
        source_url,
        status
    ))
}
pub fn get_feed_entries_doc(
doctxt: &str,
defaultauthor: &str,
) -> color_eyre::Result<Vec<FeedEntry>> {
let doc = roxmltree::Document::parse(doctxt)?;
let mut feed_tag = doc.root();
if feed_tag.tag_name().name() == "rss" {
feed_tag = feed_tag
.descendants()
.find(|t| t.tag_name().name() == "channel")
.unwrap();
}
let mut feedentries = Vec::<FeedEntry>::new();
for entry in feed_tag
.descendants()
.filter(|t| t.tag_name().name() == "item" || t.tag_name().name() == "entry")
{
let (desc, content) = get_description_content(&entry);
let datestr = entry
.descendants()
.find(|t| {
t.tag_name().name() == "published"
|| t.tag_name().name() == "updated"
|| t.tag_name().name() == "date"
|| t.tag_name().name() == "pubDate"
})
.and_then(|t| t.text())
.unwrap_or("1990-09-19")
.to_string();
let entryauthor: String = if let Some(author_tag) = entry
.descendants()
.find(|t| t.tag_name().name() == "author" || t.tag_name().name() == "creator")
{
if let Some(nametag) = author_tag
.descendants()
.find(|t| t.tag_name().name() == "name")
.and_then(|t| t.text())
{
String::from(nametag)
} else if let Some(text) = author_tag.text() {
String::from(text)
} else {
defaultauthor.to_string()
}
} else {
defaultauthor.to_string()
};
let entryurl = entry
.descendants()
.find(|t| t.tag_name().name() == "id" || t.tag_name().name() == "link")
.and_then(|t| {
if t.text().is_none() {
t.attribute("href")
} else {
t.text()
}
})
.unwrap_or("NOURL")
.to_string();
let fe = FeedEntry {
title: entry
.descendants()
.find(|t| t.tag_name().name() == "title")
.and_then(|t| t.text())
.unwrap_or("NOTITLE")
.to_string(),
author: entryauthor,
url: entryurl,
text: content,
date: parse_date(&datestr)
.map_err(|err| error!("{:?}", err))
.unwrap_or_default(),
description: desc,
lastupdated: Utc::now(),
seen: false,
filepath: PathBuf::default(),
};
feedentries.push(fe);
}
Ok(feedentries)
}
/// Parses a feed timestamp into UTC.
///
/// The formats are tried in this order: RFC 3339, RFC 2822, `%Y-%m-%d %H:%M:%S` and
/// `%Y-%m-%d`. The two formats without an offset are interpreted as UTC, and a bare date
/// is placed at midnight.
///
/// # Errors
///
/// Returns an error when `date_str` matches none of the formats.
fn parse_date(date_str: &str) -> color_eyre::Result<DateTime<Utc>> {
    const NAIVE_DATETIME_FORMAT: &str = "%Y-%m-%d %H:%M:%S";
    const NAIVE_DATE_FORMAT: &str = "%Y-%m-%d";

    // Formats that carry their own offset.
    let with_offset = DateTime::parse_from_rfc3339(date_str)
        .or_else(|_| DateTime::parse_from_rfc2822(date_str))
        .map(|dt| dt.with_timezone(&Utc))
        .ok();

    // Formats without an offset; only evaluated when the ones above did not match.
    let without_offset = || {
        NaiveDateTime::parse_from_str(date_str, NAIVE_DATETIME_FORMAT)
            .ok()
            .or_else(|| {
                NaiveDate::parse_from_str(date_str, NAIVE_DATE_FORMAT)
                    .ok()
                    .and_then(|day| day.and_hms_opt(0, 0, 0))
            })
            .map(|naive| DateTime::<Utc>::from_naive_utc_and_offset(naive, Utc))
    };

    with_offset
        .or_else(without_offset)
        .ok_or_else(|| eyre!("Couldn't parse date: {:?}", date_str))
}
/// Returns `(description, content)` for a feed entry, both derived from its HTML.
///
/// * content - the Markdown produced by `parse_html` from `<content>` / `<encoded>`,
///   falling back to `<description>` / `<summary>`, or the empty string when neither exists.
/// * description - a plain-text teaser of at most 280 characters, taken from
///   `<description>` / `<summary>` and falling back to the content.
///
/// Line breaks are removed from the raw HTML before conversion and from the Markdown
/// before the teaser is built.
fn get_description_content(entry: &Node) -> (String, String) {
    // Markdown of the first descendant named `first` or `second`, if it has text.
    let markdown_of = |first: &str, second: &str| {
        entry
            .descendants()
            .find(|t| {
                let name = t.tag_name().name();
                name == first || name == second
            })
            .and_then(|t| t.text())
            .map(|html| parse_html(&html.replace(['\n', '\r'], "")))
    };
    let content_md = markdown_of("content", "encoded");
    let description_md = markdown_of("description", "summary");

    let teaser_source = description_md
        .as_deref()
        .or(content_md.as_deref())
        .unwrap_or("");
    // Strip the Markdown *before* truncating: cutting first can split a construct such as
    // `[text](url)` in half, which then no longer matches and leaks into the teaser.
    let description_text: String = strip_markdown_tags(&teaser_source.replace('\n', ""))
        .chars()
        .take(280)
        .collect();

    let content_text = content_md.or(description_md).unwrap_or_default();
    (description_text, content_text)
}
fn strip_markdown_tags(input: &str) -> String {
let patterns = [
r"\*\*(.*?)\*\*", r"\*(.*?)\*", r"`(.*?)`", r"~~(.*?)~~", r"#+\s*", r"!\[(.*?)\]\(.*?\)", r"\[(.*?)\]\(.*?\)", r">+\s*", r"[-*_=]{3,}", r"`{3}.*?`{3}", ];
let mut result = input.to_string();
for pat in patterns.iter() {
let re = Regex::new(pat).unwrap();
result = re.replace_all(&result, "$1").to_string();
}
result
}
#[cfg(test)]
mod tests {
use chrono::TimeZone;
use super::*;
/// Bold, italic, inline code, strikethrough, link, image, heading, blockquote and rule
/// markers are all removed, leaving only the visible text.
#[test]
fn test_strip_markdown_tags() {
    // The image markup must be part of the input: the expected text contains its alt text.
    let input = "**bold** *italic* `code` ~~strike~~ [link](url) ![image](url) # heading > blockquote\n---\n";
    let expected = "bold italic code strike link image heading blockquote\n\n";
    assert_eq!(strip_markdown_tags(input), expected);
}
/// `parse_date` accepts RFC 3339, RFC 2822, naive date-time and naive date strings,
/// normalises them to UTC, and rejects anything else.
#[test]
fn test_parse_date_various_formats() {
    let noon_new_year = DateTime::parse_from_rfc3339("2024-01-01T12:00:00+00:00")
        .unwrap()
        .with_timezone(&Utc);
    // Each input is paired with the instant it must yield, or `None` when it must fail.
    let cases = [
        ("2024-01-01T12:00:00Z", Some(noon_new_year)),
        ("2024-01-01T13:00:00+01:00", Some(noon_new_year)),
        (
            "2024-02-29 09:00:00",
            Some(Utc.with_ymd_and_hms(2024, 2, 29, 9, 0, 0).unwrap()),
        ),
        (
            "2023-11-20",
            Some(Utc.with_ymd_and_hms(2023, 11, 20, 0, 0, 0).unwrap()),
        ),
        (
            "Mon, 01 Jan 2024 12:00:00 +0000",
            Some(Utc.with_ymd_and_hms(2024, 1, 1, 12, 0, 0).unwrap()),
        ),
        ("Invalid Date String", None),
    ];
    for (input, expected) in cases {
        match (parse_date(input), expected) {
            (Ok(dt), Some(exp)) => assert_eq!(dt, exp, "Failed on input: {input}"),
            (Err(e), Some(_)) => panic!("Expected Ok for input: {input} - Error: {e}"),
            (result, None) => assert!(result.is_err(), "Expected error for input: {input}"),
        }
    }
}
/// `parse` reads title, description, link and author from an RSS 2.0 channel.
#[test]
fn parses_rss2_channel_fields() {
    let rss_doc = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Example RSS</title>
<link>https://example.com/</link>
<description>RSS description</description>
<author>Alice</author>
<item>
<title>Item 1</title>
<link>https://example.com/item1</link>
<description>Item 1 description</description>
<author>alice@example.com (Alice)</author>
</item>
</channel>
</rss>"#;

    let parsed = parse(rss_doc, "NOURL").expect("failed to parse RSS 2.0");

    assert_eq!(parsed.title, "Example RSS");
    assert_eq!(parsed.description, "RSS description");
    assert_eq!(parsed.url, "https://example.com/");
    assert!(parsed.author.contains("Alice"));
}
/// `parse` reads title, subtitle, `href` link and nested author name from an Atom feed.
#[test]
fn parses_atom_feed_fields() {
    let atom_doc = r#"<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Atom</title>
<subtitle>Atom description</subtitle>
<link href="https://example.org/"/>
<author>
<name>Bob</name>
</author>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
<updated>2003-12-13T18:30:02Z</updated>
</feed>"#;

    let parsed = parse(atom_doc, "NOURL").expect("failed to parse Atom");

    assert_eq!(parsed.title, "Example Atom");
    assert_eq!(parsed.description, "Atom description");
    assert_eq!(parsed.url, "https://example.org/");
    assert_eq!(parsed.author, "Bob");
}
/// Without a `<link>` in the document, `parse` falls back to the URL it was given.
#[test]
fn rss_missing_link_uses_default_url() {
    let rss_doc = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>No Link RSS</title>
<description>No link here</description>
<author>Carol</author>
</channel>
</rss>"#;

    let parsed = parse(rss_doc, "NOURL").expect("failed to parse RSS without link");

    assert_eq!(parsed.title, "No Link RSS");
    assert_eq!(parsed.description, "No link here");
    assert_eq!(parsed.url, "NOURL");
    assert!(parsed.author.contains("Carol"));
}
/// Without any `<author>` in the document, `parse` uses the feed title as the author.
#[test]
fn rss_missing_author_uses_feed_title() {
    let rss_doc = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>No Author RSS</title>
<description>No author here</description>
</channel>
</rss>"#;

    let parsed = parse(rss_doc, "NOURL").expect("failed to parse RSS without author");

    assert_eq!(parsed.title, "No Author RSS");
    assert_eq!(parsed.description, "No author here");
    assert_eq!(parsed.author, "No Author RSS");
}
/// RSS items are read whether they use `<link>` or `<id>`, `<pubDate>` or `<dc:date>`,
/// and with or without `<content:encoded>`.
#[test]
fn get_feed_entries_doc_parses_rss_items_variants() {
    let rss_doc = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<title>Example RSS</title>
<link>https://example.com/</link>
<description>RSS description</description>
<author>Carol</author>
<item>
<title>Item A</title>
<link>https://example.com/a</link>
<description>Item A description</description>
<pubDate>Mon, 01 Jan 2024 12:00:00 +0000</pubDate>
<content:encoded>Item A content</content:encoded>
</item>
<item>
<title>Item B</title>
<id>https://example.com/b</id>
<dc:date>2024-03-10T09:30:00Z</dc:date>
<description>Item B description</description>
</item>
</channel>
</rss>"#;

    let items = get_feed_entries_doc(rss_doc, "Carol").expect("failed to parse RSS entries");
    assert_eq!(items.len(), 2);

    // Item A: link, pubDate and dedicated content.
    let first = &items[0];
    let first_date = parse_date("Mon, 01 Jan 2024 12:00:00 +0000").unwrap();
    assert_eq!(first.title, "Item A");
    assert_eq!(first.url, "https://example.com/a");
    assert_eq!(first.author, "Carol");
    assert_eq!(first.text, "Item A content");
    assert_eq!(first.description, "Item A description");
    assert_eq!(first.date, first_date);

    // Item B: id, dc:date and the description doubling as content.
    let second = &items[1];
    let second_date = DateTime::parse_from_rfc3339("2024-03-10T09:30:00Z")
        .unwrap()
        .with_timezone(&Utc);
    assert_eq!(second.title, "Item B");
    assert_eq!(second.url, "https://example.com/b");
    assert_eq!(second.author, "Carol");
    assert_eq!(second.text, "Item B description");
    assert_eq!(second.description, "Item B description");
    assert_eq!(second.date, second_date);
}
/// Atom entries are read with or without a summary, with `<published>` or `<updated>`,
/// with an `<id>` or an `href` link, and with or without their own author.
#[test]
fn get_feed_entries_doc_parses_atom_entries_variants() {
    let atom_doc = r#"<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Atom</title>
<link href="https://example.org/"/>
<author>
<name>Bob</name>
</author>
<id>urn:uuid:feedid</id>
<updated>2024-01-01T00:00:00Z</updated>
<entry>
<title>Entry 1</title>
<id>https://example.org/e1</id>
<summary>Summary 1</summary>
<content>Entry 1 content</content>
<published>2024-02-01T10:00:00Z</published>
</entry>
<entry>
<title>Entry 2</title>
<id>https://example.org/e2</id>
<content>Entry 2 content</content>
<updated>2024-02-05T11:30:00Z</updated>
<author>
<name>Alice</name>
</author>
</entry>
<entry>
<title>Entry 3</title>
<link rel="alternate" href="https://example.org/e3" type="text/html"/>
<id>https://example.org/e3</id>
<content>Entry 3 content</content>
<updated>2024-02-05T11:30:00Z</updated>
<author>
<name>Alice</name>
</author>
</entry>
</feed>"#;
    // Shorthand for the expected timestamps.
    let utc = |stamp: &str| {
        DateTime::parse_from_rfc3339(stamp)
            .unwrap()
            .with_timezone(&Utc)
    };

    let parsed = get_feed_entries_doc(atom_doc, "Bob").expect("failed to parse Atom entries");
    assert_eq!(parsed.len(), 3);

    // Entry 1: has a summary and no author of its own.
    let first = &parsed[0];
    assert_eq!(first.title, "Entry 1");
    assert_eq!(first.url, "https://example.org/e1");
    assert_eq!(first.author, "Bob");
    assert_eq!(first.text, "Entry 1 content");
    assert_eq!(first.description, "Summary 1");
    assert_eq!(first.date, utc("2024-02-01T10:00:00Z"));

    // Entry 2: no summary, own author.
    let second = &parsed[1];
    assert_eq!(second.title, "Entry 2");
    assert_eq!(second.url, "https://example.org/e2");
    assert_eq!(second.author, "Alice");
    assert_eq!(second.text, "Entry 2 content");
    assert_eq!(second.description, "Entry 2 content");
    assert_eq!(second.date, utc("2024-02-05T11:30:00Z"));

    // Entry 3: the address comes from a link's `href` attribute.
    let third = &parsed[2];
    assert_eq!(third.title, "Entry 3");
    assert_eq!(third.url, "https://example.org/e3");
    assert_eq!(third.author, "Alice");
    assert_eq!(third.text, "Entry 3 content");
    assert_eq!(third.description, "Entry 3 content");
    assert_eq!(third.date, utc("2024-02-05T11:30:00Z"));
}
/// An Atom entry's own `<author>` wins over the default author; entries without one
/// fall back to the default.
#[test]
fn get_feed_entries_doc_parses_atom_entry_level_author_overrides_feed() {
    let atom_doc = r#"<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Example Atom</title>
<link href="https://example.org/"/>
<author>
<name>Feed Author</name>
</author>
<id>urn:uuid:feedid</id>
<updated>2024-01-01T00:00:00Z</updated>
<entry>
<title>Entry Has Own Author</title>
<id>https://example.org/own</id>
<author>
<name>Alice</name>
</author>
<content>Own author content</content>
<published>2024-02-01T10:00:00Z</published>
</entry>
<entry>
<title>Entry Falls Back To Feed Author</title>
<id>https://example.org/fallback</id>
<content>No entry author here</content>
<updated>2024-02-05T11:30:00Z</updated>
</entry>
</feed>"#;

    let parsed = get_feed_entries_doc(atom_doc, "Feed Author")
        .expect("failed to parse Atom entries with entry-level authors");
    assert_eq!(parsed.len(), 2);

    let with_author = &parsed[0];
    assert_eq!(with_author.title, "Entry Has Own Author");
    assert_eq!(with_author.url, "https://example.org/own");
    assert_eq!(with_author.author, "Alice");

    let without_author = &parsed[1];
    assert_eq!(without_author.title, "Entry Falls Back To Feed Author");
    assert_eq!(without_author.url, "https://example.org/fallback");
    assert_eq!(without_author.author, "Feed Author");
}
/// An RSS item's `<author>` or `<dc:creator>` wins over the default author.
#[test]
fn get_feed_entries_doc_parses_rss_item_level_author_overrides_channel() {
    let rss_doc = r#"<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:dc="http://purl.org/dc/elements/1.1/">
<channel>
<title>Example RSS</title>
<link>https://example.com/</link>
<description>RSS description</description>
<author>Channel Author</author>
<item>
<title>Item With Author</title>
<link>https://example.com/with-author</link>
<description>Has its own author</description>
<author>Alice</author>
<pubDate>Mon, 01 Jan 2024 12:00:00 +0000</pubDate>
</item>
<item>
<title>Item With DC Creator</title>
<link>https://example.com/with-dc-creator</link>
<description>Has dc:creator</description>
<dc:creator>Dave</dc:creator>
<dc:date>2024-02-01T10:00:00Z</dc:date>
</item>
</channel>
</rss>"#;

    let parsed = get_feed_entries_doc(rss_doc, "Channel Author")
        .expect("failed to parse RSS entries with entry-level authors");
    assert_eq!(parsed.len(), 2);

    let plain_author = &parsed[0];
    assert_eq!(plain_author.title, "Item With Author");
    assert_eq!(plain_author.url, "https://example.com/with-author");
    assert_eq!(plain_author.author, "Alice");

    let dc_creator = &parsed[1];
    assert_eq!(dc_creator.title, "Item With DC Creator");
    assert_eq!(dc_creator.url, "https://example.com/with-dc-creator");
    assert_eq!(dc_creator.author, "Dave");
}
}