kalosm_language/context/
rss.rs

1use rss::Channel;
2use url::Url;
3
4use super::document::{Document, IntoDocuments};
5
6/// An error that can occur when interacting with an RSS feed.
7#[derive(Debug, thiserror::Error)]
8pub enum RssFeedError {
9    /// An error occurred when fetching the RSS feed.
10    #[error("Failed to fetch RSS feed: {0}")]
11    FetchFeed(#[from] reqwest::Error),
12    /// An error parsing the RSS feed.
13    #[error("Failed to parse RSS feed: {0}")]
14    ParseFeed(#[from] rss::Error),
15}
16
17/// A RSS feed that can be used to add documents to a search index.
18///
19/// # Example
20/// ```rust, no_run
21/// use kalosm_language::prelude::*;
22///
23/// #[tokio::main]
24/// async fn main() {
25///     let feed = RssFeed::new(
26///         url::Url::parse("https://www.nytimes.com/services/xml/rss/nyt/HomePage.xml").unwrap(),
27///     );
28///     let documents = feed.read_top_n(5).await.unwrap();
29///     println!("Documents: {:?}", documents);
30/// }
31/// ```
32#[derive(Debug, Clone, PartialEq)]
33pub struct RssFeed(Url);
34
35impl From<Url> for RssFeed {
36    fn from(url: Url) -> Self {
37        Self::new(url)
38    }
39}
40
41impl IntoDocuments for RssFeed {
42    type Error = RssFeedError;
43
44    async fn into_documents(self) -> Result<Vec<Document>, Self::Error> {
45        self.read_top_n(usize::MAX).await
46    }
47}
48
49impl RssFeed {
50    /// Create a new RSS feed from the given URL.
51    pub fn new(url: Url) -> Self {
52        Self(url)
53    }
54
55    /// Get the URL of the RSS feed.
56    pub fn url(&self) -> &Url {
57        &self.0
58    }
59
60    /// Read the top N documents from the RSS feed.
61    pub async fn read_top_n(&self, top_n: usize) -> Result<Vec<Document>, RssFeedError> {
62        let xml = reqwest::get(self.0.clone()).await?.text().await?;
63        let channel = Channel::read_from(xml.as_bytes())?;
64        let mut documents = Vec::new();
65        for item in channel.items().iter().take(top_n) {
66            let mut message = String::new();
67            if let Some(title) = item.title() {
68                message.push_str(&format!("### {}\n", title));
69            }
70            let (source_url, content) = if let Some(content) = item.content() {
71                (None, content.to_string())
72            } else if let Some(source_url) = item.link() {
73                (
74                    Some(source_url),
75                    reqwest::get(source_url).await?.text().await?,
76                )
77            } else {
78                (None, String::new())
79            };
80
81            let url = match source_url {
82                Some(url) => Url::parse(url).unwrap(),
83                None => self.0.clone(),
84            };
85
86            if let Ok(article) =
87                readability::extractor::extract(&mut std::io::Cursor::new(&content), &url)
88            {
89                documents.push(Document::from_parts(article.title, article.text));
90            }
91        }
92        Ok(documents)
93    }
94}