Skip to main content

news_flash/feed_api_implementations/local/
mod.rs

1mod cache;
2mod download_result;
3pub mod metadata;
4
5use self::metadata::LocalMetadata;
6use crate::error::FeedParserError;
7use crate::feed_api::FeedHeaderMap;
8use crate::models::{
9    self, ArticleID, Category, CategoryID, FatArticle, FavIcon, Feed, FeedID, FeedUpdateResult, LoginData, Marked, PluginCapabilities, Read,
10    SyncResult, TagID, Url,
11};
12use crate::util;
13use crate::util::favicons::EXPIRES_AFTER_DAYS;
14use crate::util::relative_url_evaluater::RelativeUrlEvaluater;
15use crate::util::text2html::Text2Html;
16use crate::{
17    feed_api::{FeedApi, FeedApiError, FeedApiResult, Portal},
18    models::Enclosure,
19};
20use ammonia::UrlRelative;
21use async_trait::async_trait;
22use cache::LocalRSSCache;
23use chrono::{DateTime, TimeDelta, Utc};
24use download_result::{DownloadFailure, DownloadSuccess};
25use feed_rs::model::{Entry, Link, MediaContent, MediaObject, Text};
26use feed_rs::parser::{Builder as ParserBuilder, ParseFeedError};
27use once_cell::sync::Lazy;
28use regex::{Regex, RegexBuilder};
29use reqwest::Client;
30use reqwest::header::{HeaderMap, HeaderValue};
31use std::collections::{HashMap, HashSet};
32use std::str;
33use std::sync::Arc;
34use std::time::Duration;
35use tokio::sync::RwLock;
36
/// Extra per-request delay applied when several feeds share the same host,
/// to avoid hammering a single server during a sync.
const SAME_HOST_DELAY_MS: u64 = 1500;
/// Base stagger applied between any two feed downloads.
const GENERAL_DELAY_MS: u64 = 100;

/// Hosts exempt from the same-host delay (large platforms expected to cope
/// with parallel requests).
static DELAY_HOST_BLACKLIST: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    let mut set = HashSet::new();
    set.insert("youtube.com");
    set.insert("medium.com");
    set.insert("podcasts.apple.com");
    set
});

/// Tags whose entire content is removed during HTML sanitization.
static TAG_BLACKLIST: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    let mut tag_blacklist = HashSet::new();
    tag_blacklist.insert("script");
    tag_blacklist.insert("style");
    tag_blacklist
});

/// Heuristic for HTML entities (named, decimal or hex) in content that claims
/// to be plain text.
/// NOTE(review): the pattern does not require the trailing ';' — confirm this
/// is intentional (it matches e.g. "&amp" without the semicolon).
static HTML_ENTITY_REGEX: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r#"&(?:[a-z\d]+|#\d+|#x[a-f\d]+)"#)
        .case_insensitive(true)
        .build()
        .expect("HTML_ENTITY_REGEX regex")
});
61
62impl From<ParseFeedError> for FeedApiError {
63    fn from(error: ParseFeedError) -> FeedApiError {
64        match error {
65            ParseFeedError::ParseError(e) => FeedApiError::Api {
66                message: format!("Error parsing feed: {e}"),
67            },
68            ParseFeedError::IoError(e) => FeedApiError::IO(e),
69            ParseFeedError::JsonSerde(e) => FeedApiError::Json {
70                source: e,
71                json: "Unavailable".into(),
72            },
73            ParseFeedError::JsonUnsupportedVersion(e) => FeedApiError::Api {
74                message: format!("Unsupported Json feed: {e}"),
75            },
76            ParseFeedError::XmlReader(e) => FeedApiError::Api {
77                message: format!("Error parsing xml: {e}"),
78            },
79        }
80    }
81}
82
/// Feed backend for locally managed RSS/Atom/JSON feeds: there is no remote
/// account, feeds are downloaded directly from their urls.
pub struct LocalRSS {
    // Access to locally stored feeds/articles/categories and the shared
    // download semaphore.
    portal: Arc<Box<dyn Portal>>,
    // Persisted per-feed ETag and cache-control state used to skip
    // unnecessary downloads.
    cache: Arc<RwLock<LocalRSSCache>>,
}
87
88impl LocalRSS {
89    fn select_article_url(links: &[Link]) -> Option<Url> {
90        let mut url = links
91            .iter()
92            .find(|l| l.rel.as_deref() == Some("alternate"))
93            .and_then(|l| Url::parse(&l.href).ok());
94
95        if url.is_none() {
96            for link in links {
97                if let Ok(parsed_url) = Url::parse(&link.href) {
98                    url = Some(parsed_url);
99                    break;
100                }
101            }
102        }
103
104        url
105    }
106
107    fn sanitize_text(text: &mut Text, base_url: Option<&Url>) {
108        if text.content_type.as_str() != "text/plain" || ammonia::is_html(&text.content) || HTML_ENTITY_REGEX.is_match(&text.content) {
109            text.content = Self::sanitize_string(&text.content, base_url);
110        }
111    }
112
113    fn sanitize_string(text: &str, base_url: Option<&Url>) -> String {
114        let mut sanitizer = ammonia::Builder::default();
115        sanitizer.clean_content_tags(TAG_BLACKLIST.clone());
116        sanitizer.add_tags(&["video", "audio", "source", "iframe"]);
117        sanitizer.add_tag_attributes("source", &["src", "type"]);
118        sanitizer.add_tag_attributes("iframe", &["src"]);
119        sanitizer.add_generic_attributes(&["id"]);
120
121        if let Some(base_url) = base_url.cloned() {
122            let eval = RelativeUrlEvaluater::new(base_url);
123            sanitizer.url_relative(UrlRelative::Custom(Box::new(eval)));
124        }
125        sanitizer.clean(text).to_string()
126    }
127
128    fn convert_entry(entry: Entry, feed_id: &FeedID, website: Option<&Url>, portal: Arc<Box<dyn Portal>>) -> Option<(FatArticle, Vec<Enclosure>)> {
129        let Entry {
130            id,
131            updated,
132            mut title,
133            authors,
134            content,
135            links,
136            summary: mut entry_summary,
137            categories: _,
138            contributors: _,
139            published,
140            source: _,
141            rights: _,
142            media,
143            language: _,
144            base: _,
145        } = entry;
146
147        let mut article_id = ArticleID::new(&id);
148        let mut local_article = portal.get_article(&article_id).ok();
149
150        if local_article.as_ref().map(|a| &a.feed_id != feed_id).unwrap_or(false) {
151            // same article but published by a different feed (e.g. articles republished by planet.gnome.org)
152            tracing::debug!(%article_id, "article already published by another feed");
153            article_id = ArticleID::from_owned(format!("{id}:{feed_id}"));
154            local_article = portal.get_article(&article_id).ok();
155        }
156
157        let article_url = Self::select_article_url(&links);
158
159        let mut marked = Marked::Unmarked;
160        let mut unread = Read::Unread;
161
162        // if article exists in db and already has same timestamp as 'updated' then skip it
163        if let Some(local_article) = local_article {
164            if let Some(updated) = entry.updated {
165                if local_article.date >= updated {
166                    return None;
167                }
168            } else {
169                // we have no updated timestamp for this article and it already exists in the db
170                // so we skip this one
171                return None;
172            }
173
174            marked = local_article.marked;
175            unread = local_article.unread;
176        }
177
178        if let Some(text) = title.as_mut() {
179            Self::sanitize_text(text, website);
180        }
181        if let Some(text) = entry_summary.as_mut() {
182            Self::sanitize_text(text, website);
183        }
184
185        // FIXME: handle content-type
186        let html = match content {
187            Some(content) => {
188                let xml_base = content.src.as_ref().map(|l| l.href.clone()).and_then(|xb| Url::parse(&xb).ok());
189                let xml_base = xml_base.map(Some).unwrap_or(website.cloned());
190                content.body.map(|body| Self::sanitize_string(&body, xml_base.as_ref()))
191            }
192            None => match &entry_summary {
193                Some(original_summary) => Some(original_summary.content.clone()),
194                None => media.first().and_then(|m| m.description.clone()).map(|mut text| {
195                    Self::sanitize_text(&mut text, website);
196                    text.content
197                }),
198            },
199        };
200        let html = html.map(|s| if !ammonia::is_html(&s) { Text2Html::process(&s) } else { s });
201
202        let plain_text = match &entry_summary {
203            Some(summary) => Some(util::html2text::html2text(&summary.content)),
204            None => html.as_deref().map(util::html2text::html2text),
205        };
206        let summary = plain_text.as_deref().map(util::html2text::text2summary);
207
208        let mut thumbnail_url = media
209            .iter()
210            .filter_map(|media| {
211                let MediaObject { content, thumbnails, .. } = media;
212                let attached_images: Vec<Url> = content
213                    .iter()
214                    .filter_map(|content| {
215                        let MediaContent { url, content_type, .. } = content;
216                        content_type.as_ref().and_then(|mime| {
217                            if mime.ty() == "image" {
218                                url.as_ref().map(|url| Url::new(url.clone()))
219                            } else {
220                                None
221                            }
222                        })
223                    })
224                    .collect();
225                let thumbnails = thumbnails.iter().map(|t| t.image.uri.as_str()).collect::<Vec<&str>>();
226                if let Some(&thumbnail) = thumbnails.first() {
227                    Some(thumbnail.to_owned())
228                } else {
229                    attached_images.first().map(|first_image| first_image.to_string())
230                }
231            })
232            .collect::<Vec<String>>()
233            .first()
234            .cloned();
235        if thumbnail_url.is_none() {
236            thumbnail_url = html.as_deref().and_then(crate::util::thumbnail::extract_thumbnail);
237        }
238
239        let mut enclosures = media
240            .into_iter()
241            .flat_map(|media| {
242                let MediaObject {
243                    content,
244                    mut title,
245                    duration: object_duration,
246                    mut description,
247                    thumbnails,
248                    ..
249                } = media;
250                if let Some(text) = description.as_mut() {
251                    Self::sanitize_text(text, website);
252                }
253                if let Some(text) = title.as_mut() {
254                    Self::sanitize_text(text, website);
255                }
256
257                let description = description.map(|t| t.content);
258                let title = title.map(|t| t.content);
259                let thumbnail_url = thumbnails.first().map(|thumb| thumb.image.uri.clone());
260
261                let mut default_url = None;
262
263                content
264                    .into_iter()
265                    .filter_map(|content| {
266                        let MediaContent {
267                            url,
268                            content_type,
269                            duration: content_duration,
270                            width,
271                            height,
272                            size,
273                            ..
274                        } = content;
275                        url.map(|url| {
276                            let url = Url::new(url.clone());
277                            let duration = if content_duration.is_some() { content_duration } else { object_duration };
278                            let duration = duration.as_ref().map(Duration::as_secs).map(|secs| secs as i32);
279
280                            let enclosure = Enclosure {
281                                article_id: article_id.clone(),
282                                url: url.clone(),
283                                mime_type: content_type.as_ref().map(|mime| mime.to_string()),
284                                title: title.clone(),
285                                position: None,
286                                summary: description.clone(),
287                                thumbnail_url: thumbnail_url.clone(),
288                                filesize: size.map(|s| s as i32),
289                                width: width.map(|w| w as i32),
290                                height: height.map(|w| w as i32),
291                                duration,
292                                framerate: None,
293                                alternative: default_url.clone(),
294                                is_default: false,
295                            };
296
297                            if default_url.is_none() {
298                                default_url = Some(url);
299                            }
300
301                            enclosure
302                        })
303                    })
304                    .collect::<Vec<Enclosure>>()
305            })
306            .collect::<Vec<Enclosure>>();
307
308        if let Some(largest_video) = enclosures.iter_mut().max_by_key(|enclosure| enclosure.height) {
309            largest_video.is_default = true;
310        }
311
312        let date = match published {
313            Some(published) => published,
314            None => match updated {
315                Some(updated) => updated,
316                None => Utc::now(),
317            },
318        };
319
320        let updated = if let Some(updated) = updated
321            && updated > date
322        {
323            Some(updated)
324        } else {
325            None
326        };
327
328        let article = FatArticle {
329            article_id,
330            feed_id: feed_id.clone(),
331            title: title.map(|t| t.content),
332            url: article_url,
333            author: authors
334                .iter()
335                .filter_map(|person| if person.name.is_empty() { None } else { Some(person.name.clone()) })
336                .next(),
337            date,
338            synced: Utc::now(),
339            updated,
340            direction: None,
341            marked,
342            unread,
343            html,
344            scraped_content: None,
345            summary,
346            plain_text,
347            thumbnail_url,
348        };
349
350        Some((article, enclosures))
351    }
352
    /// Download the feed at `url` and parse it with `feed_rs`.
    ///
    /// Returns `Ok(None)` when nothing needs to be done: `ignore_until` is
    /// still in the future, the server answered `304 Not Modified`, or the
    /// response ETag equals the cached `etag`. On failure returns a
    /// `DownloadFailure` carrying an optional `Retry-After` hint (seconds)
    /// for `429 Too Many Requests` responses.
    async fn download_and_parse_feed(
        url: &Url,
        etag: Option<&str>,
        ignore_until: Option<DateTime<Utc>>,
        client: &Client,
        header: Option<HeaderMap<HeaderValue>>,
    ) -> Result<Option<DownloadSuccess>, DownloadFailure> {
        // still inside a server-requested back-off / cache window: skip
        if ignore_until.map(|dt| dt > Utc::now()).unwrap_or(false) {
            tracing::debug!(%url, ?ignore_until, "ignore download");
            return Ok(None);
        }

        // repeated ACCEPT headers advertise every feed format we can parse
        // (presumably reqwest appends rather than replaces — confirm)
        let mut request_builder = client
            .get(url.as_str())
            .header(reqwest::header::ACCEPT, HeaderValue::from_static("application/json"))
            .header(reqwest::header::ACCEPT, HeaderValue::from_static("application/feed+json"))
            .header(reqwest::header::ACCEPT, HeaderValue::from_static("application/xml"))
            .header(reqwest::header::ACCEPT, HeaderValue::from_static("application/rss+xml"))
            .header(reqwest::header::ACCEPT, HeaderValue::from_static("application/atom+xml"))
            .header(reqwest::header::ACCEPT, HeaderValue::from_static("text/xml"))
            // workaround for CDNs like Cloudflare
            .header(reqwest::header::ACCEPT, HeaderValue::from_static("text/html"));

        // caller-supplied per-feed headers are applied on top of the defaults
        if let Some(header) = header {
            request_builder = request_builder.headers(header)
        }

        let feed_response = match request_builder.send().await {
            Ok(response) => response,
            Err(error) => {
                tracing::error!(%url, %error, "Downloading feed failed");
                return Err(DownloadFailure {
                    msg: format!("Downloading feed failed: {url} - {error}"),
                    error: FeedApiError::Network(error),
                    retry_after: None,
                });
            }
        };

        let response_status = feed_response.status();

        if !response_status.is_success() {
            // 304: our cached state is still current — not an error
            if response_status == reqwest::StatusCode::NOT_MODIFIED {
                return Ok(None);
            }

            tracing::error!(%url, %response_status, "Downloading feed failed");

            // honor Retry-After on 429 (only the plain-seconds form is parsed)
            let retry_after = if response_status == reqwest::StatusCode::TOO_MANY_REQUESTS {
                feed_response
                    .headers()
                    .get(reqwest::header::RETRY_AFTER)
                    .and_then(|hv| hv.to_str().ok())
                    .and_then(|str| str.parse::<u64>().ok())
            } else {
                None
            };

            let error = if let Err(error) = feed_response.error_for_status() {
                FeedApiError::Network(error)
            } else {
                FeedApiError::Unknown
            };

            return Err(DownloadFailure {
                msg: format!("Downloading feed failed: {response_status} -  {url}"),
                error,
                retry_after,
            });
        }

        let new_etag = feed_response
            .headers()
            .get(reqwest::header::ETAG)
            .and_then(|hv| hv.to_str().ok())
            .map(|etag| etag.to_string());

        let cache_control = feed_response
            .headers()
            .get(reqwest::header::CACHE_CONTROL)
            .and_then(|hv| hv.to_str().ok())
            .and_then(cache_control::CacheControl::from_header);

        // unchanged etag: content did not change since the last download
        if let (Some(old_etag), Some(new_etag)) = (etag, new_etag.as_deref())
            && old_etag == new_etag
        {
            return Ok(None);
        }

        // Content-Location (when present) is a better base for resolving
        // relative urls than the request url
        let content_location = feed_response
            .headers()
            .get(reqwest::header::CONTENT_LOCATION)
            .and_then(|hv| hv.to_str().ok())
            .and_then(|cl| Url::parse(cl).ok());

        let result_bytes = match feed_response.bytes().await {
            Ok(result_bytes) => result_bytes,
            Err(error) => {
                tracing::error!(%url, %error, "Reading response as string failed");
                return Err(DownloadFailure {
                    msg: format!("Reading response as string failed: {url} - {error}"),
                    error: FeedApiError::Network(error),
                    retry_after: None,
                });
            }
        };

        let feed_base_url = if let Some(content_location) = content_location {
            content_location
        } else {
            url.clone()
        };

        // sanitization is disabled here: convert_entry sanitizes later with
        // our own ammonia configuration
        let parser = ParserBuilder::new().base_uri(Some(feed_base_url)).sanitize_content(false).build();
        let parse_result = parser.parse(result_bytes.as_ref());

        let parsed_feed = if let Ok(parsed_feed) = parse_result {
            parsed_feed
        } else {
            tracing::error!(%url, "Couldn't parse feed content");
            return Err(DownloadFailure {
                msg: format!("Couldn't parse feed content: {url}"),
                error: FeedApiError::ParseFeed(FeedParserError::Feed),
                retry_after: None,
            });
        };

        Ok(Some(DownloadSuccess {
            feed: parsed_feed,
            etag: new_etag,
            cache_control,
        }))
    }
486
487    fn feed_get_host_url(feed: &Feed) -> Option<(FeedID, String)> {
488        let url = feed.feed_url.as_ref()?;
489        let host = url.host_str()?;
490        let host = if let Some(host) = host.strip_prefix("www.") {
491            host
492        } else if let Some(host) = host.strip_prefix("www1.") {
493            host
494        } else if let Some(host) = host.strip_prefix("www2.") {
495            host
496        } else {
497            host
498        };
499        Some((feed.feed_id.clone(), host.to_string()))
500    }
501}
502
503#[async_trait]
504impl FeedApi for LocalRSS {
    /// Capabilities of the local backend: full local control over feeds,
    /// (sub)categories and tags, plus editable feed urls.
    fn features(&self) -> FeedApiResult<PluginCapabilities> {
        Ok(PluginCapabilities::ADD_REMOVE_FEEDS
            | PluginCapabilities::SUPPORT_CATEGORIES
            | PluginCapabilities::SUPPORT_SUBCATEGORIES
            | PluginCapabilities::MODIFY_CATEGORIES
            | PluginCapabilities::SUPPORT_TAGS
            | PluginCapabilities::EDIT_FEED_URLS)
    }

    /// Local feeds need no configuration; always considered configured.
    fn has_user_configured(&self) -> FeedApiResult<bool> {
        Ok(true)
    }

    /// There is no single server to probe, so reachability is unsupported.
    async fn is_reachable(&self, _client: &Client) -> FeedApiResult<bool> {
        Err(FeedApiError::Unsupported)
    }

    /// No login exists for local feeds; always reports logged in.
    async fn is_logged_in(&self, _client: &Client) -> FeedApiResult<bool> {
        Ok(true)
    }

    /// No user account exists for local feeds.
    async fn user_name(&self) -> Option<String> {
        None
    }

    /// Login data is just the static plugin id — no credentials involved.
    async fn get_login_data(&self) -> Option<LoginData> {
        Some(LoginData::None(LocalMetadata::get_id()))
    }

    /// Logging in is a no-op for local feeds.
    async fn login(&mut self, _data: LoginData, _client: &Client) -> FeedApiResult<()> {
        Ok(())
    }

    /// "Logout" only deletes the persisted etag/cache-control cache.
    async fn logout(&mut self, _client: &Client) -> FeedApiResult<()> {
        self.cache.read().await.delete()?;
        Ok(())
    }
542
    /// Download and parse all subscribed feeds concurrently and collect the
    /// resulting articles and enclosures into one `SyncResult`.
    ///
    /// Each feed is handled in its own tokio task, gated by the portal's
    /// download semaphore and staggered with per-feed delays so a single host
    /// is not hammered. Per-feed errors are recorded on the feed itself
    /// (error_count / error_message) instead of failing the whole sync.
    async fn initial_sync(&self, client: &Client, custom_header: FeedHeaderMap) -> FeedApiResult<SyncResult> {
        let feeds = self.portal.get_feeds()?;
        let semaphore = self.portal.get_download_semaphore();

        // shared map so each task can update its feed's metadata/error state
        let feed_hash_map = Arc::new(RwLock::new(
            feeds
                .clone()
                .into_iter()
                .map(|feed| (feed.feed_id.clone(), feed))
                .collect::<HashMap<FeedID, Feed>>(),
        ));

        // Pre-compute a start delay per feed: a small general stagger plus an
        // extra same-host penalty, except for hosts in DELAY_HOST_BLACKLIST.
        let mut delays: HashMap<FeedID, Duration> = HashMap::new();
        let mut host_counts: HashMap<String, u64> = HashMap::new();
        for (i, (feed_id, host)) in feeds.iter().filter_map(Self::feed_get_host_url).enumerate() {
            let host_is_blacklisted = DELAY_HOST_BLACKLIST.contains(host.as_str());
            let count = host_counts.entry(host).or_default();
            *count += 1;

            let mut delay_ms = i as u64 * GENERAL_DELAY_MS;

            if *count > 1 && !host_is_blacklisted {
                let delay = *count - 1;
                delay_ms += delay * SAME_HOST_DELAY_MS;
                tracing::debug!(%feed_id, %delay_ms, "delaying request");
            }

            delays.insert(feed_id, Duration::from_millis(delay_ms));
        }

        let mut task_handles = Vec::new();

        for feed in feeds.into_iter() {
            let client = client.clone();
            let portal = self.portal.clone();
            let hash_map = feed_hash_map.clone();
            let semaphore = Arc::clone(&semaphore);
            let cache = self.cache.clone();
            let header = custom_header.get(&feed.feed_id).cloned();
            let delay = delays.get(&feed.feed_id).copied();

            task_handles.push(tokio::spawn(async move {
                let Some(url) = feed.feed_url.clone() else {
                    tracing::warn!(%feed.feed_id, "No feed url for feed");
                    if let Some(old_feed) = hash_map.write().await.get_mut(&feed.feed_id) {
                        old_feed.error_count += 1;
                        old_feed.error_message = Some(format!("No feed url for feed: '{}'", feed.feed_id));
                    }
                    return Err(FeedApiError::ParseFeed(FeedParserError::NoUrl));
                };

                if let Some(delay) = delay {
                    tokio::time::sleep(delay).await;
                }

                // acquire permit first to make sure only a set number of feeds can download concurrently
                let permit = match semaphore.acquire().await {
                    Ok(permit) => permit,
                    Err(error) => {
                        tracing::error!(%url, %error, "couldn't acquire download permit for feed");
                        return Err(error.into());
                    }
                };

                let old_etag = cache.read().await.get_etag(&feed.feed_id);
                let ignore_until = cache.read().await.get_cache_control(&feed.feed_id);
                let parsed_feed = match Self::download_and_parse_feed(&url, old_etag.as_deref(), ignore_until, &client, header).await {
                    Ok(Some(DownloadSuccess {
                        feed: parsed_feed,
                        etag,
                        cache_control,
                    })) => {
                        // remember etag & cache-control for the next sync
                        cache.write().await.set_etag(&feed.feed_id, etag);
                        cache
                            .write()
                            .await
                            .set_cache_control(&feed.feed_id, cache_control.and_then(|cc| cc.max_age).map(|max_age| max_age.as_secs()));
                        parsed_feed
                    }
                    Ok(None) => {
                        // nothing new (etag match / 304 / still in cache window):
                        // clear any previous error state and return empty results
                        tracing::debug!(%feed.feed_id, "skipped downloading");
                        if let Some(old_feed) = hash_map.write().await.get_mut(&feed.feed_id) {
                            old_feed.error_count = 0;
                            old_feed.error_message = None;
                        }
                        return Ok((Vec::new(), Vec::new()));
                    }
                    Err(DownloadFailure { msg, error, retry_after }) => {
                        // store retry-after as the new back-off window
                        cache.write().await.set_cache_control(&feed.feed_id, retry_after);

                        if let Some(old_feed) = hash_map.write().await.get_mut(&feed.feed_id) {
                            old_feed.error_count += 1;
                            old_feed.error_message = Some(msg);
                        }

                        cache.write().await.set_etag(&feed.feed_id, None);

                        return Err(error);
                    }
                };

                // deliberately drop permit here to allow other feeds to download now
                drop(permit);

                // update icon url & website
                let updated_feed = Feed::from_feed_rs(&parsed_feed, None, &url);
                if let Some(old_feed) = hash_map.write().await.get_mut(&feed.feed_id) {
                    old_feed.icon_url = updated_feed.icon_url;
                    old_feed.website = updated_feed.website;
                    old_feed.error_count = 0;
                    old_feed.error_message = None;
                }

                let conversion_result = parsed_feed
                    .entries
                    .into_iter()
                    .rev()
                    .filter_map(|e| Self::convert_entry(e, &feed.feed_id, feed.website.as_ref(), portal.clone()))
                    .collect::<Vec<_>>();

                let (articles, enclosures): (Vec<FatArticle>, Vec<Vec<Enclosure>>) = conversion_result.into_iter().unzip();
                let enclosures = enclosures.into_iter().flatten().collect();

                Ok((articles, enclosures))
            }));
        }

        let mut articles: Vec<FatArticle> = Vec::new();
        let mut enclosures: Vec<Enclosure> = Vec::new();

        let result_vec = futures::future::join_all(task_handles).await;
        // persist the etag / cache-control updates gathered during the sync
        _ = self.cache.write().await.write();

        // first flatten drops join errors, second drops per-feed failures
        for (mut feed_articles, mut feed_enclosures) in result_vec.into_iter().flatten().flatten() {
            articles.append(&mut feed_articles);
            enclosures.append(&mut feed_enclosures);
        }

        // all tasks are done: unwrap the shared map back into a Vec<Feed>
        let feeds = Arc::into_inner(feed_hash_map)
            .map(|lock| lock.into_inner())
            .map(|map| map.into_values().collect())
            .and_then(util::vec_to_option);

        Ok(SyncResult {
            feeds,
            categories: None,
            feed_mappings: None,
            category_mappings: None,
            tags: None,
            headlines: None,
            articles: util::vec_to_option(articles),
            enclosures: util::vec_to_option(enclosures),
            taggings: None,
        })
    }
698
    /// There is no incremental server state for local feeds, so a regular
    /// sync is identical to the initial sync.
    async fn sync(&self, client: &Client, custom_header: FeedHeaderMap) -> FeedApiResult<SyncResult> {
        self.initial_sync(client, custom_header).await
    }
702
    /// Refresh a single feed: download it (ignoring cached etag and
    /// cache-control, so the download is never skipped for those reasons),
    /// convert its entries and return the updated feed with new articles and
    /// enclosures.
    ///
    /// Returns `FeedApiError::Unknown` when the feed id is not known or the
    /// feed has no url.
    async fn fetch_feed(&self, feed_id: &FeedID, client: &Client, custom_header: HeaderMap<HeaderValue>) -> FeedApiResult<FeedUpdateResult> {
        let feeds = self.portal.get_feeds()?;
        let mut feed = feeds.into_iter().find(|f| &f.feed_id == feed_id).ok_or(FeedApiError::Unknown)?;

        let url = feed.feed_url.clone().ok_or(FeedApiError::Unknown)?;

        // etag / ignore_until are None: force the download
        let parsed_feed = match Self::download_and_parse_feed(&url, None, None, client, Some(custom_header)).await {
            Ok(Some(DownloadSuccess {
                feed: parsed_feed,
                etag,
                cache_control,
            })) => {
                // remember etag & cache-control for the next regular sync
                self.cache.write().await.set_etag(&feed.feed_id, etag);
                self.cache
                    .write()
                    .await
                    .set_cache_control(&feed.feed_id, cache_control.and_then(|cc| cc.max_age).map(|max_age| max_age.as_secs()));
                parsed_feed
            }
            Ok(None) => {
                // server reported nothing new: empty update result
                return Ok(FeedUpdateResult {
                    feed: None,
                    articles: None,
                    enclosures: None,
                    taggings: None,
                });
            }
            Err(DownloadFailure { msg: _, error, retry_after }) => {
                // store retry-after as the new back-off window
                self.cache.write().await.set_cache_control(&feed.feed_id, retry_after);
                return Err(error);
            }
        };

        // persist the updated cache state
        _ = self.cache.write().await.write();

        // update icon url & website
        let updated_feed = Feed::from_feed_rs(&parsed_feed, None, &url);
        feed.icon_url = updated_feed.icon_url;
        feed.website = updated_feed.website;
        feed.error_count = 0;
        feed.error_message = None;

        let conversion_result = parsed_feed
            .entries
            .into_iter()
            .rev()
            .map(|e| Self::convert_entry(e, &feed.feed_id, feed.website.as_ref(), self.portal.clone()))
            .collect::<Vec<_>>();

        // flatten drops entries skipped by convert_entry (already up to date)
        let (articles, enclosures): (Vec<FatArticle>, Vec<Vec<Enclosure>>) = conversion_result.into_iter().flatten().unzip();
        let enclosures = enclosures.into_iter().flatten().collect();

        Ok(FeedUpdateResult {
            feed: Some(feed),
            articles: util::vec_to_option(articles),
            enclosures: util::vec_to_option(enclosures),
            taggings: None,
        })
    }
762
763    async fn set_article_read(&self, _articles: &[ArticleID], _read: models::Read, _client: &Client) -> FeedApiResult<()> {
764        Ok(())
765    }
766
767    async fn set_article_marked(&self, _articles: &[ArticleID], _marked: models::Marked, _client: &Client) -> FeedApiResult<()> {
768        Ok(())
769    }
770
771    async fn set_feed_read(&self, _feeds: &[FeedID], _articles: &[ArticleID], _client: &Client) -> FeedApiResult<()> {
772        Ok(())
773    }
774
775    async fn set_category_read(&self, _categories: &[CategoryID], _articles: &[ArticleID], _client: &Client) -> FeedApiResult<()> {
776        Ok(())
777    }
778
    /// No-op: nothing to propagate remotely for a purely local backend.
    async fn set_tag_read(&self, _tags: &[TagID], _articles: &[ArticleID], _client: &Client) -> FeedApiResult<()> {
        Ok(())
    }
782
    /// No-op: nothing to propagate remotely for a purely local backend.
    async fn set_all_read(&self, _articles: &[ArticleID], _client: &Client) -> FeedApiResult<()> {
        Ok(())
    }
786
787    async fn add_feed(
788        &self,
789        url: &Url,
790        title: Option<String>,
791        category_id: Option<CategoryID>,
792        client: &Client,
793    ) -> FeedApiResult<(Feed, Option<Category>)> {
794        let feed_response = client
795            .get(url.as_str())
796            .header(reqwest::header::ACCEPT, HeaderValue::from_static("application/json"))
797            .header(reqwest::header::ACCEPT, HeaderValue::from_static("application/feed+json"))
798            .header(reqwest::header::ACCEPT, HeaderValue::from_static("application/xml"))
799            .header(reqwest::header::ACCEPT, HeaderValue::from_static("application/rss+xml"))
800            .header(reqwest::header::ACCEPT, HeaderValue::from_static("application/atom+xml"))
801            .header(reqwest::header::ACCEPT, HeaderValue::from_static("text/xml"))
802            // workaround for CDNs like Cloudfare
803            .header(reqwest::header::ACCEPT, HeaderValue::from_static("text/html"))
804            .send()
805            .await?
806            .error_for_status()?;
807
808        let result_bytes = feed_response
809            .bytes()
810            .await
811            .inspect_err(|error| tracing::error!(%url, %error, "Reading response as bytes failed"))?;
812
813        let parser = ParserBuilder::new().base_uri(Some(url)).build();
814        let feed = parser.parse(result_bytes.as_ref())?;
815        let feed = Feed::from_feed_rs(&feed, title, url);
816
817        let categories = self.portal.get_categories()?;
818        let category = categories.iter().find(|c| Some(&c.category_id) == category_id.as_ref()).cloned();
819
820        Ok((feed, category))
821    }
822
823    async fn remove_feed(&self, id: &FeedID, _client: &Client) -> FeedApiResult<()> {
824        let mut cache = self.cache.write().await;
825        cache.set_etag(id, None);
826        cache.set_cache_control(id, None);
827        _ = cache.write();
828        Ok(())
829    }
830
    /// No-op: category membership is tracked locally by the caller.
    async fn move_feed(&self, _feed_id: &FeedID, _from: &CategoryID, _to: &CategoryID, _client: &Client) -> FeedApiResult<()> {
        Ok(())
    }
834
    /// No remote rename needed; the id stays stable, so it is echoed back.
    async fn rename_feed(&self, feed_id: &FeedID, _new_title: &str, _client: &Client) -> FeedApiResult<FeedID> {
        Ok(feed_id.clone())
    }
838
    /// No-op: the feed URL is stored locally by the caller.
    async fn edit_feed_url(&self, _feed_id: &FeedID, _new_url: &str, _client: &Client) -> FeedApiResult<()> {
        Ok(())
    }
842
843    async fn add_category<'a>(&self, _title: &str, _parent: Option<&'a CategoryID>, _client: &Client) -> FeedApiResult<CategoryID> {
844        let uuid = uuid::Uuid::new_v4();
845        Ok(CategoryID::from_owned(uuid.to_string()))
846    }
847
848    async fn remove_category(&self, id: &CategoryID, _remove_children: bool, _client: &Client) -> FeedApiResult<()> {
849        let mappings = self.portal.get_feed_mappings()?;
850        let mut cache = self.cache.write().await;
851        for mapping in mappings {
852            if &mapping.category_id == id {
853                cache.set_etag(&mapping.feed_id, None);
854                cache.set_cache_control(&mapping.feed_id, None);
855            }
856        }
857
858        _ = cache.write();
859
860        Ok(())
861    }
862
    /// No remote rename needed; the id stays stable, so it is echoed back.
    async fn rename_category(&self, id: &CategoryID, _new_title: &str, _client: &Client) -> FeedApiResult<CategoryID> {
        Ok(id.clone())
    }
866
    /// No-op: the category hierarchy is tracked locally by the caller.
    async fn move_category(&self, _id: &CategoryID, _parent: &CategoryID, _client: &Client) -> FeedApiResult<()> {
        Ok(())
    }
870
    /// No-op: OPML import is handled outside this backend implementation.
    async fn import_opml(&self, _opml: &str, _client: &Client) -> FeedApiResult<()> {
        Ok(())
    }
874
    /// Tags are purely local; the title itself is used as the tag id.
    async fn add_tag(&self, title: &str, _client: &Client) -> FeedApiResult<TagID> {
        Ok(TagID::new(title))
    }
878
    /// No-op: tags are stored locally by the caller.
    async fn remove_tag(&self, _id: &TagID, _client: &Client) -> FeedApiResult<()> {
        Ok(())
    }
882
    /// No remote rename needed; the id stays stable, so it is echoed back.
    async fn rename_tag(&self, id: &TagID, _new_title: &str, _client: &Client) -> FeedApiResult<TagID> {
        Ok(id.clone())
    }
886
    /// No-op: taggings are stored locally by the caller.
    async fn tag_article(&self, _article_id: &ArticleID, _tag_id: &TagID, _client: &Client) -> FeedApiResult<()> {
        Ok(())
    }
890
    /// No-op: taggings are stored locally by the caller.
    async fn untag_article(&self, _article_id: &ArticleID, _tag_id: &TagID, _client: &Client) -> FeedApiResult<()> {
        Ok(())
    }
894
    /// Resolves the favicon source URL for `feed_id`.
    ///
    /// When the feed has a feed URL, it is re-downloaded (honoring the cached
    /// etag / cache-control state) to pick up a possibly updated icon URL; on
    /// download failure due to caching the stored feed data is used as-is.
    /// When only an icon URL is stored, that is returned directly. With
    /// neither available, `FeedApiError::Unknown` is returned.
    ///
    /// The returned [`FavIcon`] never carries image data — only the source
    /// URL plus an expiry `EXPIRES_AFTER_DAYS` in the future; the actual
    /// download is left to the caller.
    async fn get_favicon(&self, feed_id: &FeedID, client: &Client, custom_header: HeaderMap<HeaderValue>) -> FeedApiResult<FavIcon> {
        // Look the feed up; an unknown id cannot yield a favicon.
        let Some(feed) = self.portal.get_feeds()?.into_iter().find(|f| &f.feed_id == feed_id) else {
            return Err(FeedApiError::Unknown);
        };

        if let Some(feed_url) = feed.feed_url.as_ref() {
            // Two short-lived read locks; the cache is re-locked for writing
            // after the download below.
            let old_etag = self.cache.read().await.get_etag(feed_id);
            let ignore_until = self.cache.read().await.get_cache_control(feed_id);
            // On a hard failure propagate the error; when the server reports
            // "not modified" (download succeeds with no new feed) fall back
            // to the stored feed via `unwrap_or`.
            let feed = Self::download_and_parse_feed(feed_url, old_etag.as_deref(), ignore_until, client, Some(custom_header))
                .await
                .map_err(
                    |DownloadFailure {
                         msg: _,
                         error,
                         retry_after: _,
                     }| error,
                )?
                .map(
                    |DownloadSuccess {
                         feed: feed_rs,
                         etag: _,
                         cache_control: _,
                     }| Feed::from_feed_rs(&feed_rs, None, feed_url),
                )
                .unwrap_or(feed);
            // Best effort: persisting the cache may fail silently.
            _ = self.cache.write().await.write();

            Ok(FavIcon {
                feed_id: feed_id.clone(),
                expires: Utc::now() + TimeDelta::try_days(EXPIRES_AFTER_DAYS).unwrap(),
                format: None,
                etag: None,
                source_url: feed.icon_url,
                data: None,
            })
        } else if let Some(icon_url) = feed.icon_url {
            // No feed URL to refresh from — use the stored icon URL directly.
            Ok(FavIcon {
                feed_id: feed_id.clone(),
                expires: Utc::now() + TimeDelta::try_days(EXPIRES_AFTER_DAYS).unwrap(),
                format: None,
                etag: None,
                source_url: Some(icon_url),
                data: None,
            })
        } else {
            // Neither a feed URL nor an icon URL — nothing we can do.
            Err(FeedApiError::Unknown)
        }
    }
943}
944
#[cfg(test)]
mod tests {
    use crate::models::Url;
    use test_log::test;

    // HTML entities in plain-text titles must be decoded by sanitization.
    #[test]
    fn clean_html_entity() {
        let text = "Tired of Google&#x27;s Tracking? Motorola&#x27;s GrapheneOS-Powered Phones Are Coming";
        // The entity regex should classify this string as HTML-ish.
        let is_html = super::HTML_ENTITY_REGEX.is_match(text);
        assert!(is_html);
        let res = super::LocalRSS::sanitize_string(text, None);
        assert_eq!(res, "Tired of Google's Tracking? Motorola's GrapheneOS-Powered Phones Are Coming");
    }

    // Regression test (TWIG #233): <video>/<source> elements must survive
    // sanitization and their relative URLs must be resolved against the
    // feed's base URL.
    #[test]
    fn clean_twig233() {
        let html = r#"
            <blockquote>
                <p>Exhibit gets animated!</p>
                <p>The latest release adds animation playback and armature visualization, making it easier to preview rigged 3D models directly in GNOME. All thanks to F3D’s latest improvements.</p>
                <p>Get it on <a href="https://flathub.org/apps/io.github.nokse22.Exhibit">Flathub</a></p>
                <p>Checkout <a href="https://f3d.app">F3D</a></p>
                <p><img  width="924" height="611" loading="lazy" decoding="async" src="/_astro/exhibit.yGW-D87q_oC986.webp" ></p>
                <p></p>
                <p><video controls><source src="/posts/2026/01/twig-233/exhibit.mp4" type="video/mp4"></video></p>
            </blockquote>
        "#;

        let url = Url::parse("https://thisweek.gnome.org/").unwrap();
        let res = super::LocalRSS::sanitize_string(html, Some(&url));
        assert!(res.contains("<video><source src="));
    }

    // Empty table cells must be preserved, not stripped, so table layout
    // stays intact after sanitization.
    #[test]
    fn empty_table_cell() {
        let html = r#"
            <table>
             <tr>
               <th>Company</th>
               <th></th>
               <th>Country</th>
             </tr>
             <tr>
               <td>Alfreds Futterkiste</td>
               <td>Maria Anders</td>
               <td>Germany</td>
             </tr>
             <tr>
               <td>Centro comercial Moctezuma</td>
               <td>Francisco Chang</td>
               <td>Mexico</td>
             </tr>
           </table> 
        "#;

        let res = super::LocalRSS::sanitize_string(html, None);
        assert!(res.contains("<th></th>"));
    }
}