pub mod css_extractor;
pub mod github;
pub mod google;
pub mod hackernews;
pub mod linkedin;
pub mod reddit;
pub mod rules;
pub mod wasm_manifest;
#[cfg(feature = "wasm-providers")]
pub mod wasm_provider;
use anyhow::Result;
use async_trait::async_trait;
use crate::http_client::AcceleratedClient;
/// Engagement counters extracted from a social post.
///
/// Every field is optional: a counter is `None` when the platform does not
/// expose that metric or the extractor could not find it.
#[derive(Debug, Clone, Default)]
pub struct Engagement {
    pub likes: Option<u64>,
    pub reposts: Option<u64>,
    pub replies: Option<u64>,
    pub views: Option<u64>,
}
/// Structured metadata extracted alongside a page's content.
#[derive(Debug, Clone)]
pub struct SiteMetadata {
    /// Author or byline, when the page exposes one.
    pub author: Option<String>,
    pub title: Option<String>,
    /// Publication date, stored as-is. NOTE(review): format presumably
    /// varies by provider — confirm before parsing.
    pub published: Option<String>,
    /// Identifier of the source platform. NOTE(review): looks like this is
    /// the provider name (e.g. "twitter", "github") — confirm.
    pub platform: String,
    /// Canonical URL of the extracted page.
    pub canonical_url: String,
    /// URLs of media (images/videos) referenced by the page.
    pub media_urls: Vec<String>,
    /// Engagement counters, when the platform exposes any.
    pub engagement: Option<Engagement>,
}
/// Renders a count in compact form: `1500` -> `"1.5K"`, `2_500_000` -> `"2.5M"`.
///
/// Values below 1 000 are printed verbatim; larger values keep one decimal
/// place with a `K` or `M` suffix. Note that values just under a boundary can
/// round up past it (e.g. `999_999` -> `"1000.0K"`), matching the original
/// behavior.
#[must_use]
#[allow(clippy::cast_precision_loss)]
pub fn format_number_compact(n: u64) -> String {
    const THOUSAND: f64 = 1_000.0;
    const MILLION: f64 = 1_000_000.0;
    if n < 1_000 {
        return n.to_string();
    }
    if n < 1_000_000 {
        return format!("{:.1}K", n as f64 / THOUSAND);
    }
    format!("{:.1}M", n as f64 / MILLION)
}
/// Result of a successful site extraction: the page rendered as Markdown plus
/// its structured metadata.
#[derive(Debug, Clone)]
pub struct SiteContent {
    pub markdown: String,
    pub metadata: SiteMetadata,
}
/// A site-specific content extractor (e.g. Hacker News, GitHub).
///
/// Implementations are registered with `SiteRouter`, which dispatches a URL to
/// the first provider whose `matches` returns `true`.
#[async_trait]
pub trait SiteProvider: Send + Sync {
    /// Stable identifier for this provider; used for rule overrides and logging.
    fn name(&self) -> &'static str;
    /// Returns `true` if this provider knows how to handle `url`.
    fn matches(&self, url: &str) -> bool;
    /// Extracts content and metadata from `url`.
    ///
    /// `cookies` is forwarded for authenticated fetches — NOTE(review): exact
    /// format depends on `AcceleratedClient`; confirm.
    /// `prefetched_html` is a response body the caller already fetched,
    /// presumably so the provider can skip its own fetch — confirm.
    async fn extract(
        &self,
        url: &str,
        client: &AcceleratedClient,
        cookies: Option<&str>,
        prefetched_html: Option<&[u8]>,
    ) -> Result<SiteContent>;
}
/// Ordered registry of site providers.
///
/// Registration order is priority order: for a given URL, the first provider
/// whose `matches` returns `true` is the only one consulted.
pub struct SiteRouter {
    providers: Vec<Box<dyn SiteProvider>>,
}
impl SiteRouter {
    /// Builds the router with every built-in provider.
    ///
    /// Registration order sets priority: rule-based providers first, then the
    /// hardcoded providers (skipping any that a rule overrides by name), then
    /// CSS-extractor plugins and, when the feature is enabled, WASM providers.
    #[must_use]
    pub fn new() -> Self {
        let overridden = rules::rule_overridden_names();
        let mut providers: Vec<Box<dyn SiteProvider>> = rules::load_site_rules();
        let mut builtins: Vec<Box<dyn SiteProvider>> = vec![
            Box::new(hackernews::HackerNewsProvider),
            Box::new(github::GitHubProvider),
            Box::new(google::GoogleWorkspaceProvider),
            Box::new(linkedin::LinkedInProvider),
        ];
        // A site rule with the same name takes precedence over the hardcoded
        // implementation, so drop the shadowed builtins.
        builtins.retain(|p| !overridden.contains(p.name()));
        providers.append(&mut builtins);
        append_css_providers(&mut providers);
        #[cfg(feature = "wasm-providers")]
        append_wasm_providers(&mut providers);
        Self { providers }
    }

    /// Builds the default router, then appends `extra` providers at the
    /// lowest priority.
    #[must_use]
    pub fn with_extra_providers(extra: Vec<Box<dyn SiteProvider>>) -> Self {
        let mut router = Self::new();
        router.providers.extend(extra);
        router
    }

    /// Number of registered providers.
    #[must_use]
    pub fn provider_count(&self) -> usize {
        self.providers.len()
    }

    /// Shorthand for [`Self::try_extract_with_html`] with no prefetched HTML.
    pub async fn try_extract(
        &self,
        url: &str,
        client: &AcceleratedClient,
        cookies: Option<&str>,
    ) -> Option<SiteContent> {
        self.try_extract_with_html(url, client, cookies, None).await
    }

    /// Runs the first provider matching `url`; returns `None` when nothing
    /// matches or the matching provider fails.
    ///
    /// Only the first match is ever tried: a failure does not fall through to
    /// lower-priority providers, letting the caller use its generic
    /// extraction path instead.
    pub async fn try_extract_with_html(
        &self,
        url: &str,
        client: &AcceleratedClient,
        cookies: Option<&str>,
        prefetched_html: Option<&[u8]>,
    ) -> Option<SiteContent> {
        let provider = self.providers.iter().find(|p| p.matches(url))?;
        tracing::debug!("Matched site provider: {}", provider.name());
        match provider.extract(url, client, cookies, prefetched_html).await {
            Ok(content) => Some(content),
            Err(e) => {
                tracing::warn!(
                    "Site provider {} failed for {}: {}",
                    provider.name(),
                    url,
                    e
                );
                None
            }
        }
    }
}
impl Default for SiteRouter {
    /// Equivalent to `SiteRouter::new()`: the full built-in provider set.
    fn default() -> Self {
        Self::new()
    }
}
/// Loads CSS-extractor plugins from the plugin config and appends each one
/// that compiles successfully to `providers`.
///
/// Plugin loading is best-effort throughout: a broken `plugins.toml` or a
/// single bad extractor logs a warning instead of failing startup.
fn append_css_providers(providers: &mut Vec<Box<dyn SiteProvider>>) {
    use crate::plugin::config::load_all_plugins;
    use css_extractor::{CssExtractorConfig, CssExtractorProvider};
    let loaded = match load_all_plugins() {
        Err(e) => {
            tracing::warn!("Failed to load plugins.toml: {e}");
            return;
        }
        Ok(plugins) => plugins,
    };
    for plugin in loaded.css {
        // `name` is cloned into the config so we can still reference it in
        // the log messages below.
        let cfg = CssExtractorConfig {
            name: plugin.name.clone(),
            url_pattern: build_pattern_regex(&plugin.patterns),
            content_selector: plugin.content.selector,
            title_selector: plugin.metadata.title,
            author_selector: plugin.metadata.author,
            date_selector: plugin.metadata.published,
            remove_selectors: plugin.content.remove,
        };
        match CssExtractorProvider::new(cfg) {
            Err(e) => {
                tracing::warn!("CSS extractor '{}' failed to load: {e}", plugin.name);
            }
            Ok(provider) => {
                tracing::debug!("Loaded CSS extractor plugin: {}", plugin.name);
                providers.push(Box::new(provider));
            }
        }
    }
}
/// Loads installed WASM providers from the providers directory and appends
/// each one that loads successfully to `providers`.
///
/// Like CSS plugins, this is best-effort: a provider that fails to load is
/// logged and skipped rather than aborting startup.
#[cfg(feature = "wasm-providers")]
fn append_wasm_providers(providers: &mut Vec<Box<dyn SiteProvider>>) {
    use wasm_manifest::{load_installed_providers, wasm_providers_dir};
    use wasm_provider::load_provider_from_file;
    let dir = wasm_providers_dir();
    for installed in load_installed_providers(&dir) {
        let pattern = build_pattern_regex(&installed.manifest.url_patterns);
        match load_provider_from_file(&installed.manifest.name, &installed.wasm_path, &pattern) {
            Err(e) => {
                tracing::warn!("WASM provider '{}' failed to load: {e}", installed.manifest.name);
            }
            Ok(provider) => {
                tracing::debug!("Loaded WASM provider: {}", installed.manifest.name);
                providers.push(provider);
            }
        }
    }
}
/// Combines plugin URL patterns into one regex source string.
///
/// * no patterns — `\A\z`, which only matches the empty string, so it never
///   matches a real URL;
/// * one pattern — returned verbatim;
/// * several patterns — each wrapped in a non-capturing group `(?:…)` and
///   joined with `|` so alternations cannot bleed into each other.
fn build_pattern_regex(patterns: &[String]) -> String {
    match patterns {
        [] => r"\A\z".to_string(),
        [single] => single.clone(),
        many => {
            let grouped: Vec<String> = many.iter().map(|p| format!("(?:{p})")).collect();
            grouped.join("|")
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every built-in provider (rule-based and hardcoded) must be registered.
    #[test]
    fn router_registers_all_builtin_providers() {
        let router = SiteRouter::new();
        assert!(router.providers.len() >= 13);
        let names: Vec<&str> = router.providers.iter().map(|p| p.name()).collect();
        for expected in &[
            "twitter",
            "reddit",
            "hackernews",
            "hackernews-item",
            "github",
            "github-issues",
            "google-workspace",
            "instagram",
            "youtube",
            "wikipedia",
            "stackoverflow",
            "mastodon",
            "linkedin",
        ] {
            assert!(names.contains(expected), "missing provider '{expected}'");
        }
    }

    /// Rule-based providers must precede hardcoded ones so rules can shadow
    /// them.
    #[test]
    fn router_rule_providers_come_before_hardcoded() {
        let router = SiteRouter::new();
        // BUG FIX: the previous version compared two `Option<usize>` values
        // directly. `None` sorts before any `Some`, so the assertion would
        // pass vacuously if the "twitter" provider were missing entirely.
        // Unwrap both positions so a missing provider fails loudly.
        let twitter_pos = router
            .providers
            .iter()
            .position(|p| p.name() == "twitter")
            .expect("twitter provider should be registered");
        let hn_pos = router
            .providers
            .iter()
            .position(|p| p.name() == "hackernews")
            .expect("hackernews provider should be registered");
        assert!(
            twitter_pos < hn_pos,
            "rule-based twitter should precede hardcoded hackernews"
        );
    }

    /// Twitter/X status URLs must be routed to the "twitter" provider.
    #[test]
    fn router_matches_twitter_urls() {
        let router = SiteRouter::new();
        let twitter = router
            .providers
            .iter()
            .find(|p| p.matches("https://x.com/user/status/123"))
            .expect("some provider should match twitter URLs");
        assert_eq!(twitter.name(), "twitter");
        assert!(twitter.matches("https://twitter.com/user/status/456"));
    }

    /// A generic URL must fall through every provider (so the caller's
    /// generic extraction path is used).
    #[test]
    fn router_does_not_match_non_provider_urls() {
        let router = SiteRouter::new();
        let generic_url = "https://example.com/page";
        for provider in &router.providers {
            assert!(
                !provider.matches(generic_url),
                "provider '{}' should not match generic URL",
                provider.name()
            );
        }
    }

    /// `with_extra_providers` appends on top of the built-in set.
    #[test]
    fn router_with_extra_provider_increases_count() {
        use css_extractor::{CssExtractorConfig, CssExtractorProvider};
        let base_count = SiteRouter::new().provider_count();
        let config = CssExtractorConfig {
            name: "extra".to_string(),
            url_pattern: r"extra\.example\.com".to_string(),
            content_selector: "main".to_string(),
            title_selector: None,
            author_selector: None,
            date_selector: None,
            remove_selectors: vec![],
        };
        let provider = CssExtractorProvider::new(config).unwrap();
        let router = SiteRouter::with_extra_providers(vec![Box::new(provider)]);
        assert_eq!(router.provider_count(), base_count + 1);
    }

    /// An extra CSS provider must match its own URL pattern, and no built-in
    /// provider should claim that URL first.
    #[test]
    fn extra_css_provider_matches_its_url() {
        use css_extractor::{CssExtractorConfig, CssExtractorProvider};
        let config = CssExtractorConfig {
            name: "my-extra".to_string(),
            url_pattern: r"myextra\.com".to_string(),
            content_selector: "article".to_string(),
            title_selector: None,
            author_selector: None,
            date_selector: None,
            remove_selectors: vec![],
        };
        let provider = CssExtractorProvider::new(config).unwrap();
        let router = SiteRouter::with_extra_providers(vec![Box::new(provider)]);
        let base_count = SiteRouter::new().provider_count();
        for p in router.providers.iter().take(base_count) {
            assert!(!p.matches("https://myextra.com/article/1"));
        }
        let last = router.providers.last().unwrap();
        assert!(last.matches("https://myextra.com/article/1"));
    }

    /// With no patterns, the combined regex must never match a real URL.
    #[test]
    fn build_pattern_regex_empty_never_matches() {
        let pattern = build_pattern_regex(&[]);
        let re = regex::Regex::new(&pattern).unwrap();
        assert!(!re.is_match("anything"));
    }

    /// A single pattern is passed through unmodified.
    #[test]
    fn build_pattern_regex_single_pattern_unchanged() {
        let pattern = build_pattern_regex(&[r"foo\.com".to_string()]);
        assert_eq!(pattern, r"foo\.com");
    }

    /// Multiple patterns are OR-ed together with non-capturing groups.
    #[test]
    fn build_pattern_regex_multiple_patterns_alternate() {
        let pattern = build_pattern_regex(&[r"foo\.com".to_string(), r"bar\.com".to_string()]);
        let re = regex::Regex::new(&pattern).unwrap();
        assert!(re.is_match("https://foo.com/page"));
        assert!(re.is_match("https://bar.com/page"));
        assert!(!re.is_match("https://baz.com/page"));
    }
}