use std::collections::HashMap;
use std::process::Command;
use std::time::Duration;
use once_cell::sync::Lazy;
use ureq::ResponseExt;
use crate::config::Config;
use crate::error::{KtoError, Result};
use crate::watch::Engine;
/// Global timeout, in seconds, configured on the shared `HTTP_AGENT`.
const DEFAULT_TIMEOUT_SECS: u64 = 30;
/// Timeout, in milliseconds, handed to the Playwright render script as its
/// third command-line argument.
const DEFAULT_PLAYWRIGHT_TIMEOUT_MS: u64 = 45000;
/// Shared `ureq` agent, built lazily on first use and configured with a
/// global timeout of `DEFAULT_TIMEOUT_SECS` seconds.
static HTTP_AGENT: Lazy<ureq::Agent> = Lazy::new(|| {
ureq::Agent::config_builder()
.timeout_global(Some(Duration::from_secs(DEFAULT_TIMEOUT_SECS)))
.build()
.into()
});
/// Lazily compiled pattern matching anything that looks like a markup tag:
/// a `<`, one or more non-`>` characters, then `>`.
static HTML_TAG_RE: Lazy<regex::Regex> = Lazy::new(|| {
// The pattern is a fixed literal, so a compile failure here is a programming error.
regex::Regex::new(r"<[^>]+>").expect("Invalid HTML tag regex")
});
/// Lazily compiled pattern matching a run of one or more whitespace
/// characters; used to collapse such runs into a single space.
static WHITESPACE_RE: Lazy<regex::Regex> = Lazy::new(|| {
// The pattern is a fixed literal, so a compile failure here is a programming error.
regex::Regex::new(r"\s+").expect("Invalid whitespace regex")
});
/// Content produced by one of the fetch engines in this module.
#[derive(Debug, Clone)]
pub struct PageContent {
/// Where the content came from: the response URI for HTTP and RSS fetches,
/// the URL reported by the render script (or the requested URL) for
/// Playwright, and `shell://<command>` for shell fetches.
pub url: String,
/// Title, when the engine provides one; the plain HTTP fetcher always
/// leaves this as `None`.
pub title: Option<String>,
/// Raw payload: the response body for HTTP, the feed XML for RSS, the
/// command's stdout for shell, or the `html` field of the render output.
pub html: String,
/// Pre-extracted text, when the engine provides it (formatted feed items
/// for RSS, stdout for shell); `None` for plain HTTP fetches.
pub text: Option<String>,
}
/// Outcome of `probe_url`: a recommended engine plus the signals that were
/// gathered while inspecting the URL.
#[derive(Debug, Clone)]
pub struct ProbeResult {
/// Engine recommended for watching the URL.
pub suggested_engine: Engine,
/// The probed URL itself when it is classified as a feed; otherwise the
/// first RSS/Atom `<link rel="alternate">` found in the page, if any.
pub rss_url: Option<String>,
/// Whether a parseable `application/ld+json` script was found.
pub has_jsonld: bool,
/// Comma-separated `@type` values collected from that JSON-LD, if any.
pub jsonld_type: Option<String>,
/// Byte length of the extracted text; `0` when the URL was classified
/// from its shape alone, and the raw body length when the body is a feed.
pub content_length: usize,
/// Whether the page contained one of the bot-protection marker strings.
pub has_bot_protection: bool,
/// Optional human-readable explanation of the recommendation.
pub message: Option<String>,
}
/// Fetches content with the requested engine.
///
/// `headers` are forwarded to the HTTP and RSS fetchers only: the Playwright
/// fetcher receives just `url`, and the shell engine runs its own command
/// without using `url` at all.
///
/// # Errors
///
/// Propagates whatever error the selected fetcher returns.
pub fn fetch(url: &str, engine: Engine, headers: &HashMap<String, String>) -> Result<PageContent> {
    match &engine {
        Engine::Shell { command } => fetch_shell(command),
        Engine::Rss => fetch_rss(url, headers),
        Engine::Playwright => fetch_playwright(url),
        Engine::Http => fetch_http(url, headers),
    }
}
/// Runs `command` through `sh -c` and returns its stdout as page content.
///
/// The stdout (lossily decoded as UTF-8) is stored in both `html` and `text`;
/// the URL is `shell://<command>` and the title is the command shortened to
/// 50 bytes.
///
/// # Errors
///
/// Returns an I/O-derived error if the shell cannot be spawned, and
/// `KtoError::ConfigError` carrying the trimmed stderr when the command exits
/// unsuccessfully.
fn fetch_shell(command: &str) -> Result<PageContent> {
    let output = Command::new("sh").arg("-c").arg(command).output()?;

    if output.status.success() {
        let stdout = String::from_utf8_lossy(&output.stdout).to_string();
        return Ok(PageContent {
            url: format!("shell://{}", command),
            title: Some(format!("Shell: {}", truncate_command(command, 50))),
            html: stdout.clone(),
            text: Some(stdout),
        });
    }

    let stderr = String::from_utf8_lossy(&output.stderr);
    Err(KtoError::ConfigError(format!(
        "Shell command failed: {}",
        stderr.trim()
    )))
}
/// Shortens `cmd` for display so the result is at most `max_len` bytes.
///
/// Commands that already fit are returned unchanged. Longer ones are cut to
/// at most `max_len - 3` bytes and suffixed with `...`. The cut is moved back
/// to the nearest UTF-8 character boundary, so a command containing
/// multi-byte characters can never cause a slicing panic.
fn truncate_command(cmd: &str, max_len: usize) -> String {
    if cmd.len() <= max_len {
        return cmd.to_string();
    }

    // `end` is strictly less than `cmd.len()` here, and offset 0 is always a
    // boundary, so the loop terminates without underflow.
    let mut end = max_len.saturating_sub(3);
    while !cmd.is_char_boundary(end) {
        end -= 1;
    }

    format!("{}...", &cmd[..end])
}
/// Performs a GET request for `url` through the shared agent.
///
/// Caller-supplied `headers` are added first, then the tool's own
/// `User-Agent`. The returned content carries the response URI and the body
/// as `html`; `title` and `text` are left empty.
///
/// # Errors
///
/// Propagates request failures and body-read failures.
fn fetch_http(url: &str, headers: &HashMap<String, String>) -> Result<PageContent> {
    let request = headers
        .iter()
        .fold(HTTP_AGENT.get(url), |req, (name, value)| {
            req.header(name, value)
        })
        .header(
            "User-Agent",
            "Mozilla/5.0 (compatible; kto/0.1; +https://github.com/devinbernosky/kto)",
        );

    let response = request.call()?;
    // Capture the URI before the response is consumed by reading the body.
    let landed_at = response.get_uri().to_string();
    let body = response.into_body().read_to_string()?;

    Ok(PageContent {
        url: landed_at,
        title: None,
        html: body,
        text: None,
    })
}
/// Renders `url` by running the bundled render script under `node`.
///
/// The script is written to the data directory first if it is missing. It is
/// invoked with the URL and `DEFAULT_PLAYWRIGHT_TIMEOUT_MS` as arguments, with
/// the data directory as working directory, and is expected to print a JSON
/// object with `url`, `title`, `html` and `text` fields on stdout.
///
/// # Errors
///
/// Returns `KtoError::PlaywrightError` when the process exits unsuccessfully:
/// the message is the `error` field of stderr when stderr parses as JSON, and
/// the raw stderr otherwise. Spawn failures and malformed stdout JSON are
/// propagated as well.
fn fetch_playwright(url: &str) -> Result<PageContent> {
    let data_dir = Config::data_dir()?;
    let script_path = get_render_script_path()?;
    if !script_path.exists() {
        ensure_render_script()?;
    }

    let mut node = Command::new("node");
    node.arg(&script_path)
        .arg(url)
        .arg(DEFAULT_PLAYWRIGHT_TIMEOUT_MS.to_string())
        .current_dir(&data_dir);
    let output = node.output()?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let message = match serde_json::from_str::<serde_json::Value>(&stderr) {
            Ok(report) => report["error"]
                .as_str()
                .unwrap_or("unknown error")
                .to_string(),
            Err(_) => stderr.to_string(),
        };
        return Err(KtoError::PlaywrightError(message));
    }

    let stdout = String::from_utf8_lossy(&output.stdout);
    let rendered: serde_json::Value = serde_json::from_str(&stdout)?;
    // Reads a string field from the render output, if present.
    let field = |key: &str| rendered[key].as_str().map(String::from);

    Ok(PageContent {
        url: field("url").unwrap_or_else(|| url.to_string()),
        title: field("title"),
        html: field("html").unwrap_or_default(),
        text: field("text"),
    })
}
/// Returns the location of the render script: `render.mjs` inside the
/// configured data directory.
///
/// # Errors
///
/// Propagates the error from `Config::data_dir`.
fn get_render_script_path() -> Result<std::path::PathBuf> {
    let script = Config::data_dir()?.join("render.mjs");
    Ok(script)
}
/// Writes the render script embedded at compile time to its on-disk location,
/// creating the parent directory when needed.
///
/// An existing file at that path is overwritten.
///
/// # Errors
///
/// Propagates failures to resolve the path, create the directory, or write
/// the file.
pub fn ensure_render_script() -> Result<()> {
    let target = get_render_script_path()?;

    if let Some(dir) = target.parent() {
        std::fs::create_dir_all(dir)?;
    }

    std::fs::write(&target, include_str!("../assets/render.mjs"))?;
    Ok(())
}
/// Reports how far the local Playwright setup has got.
///
/// Checks, in order: `node --version` succeeds, `npx playwright --version`
/// succeeds, and at least one of the known browser cache directories exists.
/// The first failing check determines the returned status.
pub fn check_playwright() -> PlaywrightStatus {
    // A tool counts as available only if it can be spawned and exits successfully.
    let runs_ok = |program: &str, args: &[&str]| {
        Command::new(program)
            .args(args)
            .output()
            .map(|out| out.status.success())
            .unwrap_or(false)
    };

    if !runs_ok("node", &["--version"]) {
        return PlaywrightStatus::NodeMissing;
    }
    if !runs_ok("npx", &["playwright", "--version"]) {
        return PlaywrightStatus::PlaywrightMissing;
    }

    let browsers_installed = get_browser_paths()
        .iter()
        .any(|dir| std::path::Path::new(dir).exists());

    if browsers_installed {
        PlaywrightStatus::Ready
    } else {
        PlaywrightStatus::BrowserMissing
    }
}
/// Lists the directories checked for installed Playwright browsers.
///
/// Both candidates are rooted at `$HOME` (an empty string when the variable
/// is unset): `.cache/ms-playwright` and `Library/Caches/ms-playwright`.
fn get_browser_paths() -> Vec<String> {
    let home = std::env::var("HOME").unwrap_or_default();

    [".cache/ms-playwright", "Library/Caches/ms-playwright"]
        .iter()
        .map(|suffix| format!("{}/{}", home, suffix))
        .collect()
}
/// Result of checking the local Playwright installation.
#[derive(Debug, Clone, PartialEq)]
pub enum PlaywrightStatus {
    /// Node, Playwright and a browser cache directory were all found.
    Ready,
    /// `node --version` could not be run successfully.
    NodeMissing,
    /// `npx playwright --version` could not be run successfully.
    PlaywrightMissing,
    /// None of the known browser cache directories exists.
    BrowserMissing,
}

impl PlaywrightStatus {
    /// Returns `true` only for [`PlaywrightStatus::Ready`].
    pub fn is_ready(&self) -> bool {
        *self == Self::Ready
    }

    /// Returns a one-line hint describing how to resolve this status.
    pub fn install_instructions(&self) -> &'static str {
        match *self {
            Self::NodeMissing => "Install Node.js: https://nodejs.org/",
            Self::PlaywrightMissing => "Run: npm install -g playwright",
            Self::BrowserMissing => "Run: npx playwright install chromium",
            Self::Ready => "Playwright is ready",
        }
    }
}
/// Chooses between plain HTTP and Playwright for already-fetched content.
///
/// Playwright is chosen when extraction produced nothing, when the HTML is
/// shorter than 100 bytes, or when the lower-cased HTML contains one of the
/// challenge-page marker strings. Everything else stays on HTTP.
pub fn decide_engine(content: &PageContent, extraction_empty: bool) -> Engine {
    const CHALLENGE_MARKERS: [&str; 4] = [
        "cloudflare",
        "captcha",
        "please enable javascript",
        "browser check",
    ];

    let too_thin = extraction_empty || content.html.len() < 100;
    if too_thin {
        return Engine::Playwright;
    }

    let haystack = content.html.to_lowercase();
    let challenged = CHALLENGE_MARKERS
        .iter()
        .any(|marker| haystack.contains(*marker));

    if challenged {
        Engine::Playwright
    } else {
        Engine::Http
    }
}
/// Downloads a feed and renders its entries as a stable text listing.
///
/// Each entry becomes an `[ITEM guid="..."] ... [/ITEM]` block with its
/// title, optional publication (or update) time, first link, and a summary of
/// at most 500 characters with markup stripped. Blocks are stored in a map
/// keyed by entry id, so the output is ordered by id and a repeated id keeps
/// only its last entry. The raw XML is returned as `html` and the listing as
/// `text`.
///
/// # Errors
///
/// Propagates request and body-read failures, and returns
/// `KtoError::FeedParseError` when the body is not a parseable feed.
fn fetch_rss(url: &str, headers: &HashMap<String, String>) -> Result<PageContent> {
    use feed_rs::parser;
    use std::collections::BTreeMap;

    let mut request = HTTP_AGENT.get(url);
    for (name, value) in headers {
        request = request.header(name, value);
    }
    let response = request
        .header(
            "User-Agent",
            "Mozilla/5.0 (compatible; kto/0.1; +https://github.com/devinbernosky/kto)",
        )
        .call()?;
    let final_url = response.get_uri().to_string();
    let xml = response.into_body().read_to_string()?;

    let feed = match parser::parse(xml.as_bytes()) {
        Ok(parsed) => parsed,
        Err(e) => {
            return Err(KtoError::FeedParseError(format!(
                "Failed to parse feed: {}",
                e
            )))
        }
    };

    let mut blocks: BTreeMap<String, String> = BTreeMap::new();
    for entry in &feed.entries {
        let title = match &entry.title {
            Some(t) => t.content.clone(),
            None => "(no title)".to_string(),
        };

        // Prefer the explicit summary; fall back to the content body.
        let summary = match &entry.summary {
            Some(s) => Some(truncate_text(&strip_html_tags(&s.content), 500)),
            None => entry
                .content
                .as_ref()
                .and_then(|c| c.body.as_ref())
                .map(|b| truncate_text(&strip_html_tags(b), 500)),
        };

        let mut lines = vec![
            format!("[ITEM guid=\"{}\"]", entry.id),
            format!("Title: {}", title),
        ];
        if let Some(when) = entry.published.or(entry.updated) {
            lines.push(format!("Published: {}", when.to_rfc3339()));
        }
        if let Some(link) = entry.links.first() {
            lines.push(format!("Link: {}", link.href));
        }
        if let Some(s) = &summary {
            lines.push(format!("Summary: {}", s));
        }
        lines.push("[/ITEM]".to_string());

        blocks.insert(entry.id.clone(), lines.join("\n"));
    }

    let formatted_text = blocks.into_values().collect::<Vec<_>>().join("\n\n");
    let title = feed.title.map(|t| t.content);

    Ok(PageContent {
        url: final_url,
        title,
        html: xml,
        text: Some(formatted_text),
    })
}
/// Converts an HTML fragment to plain text.
///
/// Every tag is replaced by a space, a small set of common character
/// references (`&nbsp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&apos;`,
/// `&amp;`) is decoded, and whitespace runs are collapsed to single spaces
/// with the ends trimmed.
fn strip_html_tags(html: &str) -> String {
    let without_tags = HTML_TAG_RE.replace_all(html, " ");
    let decoded = without_tags
        .replace("&nbsp;", " ")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        // `&amp;` must be decoded last: doing it first would turn an escaped
        // reference such as `&amp;lt;` into `&lt;` and then into `<`.
        .replace("&amp;", "&");
    WHITESPACE_RE.replace_all(&decoded, " ").trim().to_string()
}
/// Shortens `text` to at most `max_chars` characters, preferring a word
/// boundary.
///
/// Text that already fits (measured in characters, not bytes) is returned
/// unchanged. Otherwise the first `max_chars` characters are kept, cut back
/// to the last space within them when there is one, and `...` is appended.
fn truncate_text(text: &str, max_chars: usize) -> String {
    // Byte offset of the first character beyond the budget; `None` means the
    // whole text is within `max_chars` characters.
    let cut = match text.char_indices().nth(max_chars) {
        Some((offset, _)) => offset,
        None => return text.to_string(),
    };

    let head = &text[..cut];
    match head.rfind(' ') {
        Some(space) => format!("{}...", &head[..space]),
        None => format!("{}...", head),
    }
}
/// Guesses from the URL alone whether it points at an RSS/Atom feed.
///
/// The comparison is case-insensitive. A URL matches when it contains
/// `/feed`, `/rss`, `/atom` or `rss.` anywhere, or ends with `.xml`, `.rss`
/// or `.atom`.
pub fn detect_rss_url(url: &str) -> bool {
    const ANYWHERE: [&str; 4] = ["/feed", "/rss", "/atom", "rss."];
    const SUFFIXES: [&str; 3] = [".xml", ".rss", ".atom"];

    let lower = url.to_lowercase();
    ANYWHERE.iter().any(|needle| lower.contains(*needle))
        || SUFFIXES.iter().any(|suffix| lower.ends_with(*suffix))
}
/// Guesses from a response body whether it is an RSS/Atom feed.
///
/// Only the first 500 characters are examined, case-insensitively, for an
/// `<rss` tag, a `<feed` tag, or the Atom namespace declaration.
pub fn detect_rss_content(body: &str) -> bool {
    let head = body.chars().take(500).collect::<String>().to_lowercase();

    [
        "<rss",
        "<feed",
        "xmlns=\"http://www.w3.org/2005/atom\"",
    ]
    .iter()
    .any(|marker| head.contains(*marker))
}
/// Inspects `url` and recommends an engine for watching it.
///
/// A URL that looks like a feed is classified as RSS without any request.
/// Otherwise the page is fetched over plain HTTP (no extra headers); a body
/// that looks like a feed is classified as RSS, and anything else is parsed
/// as HTML to collect a feed link, JSON-LD types, bot-protection markers and
/// the length of the readable text, which together drive the recommendation.
///
/// # Errors
///
/// Propagates the HTTP fetch error.
pub fn probe_url(url: &str) -> Result<ProbeResult> {
    use scraper::Html;

    // Both feed shortcuts share everything except the length and the note.
    let feed_result = |content_length: usize, note: &str| ProbeResult {
        suggested_engine: Engine::Rss,
        rss_url: Some(url.to_string()),
        has_jsonld: false,
        jsonld_type: None,
        content_length,
        has_bot_protection: false,
        message: Some(note.to_string()),
    };

    if detect_rss_url(url) {
        return Ok(feed_result(0, "URL appears to be an RSS/Atom feed"));
    }

    let headers = HashMap::new();
    let content = fetch_http(url, &headers)?;

    if detect_rss_content(&content.html) {
        return Ok(feed_result(content.html.len(), "Content is RSS/Atom feed"));
    }

    let document = Html::parse_document(&content.html);
    let rss_url = find_rss_link(&document, &content.url);
    let (has_jsonld, jsonld_type) = detect_jsonld(&document);

    let html_lower = content.html.to_lowercase();
    let has_bot_protection = [
        "cloudflare",
        "cf-ray",
        "captcha",
        "please enable javascript",
        "browser check",
        "checking your browser",
    ]
    .iter()
    .any(|marker| html_lower.contains(*marker));

    // Prefer the readability extraction; fall back to counting body text
    // whenever the extractor cannot be built or cannot parse the page.
    let content_length = match readability_js::Readability::new() {
        Ok(reader) => match reader.parse(&content.html) {
            Ok(article) => article.text_content.trim().len(),
            Err(_) => extract_visible_text(&document),
        },
        Err(_) => extract_visible_text(&document),
    };

    let (suggested_engine, message) = determine_engine(
        content_length,
        has_bot_protection,
        &rss_url,
        has_jsonld,
        &jsonld_type,
    );

    Ok(ProbeResult {
        suggested_engine,
        rss_url,
        has_jsonld,
        jsonld_type,
        content_length,
        has_bot_protection,
        message,
    })
}
/// Returns the first RSS or Atom feed advertised by the document through a
/// `<link rel="alternate">` tag, resolved against `base_url`.
///
/// Returns `None` when there is no such tag, the tag has no `href`, or the
/// `href` cannot be resolved.
fn find_rss_link(document: &scraper::Html, base_url: &str) -> Option<String> {
    use scraper::Selector;

    let selector = match Selector::parse(
        r#"link[rel="alternate"][type="application/rss+xml"], link[rel="alternate"][type="application/atom+xml"]"#
    ) {
        Ok(parsed) => parsed,
        Err(_) => return None,
    };

    let first_link = document.select(&selector).next()?;
    let href = first_link.value().attr("href")?;
    resolve_url(base_url, href)
}
/// Resolves `relative` against `base`.
///
/// A reference that already starts with `http://` or `https://` is returned
/// verbatim. Anything else is joined onto the parsed `base`; `None` is
/// returned when `base` does not parse or the join fails.
fn resolve_url(base: &str, relative: &str) -> Option<String> {
    let already_absolute = ["http://", "https://"]
        .iter()
        .any(|scheme| relative.starts_with(*scheme));
    if already_absolute {
        return Some(relative.to_string());
    }

    url::Url::parse(base)
        .and_then(|parsed_base| parsed_base.join(relative))
        .ok()
        .map(|resolved| resolved.to_string())
}
/// Looks for JSON-LD structured data in the document.
///
/// Only the first `application/ld+json` script whose text parses as JSON is
/// considered. Returns `(true, Some(types))` with its `@type` values joined
/// by `", "`, `(true, None)` when it declares no types, and `(false, None)`
/// when no script parses.
fn detect_jsonld(document: &scraper::Html) -> (bool, Option<String>) {
    use scraper::Selector;

    let selector = match Selector::parse(r#"script[type="application/ld+json"]"#) {
        Ok(parsed) => parsed,
        Err(_) => return (false, None),
    };

    let first_valid = document.select(&selector).find_map(|script| {
        let raw: String = script.text().collect();
        serde_json::from_str::<serde_json::Value>(&raw).ok()
    });

    match first_valid {
        None => (false, None),
        Some(json) => {
            let types = extract_jsonld_types(&json);
            let label = if types.is_empty() {
                None
            } else {
                Some(types.join(", "))
            };
            (true, label)
        }
    }
}
fn extract_jsonld_types(json: &serde_json::Value) -> Vec<String> {
let mut types = Vec::new();
match json {
serde_json::Value::Object(map) => {
if let Some(t) = map.get("@type") {
match t {
serde_json::Value::String(s) => types.push(s.clone()),
serde_json::Value::Array(arr) => {
for item in arr {
if let serde_json::Value::String(s) = item {
types.push(s.clone());
}
}
}
_ => {}
}
}
if let Some(serde_json::Value::Array(graph)) = map.get("@graph") {
for item in graph {
types.extend(extract_jsonld_types(item));
}
}
}
serde_json::Value::Array(arr) => {
for item in arr {
types.extend(extract_jsonld_types(item));
}
}
_ => {}
}
types
}
/// Measures the text inside the document's `<body>`.
///
/// The body's text nodes are joined with spaces, whitespace runs are
/// collapsed, and the byte length of the trimmed result is returned. Returns
/// `0` when the document has no body.
fn extract_visible_text(document: &scraper::Html) -> usize {
    use scraper::Selector;

    let selector = match Selector::parse("body") {
        Ok(parsed) => parsed,
        Err(_) => return 0,
    };

    match document.select(&selector).next() {
        Some(body) => {
            let joined = body.text().collect::<Vec<_>>().join(" ");
            let length = WHITESPACE_RE.replace_all(&joined, " ").trim().len();
            length
        }
        None => 0,
    }
}
/// Turns the probe signals into an engine recommendation and an optional note.
///
/// Fewer than 300 bytes of content, or any bot-protection marker, leads to
/// Playwright with an explanatory message (the content check wins when both
/// apply). Otherwise HTTP is recommended, and the message lists whichever of
/// "RSS feed available" and the JSON-LD findings apply, joined by `", "`.
fn determine_engine(
    content_length: usize,
    has_bot_protection: bool,
    rss_url: &Option<String>,
    has_jsonld: bool,
    jsonld_type: &Option<String>,
) -> (Engine, Option<String>) {
    if content_length < 300 {
        let reason = "Page has minimal content - likely requires JavaScript".to_string();
        return (Engine::Playwright, Some(reason));
    }
    if has_bot_protection {
        let reason = "Bot protection detected - JavaScript rendering recommended".to_string();
        return (Engine::Playwright, Some(reason));
    }

    let rss_note = rss_url
        .as_ref()
        .map(|_| "RSS feed available".to_string());
    let jsonld_note = if has_jsonld {
        Some(match jsonld_type {
            Some(kind) => format!("JSON-LD: {}", kind),
            None => "JSON-LD structured data found".to_string(),
        })
    } else {
        None
    };

    let notes: Vec<String> = rss_note.into_iter().chain(jsonld_note).collect();
    let message = if notes.is_empty() {
        None
    } else {
        Some(notes.join(", "))
    };

    (Engine::Http, message)
}
/// A feed found by `discover_feeds`.
#[derive(Debug, Clone)]
pub struct DiscoveredFeed {
/// Absolute URL of the feed.
pub url: String,
/// The `title` attribute of the advertising `<link>` tag, when there is
/// one; always `None` for feeds found by guessing a path.
pub title: Option<String>,
/// One of `"rss"`, `"atom"` or `"json"`.
pub feed_type: String,
/// How the feed was found: `"link-tag"` or `"common-path"`.
pub discovery_method: String,
}
/// Paths probed by `discover_feeds` when guessing feed locations.
///
/// Entries starting with `/.` are appended to the page's current path; all
/// other entries replace the path outright.
const COMMON_FEED_PATHS: &[&str] = &[
"/feed",
"/rss",
"/atom",
"/feed.xml",
"/rss.xml",
"/atom.xml",
"/index.xml",
"/feed.rss",
"/blog/feed",
"/blog/rss",
"/news/rss",
"/posts.atom",
"/.rss",
];
/// Finds feeds associated with a page.
///
/// Two strategies are used, in this order:
///
/// 1. `<link rel="alternate">` tags in `html` advertising RSS, Atom or JSON
///    feeds; their `href`s are resolved against `base_url`.
/// 2. Each entry of `COMMON_FEED_PATHS`, probed over HTTP on the host of
///    `base_url`. Paths starting with `/.` are appended to the page's current
///    path; the others replace it. URLs already found via link tags are not
///    probed again.
///
/// The page's own query string and fragment are dropped from guessed URLs:
/// they belong to the page rather than to the guessed feed path, and leaving
/// them in also stops the duplicate check from matching link-tag results.
///
/// Note that the second strategy performs one blocking HTTP request per
/// candidate path. It is skipped entirely when `base_url` does not parse.
pub fn discover_feeds(base_url: &str, html: &str) -> Vec<DiscoveredFeed> {
    let mut feeds = Vec::new();
    let document = scraper::Html::parse_document(html);

    if let Ok(selector) = scraper::Selector::parse(
        r#"link[rel="alternate"][type="application/rss+xml"],
link[rel="alternate"][type="application/atom+xml"],
link[rel="alternate"][type="application/feed+json"]"#
    ) {
        for el in document.select(&selector) {
            if let Some(href) = el.value().attr("href") {
                if let Some(resolved) = resolve_url(base_url, href) {
                    let feed_type = el
                        .value()
                        .attr("type")
                        .map(|t| {
                            if t.contains("atom") {
                                "atom"
                            } else if t.contains("json") {
                                "json"
                            } else {
                                "rss"
                            }
                        })
                        .unwrap_or("rss");
                    let title = el.value().attr("title").map(String::from);
                    feeds.push(DiscoveredFeed {
                        url: resolved,
                        title,
                        feed_type: feed_type.to_string(),
                        discovery_method: "link-tag".to_string(),
                    });
                }
            }
        }
    }

    if let Ok(parsed_base) = url::Url::parse(base_url) {
        for path in COMMON_FEED_PATHS {
            let mut probe_url = parsed_base.clone();
            // `set_path` leaves query and fragment untouched, so clear them
            // explicitly before building the candidate.
            probe_url.set_query(None);
            probe_url.set_fragment(None);

            if path.starts_with("/.") {
                // Suffix-style path: append to the page's current path.
                let current = probe_url.path().to_string();
                probe_url.set_path(&format!("{}{}", current.trim_end_matches('/'), path));
            } else {
                probe_url.set_path(path);
            }

            let url_str = probe_url.to_string();
            if feeds.iter().any(|f| f.url == url_str) {
                continue;
            }

            if let Ok(true) = probe_is_feed(&url_str) {
                let feed_type = if path.contains("atom") {
                    "atom"
                } else if path.contains("json") {
                    "json"
                } else {
                    "rss"
                };
                feeds.push(DiscoveredFeed {
                    url: url_str,
                    title: None,
                    feed_type: feed_type.to_string(),
                    discovery_method: "common-path".to_string(),
                });
            }
        }
    }

    feeds
}
/// Requests `url` and reports whether the response looks like a feed.
///
/// A `content-type` header containing `xml`, `rss`, `atom` or `feed` is
/// accepted immediately; otherwise the body is sniffed with
/// `detect_rss_content`. Request failures and unreadable bodies are reported
/// as `Ok(false)`, so this function never returns `Err` in practice.
fn probe_is_feed(url: &str) -> Result<bool> {
    let outcome = HTTP_AGENT
        .get(url)
        .header("User-Agent", "Mozilla/5.0 (compatible; kto/0.1)")
        .call();
    let response = match outcome {
        Ok(resp) => resp,
        Err(_) => return Ok(false),
    };

    let feed_like_type = response
        .headers()
        .get("content-type")
        .and_then(|value| value.to_str().ok())
        .map(|ct| {
            ["xml", "rss", "atom", "feed"]
                .iter()
                .any(|kind| ct.contains(*kind))
        })
        .unwrap_or(false);
    if feed_like_type {
        return Ok(true);
    }

    let sniffed = response
        .into_body()
        .read_to_string()
        .map(|body| detect_rss_content(&body))
        .unwrap_or(false);
    Ok(sniffed)
}
/// Guesses which platform produced a page.
///
/// Checks are applied in a fixed order and the first match wins:
///
/// 1. Platform marker strings in the lower-cased HTML (Shopify, WordPress,
///    Wix, Squarespace, Webflow, BigCommerce, WooCommerce, Magento).
/// 2. The URL's host being exactly `github.com` or `gitlab.com`.
/// 3. The first `<meta name="generator">` tag, whose first word is reported
///    as `"<word> site"`.
///
/// Returns `None` when nothing matches.
pub fn detect_site_type(url: &str, html: &str) -> Option<String> {
    // Order matters: earlier rows take precedence over later ones.
    const PLATFORM_MARKERS: &[(&[&str], &str)] = &[
        (&["shopify", "cdn.shopify.com", "myshopify.com"], "Shopify store"),
        (&["wp-content", "wordpress", "wp-json"], "WordPress site"),
        (&["wix.com", "wixstatic.com"], "Wix site"),
        (&["squarespace", "sqsp.net"], "Squarespace site"),
        (&["webflow.com"], "Webflow site"),
        (&["bigcommerce", "bc-sf-filter"], "BigCommerce store"),
        (&["woocommerce", "wc-product"], "WooCommerce store"),
        (&["magento", "mage-init"], "Magento store"),
    ];

    let lower_html = html.to_lowercase();
    for (markers, label) in PLATFORM_MARKERS {
        if markers.iter().any(|marker| lower_html.contains(*marker)) {
            return Some((*label).to_string());
        }
    }

    if let Ok(parsed) = url::Url::parse(url) {
        match parsed.host_str() {
            Some("github.com") => return Some("GitHub repository".to_string()),
            Some("gitlab.com") => return Some("GitLab repository".to_string()),
            _ => {}
        }
    }

    let selector = scraper::Selector::parse("meta[name='generator']").ok()?;
    let document = scraper::Html::parse_document(html);
    let generator = document.select(&selector).next()?.value().attr("content")?;
    let label = format!(
        "{} site",
        generator.split_whitespace().next().unwrap_or("Unknown")
    );
    Some(label)
}
// Unit tests for the pure helpers in this module (status hints, URL
// resolution, feed detection heuristics, site-type detection).
#[cfg(test)]
mod tests {
use super::*;
// A missing browser is not "ready" and the hint points at `npx playwright`.
#[test]
fn test_playwright_status() {
let status = PlaywrightStatus::BrowserMissing;
assert!(!status.is_ready());
assert!(status.install_instructions().contains("npx playwright"));
}
// An absolute reference is returned as-is, ignoring the base.
#[test]
fn test_resolve_url_absolute() {
let result = resolve_url("https://example.com", "https://other.com/feed");
assert_eq!(result, Some("https://other.com/feed".to_string()));
}
// A root-relative reference replaces the base's whole path.
#[test]
fn test_resolve_url_relative_path() {
let result = resolve_url("https://example.com/blog/post", "/feed.xml");
assert_eq!(result, Some("https://example.com/feed.xml".to_string()));
}
// A bare file name resolves inside the base's directory.
#[test]
fn test_resolve_url_relative_no_slash() {
let result = resolve_url("https://example.com/blog/", "feed.xml");
assert_eq!(result, Some("https://example.com/blog/feed.xml".to_string()));
}
// `..` climbs one directory from the base's location.
#[test]
fn test_resolve_url_dot_relative() {
let result = resolve_url("https://example.com/blog/post", "../feed.xml");
assert_eq!(result, Some("https://example.com/feed.xml".to_string()));
}
// URL-shape heuristics: path markers, `.xml` suffix, and `rss.` in the host.
#[test]
fn test_detect_rss_url() {
assert!(detect_rss_url("https://example.com/feed.xml"));
assert!(detect_rss_url("https://example.com/feed"));
assert!(detect_rss_url("https://example.com/rss"));
assert!(detect_rss_url("https://rss.example.com/news"));
assert!(!detect_rss_url("https://example.com/page"));
}
// Body sniffing recognises RSS and Atom roots but not plain HTML.
#[test]
fn test_detect_rss_content() {
assert!(detect_rss_content("<?xml version=\"1.0\"?><rss version=\"2.0\">"));
assert!(detect_rss_content("<feed xmlns=\"http://www.w3.org/2005/Atom\">"));
assert!(!detect_rss_content("<html><head></head><body></body></html>"));
}
// Link-tag discovery. NOTE(review): `discover_feeds` also probes the common
// feed paths over HTTP, so this test reaches out to example.com and the
// `len() == 2` assertion assumes none of those probes looks like a feed.
#[test]
fn test_discover_feeds_from_link_tags() {
let html = r#"
<html>
<head>
<link rel="alternate" type="application/rss+xml" href="/feed.xml" title="RSS Feed">
<link rel="alternate" type="application/atom+xml" href="https://example.com/atom.xml">
</head>
<body></body>
</html>
"#;
let feeds = discover_feeds("https://example.com/page", html);
assert_eq!(feeds.len(), 2);
assert_eq!(feeds[0].feed_type, "rss");
assert_eq!(feeds[0].discovery_method, "link-tag");
assert_eq!(feeds[1].feed_type, "atom");
}
// A Shopify CDN reference in the markup identifies a Shopify store.
#[test]
fn test_detect_site_type_shopify() {
let html = r#"<html><head><script src="https://cdn.shopify.com/s/files/1/0123/shop.js"></script></head></html>"#;
let site_type = detect_site_type("https://example.com", html);
assert_eq!(site_type, Some("Shopify store".to_string()));
}
// A `wp-content` asset path identifies a WordPress site.
#[test]
fn test_detect_site_type_wordpress() {
let html = r#"<html><head><link rel="stylesheet" href="/wp-content/themes/style.css"></head></html>"#;
let site_type = detect_site_type("https://example.com", html);
assert_eq!(site_type, Some("WordPress site".to_string()));
}
// With no markers in the HTML, the github.com host decides the result.
#[test]
fn test_detect_site_type_github() {
let site_type = detect_site_type("https://github.com/user/repo", "<html></html>");
assert_eq!(site_type, Some("GitHub repository".to_string()));
}
}