// sparrow-intel 0.9.2

use anyhow::{Context, Result};
use chrono::Utc;
use rusqlite::{Connection, OptionalExtension, params};
use serde::{Deserialize, Serialize};
use sha2::{Digest as ShaDigest, Sha256};
use std::path::Path;
use std::time::Duration;

/// Kind of upstream a configured source points at.
///
/// Serialized in `snake_case`, so a sources file spells the variants as
/// `github_releases`, `changelog_url` and `docs_url`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum SourceKind {
    /// A GitHub repository whose releases are listed through the GitHub API.
    GithubReleases,
    /// A changelog document downloaded as text from the source URL.
    ChangelogUrl,
    /// A documentation page downloaded as text from the source URL.
    DocsUrl,
}

/// One entry of the intel sources file: where to look and how to label it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourceConfig {
    /// Label of the source; copied into `IntelItem::source` for fetched items.
    pub name: String,
    /// How the source is fetched (see [`SourceKind`]).
    pub kind: SourceKind,
    /// Repository or document URL to fetch.
    pub url: String,
    /// Tags copied onto every item fetched from this source; empty when omitted.
    #[serde(default)]
    pub tags: Vec<String>,
}

/// Top-level shape of the intel sources file parsed by `load_sources_file`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SourcesFile {
    /// The configured sources (a `[[source]]` array of tables in TOML);
    /// empty when the key is absent.
    #[serde(default)]
    pub source: Vec<SourceConfig>,
}

/// A single fetched piece of intel, as stored in the `intel_items` table.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IntelItem {
    /// Name of the source this item came from.
    pub source: String,
    /// Release tag for GitHub releases, or a short content hash for text URLs.
    pub version: String,
    /// Publication timestamp reported by GitHub (possibly empty), or the
    /// RFC 3339 fetch time for text URLs.
    pub date: String,
    /// Release name plus notes, or the full text of the fetched document.
    pub body: String,
    /// Link to the release page or the fetched document.
    pub url: String,
    /// `ETag` response header when one was returned; `None` for GitHub releases.
    pub etag: Option<String>,
    /// Tags inherited from the source configuration.
    pub tags: Vec<String>,
}

/// Condensed view of an [`IntelItem`], produced by `IntelCache::digests`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IntelDigest {
    /// Name of the source the item came from.
    pub source: String,
    /// First non-blank line of the item body, shortened by `summarize` with a
    /// limit of 90; falls back to the version when the body has no such line.
    pub title: String,
    /// Whitespace-collapsed item body, shortened by `summarize` with a limit of 260.
    pub summary: String,
    /// Version of the underlying item.
    pub version: String,
    /// Date of the underlying item.
    pub date: String,
    /// URL of the underlying item.
    pub url: String,
    /// Tags of the underlying item.
    pub tags: Vec<String>,
}

/// A follow-up suggestion derived from a digest by `IntelCache::backlog`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BacklogTicket {
    /// Ticket title, built from the digest's source and version.
    pub title: String,
    /// Name of the source the digest came from.
    pub source: String,
    /// Keyword-based relevance score from `score_digest`, capped at 100.
    pub score: u32,
    /// Comma-separated labels of the signals that contributed to the score.
    pub reason: String,
    /// URL of the underlying item.
    pub url: String,
    /// Tags of the underlying item.
    pub tags: Vec<String>,
}

/// Outcome of one `scan_sources` run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanReport {
    /// Number of sources that were passed to the scan.
    pub scanned: usize,
    /// Row count reported by the cache upsert for the fetched items.
    pub inserted_or_updated: usize,
    /// Every item fetched during the scan, in source order.
    pub items: Vec<IntelItem>,
}

/// Reads the TOML sources file at `path` and returns its `source` entries.
///
/// # Errors
///
/// Fails when the file cannot be read or is not valid TOML for
/// [`SourcesFile`]; both errors mention the offending path.
pub fn load_sources_file(path: &Path) -> Result<Vec<SourceConfig>> {
    let contents = std::fs::read_to_string(path)
        .with_context(|| format!("could not read intel sources file {}", path.display()))?;
    toml::from_str::<SourcesFile>(&contents)
        .map(|file| file.source)
        .with_context(|| format!("could not parse intel sources file {}", path.display()))
}

/// Handle to the SQLite-backed intel cache.
///
/// Only the database path is kept; each operation opens its own connection.
pub struct IntelCache {
    /// Location of the SQLite database file.
    path: std::path::PathBuf,
}

impl IntelCache {
    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
        let path = path.as_ref().to_path_buf();
        if let Some(parent) = path.parent() {
            std::fs::create_dir_all(parent)?;
        }
        let cache = Self { path };
        cache.init()?;
        Ok(cache)
    }

    fn conn(&self) -> Result<Connection> {
        Ok(Connection::open(&self.path)?)
    }

    fn init(&self) -> Result<()> {
        let conn = self.conn()?;
        conn.execute_batch(
            "CREATE TABLE IF NOT EXISTS intel_items (
                source TEXT NOT NULL,
                version TEXT NOT NULL,
                date TEXT NOT NULL,
                body TEXT NOT NULL,
                url TEXT NOT NULL,
                etag TEXT,
                tags TEXT NOT NULL DEFAULT '[]',
                updated_at INTEGER NOT NULL,
                UNIQUE(source, version, url)
            );",
        )?;
        Ok(())
    }

    pub fn upsert_items(&self, items: &[IntelItem]) -> Result<usize> {
        let mut conn = self.conn()?;
        let tx = conn.transaction()?;
        let mut changed = 0usize;
        for item in items {
            let tags = serde_json::to_string(&item.tags)?;
            changed += tx.execute(
                "INSERT INTO intel_items (source, version, date, body, url, etag, tags, updated_at)
                 VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, strftime('%s','now'))
                 ON CONFLICT(source, version, url) DO UPDATE SET
                   date=excluded.date,
                   body=excluded.body,
                   etag=excluded.etag,
                   tags=excluded.tags,
                   updated_at=excluded.updated_at",
                params![
                    item.source,
                    item.version,
                    item.date,
                    item.body,
                    item.url,
                    item.etag,
                    tags
                ],
            )?;
        }
        tx.commit()?;
        Ok(changed)
    }

    pub fn items(&self, limit: usize) -> Result<Vec<IntelItem>> {
        let conn = self.conn()?;
        let mut stmt = conn.prepare(
            "SELECT source, version, date, body, url, etag, tags
             FROM intel_items
             ORDER BY date DESC, updated_at DESC
             LIMIT ?1",
        )?;
        let rows = stmt.query_map([limit as i64], row_to_item)?;
        Ok(rows.filter_map(|r| r.ok()).collect())
    }

    pub fn digests(&self, limit: usize) -> Result<Vec<IntelDigest>> {
        Ok(self
            .items(limit)?
            .into_iter()
            .map(|item| IntelDigest {
                title: digest_title(&item),
                summary: summarize(&item.body, 260),
                version: item.version,
                date: item.date,
                url: item.url,
                source: item.source,
                tags: item.tags,
            })
            .collect())
    }

    pub fn backlog(&self, limit: usize) -> Result<Vec<BacklogTicket>> {
        let mut tickets: Vec<_> = self
            .digests(limit.saturating_mul(3).max(limit))?
            .into_iter()
            .map(|digest| {
                let (score, reason) = score_digest(&digest);
                BacklogTicket {
                    title: format!("Évaluer {} {}", digest.source, digest.version),
                    source: digest.source,
                    score,
                    reason,
                    url: digest.url,
                    tags: digest.tags,
                }
            })
            .filter(|t| t.score > 0)
            .collect();
        tickets.sort_by(|a, b| b.score.cmp(&a.score).then_with(|| a.title.cmp(&b.title)));
        tickets.truncate(limit);
        Ok(tickets)
    }

    pub fn etag_for_url(&self, url: &str) -> Result<Option<String>> {
        let conn = self.conn()?;
        Ok(conn
            .query_row(
                "SELECT etag FROM intel_items WHERE url=?1 AND etag IS NOT NULL LIMIT 1",
                [url],
                |row| row.get(0),
            )
            .optional()?)
    }
}

/// Decodes one `intel_items` row, selected in the column order
/// `source, version, date, body, url, etag, tags`, into an [`IntelItem`].
///
/// A `tags` column that is not valid JSON yields an empty tag list instead of
/// an error.
fn row_to_item(row: &rusqlite::Row<'_>) -> rusqlite::Result<IntelItem> {
    let tags: Vec<String> = {
        let encoded: String = row.get(6)?;
        serde_json::from_str(&encoded).unwrap_or_default()
    };
    let source: String = row.get(0)?;
    let version: String = row.get(1)?;
    let date: String = row.get(2)?;
    let body: String = row.get(3)?;
    let url: String = row.get(4)?;
    let etag: Option<String> = row.get(5)?;
    Ok(IntelItem {
        source,
        version,
        date,
        body,
        url,
        etag,
        tags,
    })
}

pub async fn scan_sources(
    sources: &[SourceConfig],
    cache_path: impl AsRef<Path>,
    limit_per_source: usize,
) -> Result<ScanReport> {
    let cache = IntelCache::open(cache_path)?;
    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(20))
        .user_agent(format!("sparrow-intel/{}", env!("CARGO_PKG_VERSION")))
        .build()?;
    let mut all = Vec::new();
    for source in sources {
        let mut items = fetch_source(&client, source, limit_per_source).await?;
        all.append(&mut items);
    }
    let changed = cache.upsert_items(&all)?;
    Ok(ScanReport {
        scanned: sources.len(),
        inserted_or_updated: changed,
        items: all,
    })
}

/// Dispatches to the fetcher matching `source.kind`.
///
/// `limit` only applies to GitHub releases; changelog and docs URLs always
/// produce a single item.
async fn fetch_source(
    client: &reqwest::Client,
    source: &SourceConfig,
    limit: usize,
) -> Result<Vec<IntelItem>> {
    let items = match &source.kind {
        SourceKind::GithubReleases => fetch_github_releases(client, source, limit).await?,
        SourceKind::ChangelogUrl => fetch_text_url(client, source).await?,
        SourceKind::DocsUrl => fetch_text_url(client, source).await?,
    };
    Ok(items)
}

/// Downloads the release list for a GitHub source and converts the first
/// `limit` entries into [`IntelItem`]s.
///
/// Missing JSON fields fall back to: `"release"` for the tag, the tag for the
/// release name, an empty string for notes and date, and the source URL for
/// the link. No `ETag` is recorded for releases.
async fn fetch_github_releases(
    client: &reqwest::Client,
    source: &SourceConfig,
    limit: usize,
) -> Result<Vec<IntelItem>> {
    /// Reads `key` from a release object when it is present and a string.
    fn text<'a>(release: &'a serde_json::Value, key: &str) -> Option<&'a str> {
        release.get(key).and_then(serde_json::Value::as_str)
    }

    let endpoint = github_releases_api(&source.url)?;
    let response = client.get(endpoint).send().await?.error_for_status()?;
    let releases: Vec<serde_json::Value> = response.json().await?;

    let mut items = Vec::with_capacity(releases.len().min(limit));
    for release in releases.iter().take(limit) {
        let tag = text(release, "tag_name").unwrap_or("release");
        let heading = text(release, "name").unwrap_or(tag);
        let notes = text(release, "body").unwrap_or("");
        items.push(IntelItem {
            source: source.name.clone(),
            version: tag.to_string(),
            date: text(release, "published_at").unwrap_or("").to_string(),
            // The release name goes first so it becomes the digest title.
            body: format!("{heading}\n\n{notes}"),
            url: text(release, "html_url")
                .unwrap_or(&source.url)
                .to_string(),
            etag: None,
            tags: source.tags.clone(),
        });
    }
    Ok(items)
}

/// Downloads `source.url` as text and wraps it in a single [`IntelItem`].
///
/// The item's version is a short hash of the body and its date is the fetch
/// time in RFC 3339. The response `ETag` is recorded when present and
/// readable as a string; no conditional request is made here.
async fn fetch_text_url(client: &reqwest::Client, source: &SourceConfig) -> Result<Vec<IntelItem>> {
    let response = client.get(&source.url).send().await?.error_for_status()?;
    let etag = match response.headers().get(reqwest::header::ETAG) {
        Some(value) => value.to_str().ok().map(str::to_string),
        None => None,
    };
    let body = response.text().await?;
    let version = short_hash(&body);
    let fetched_at = Utc::now().to_rfc3339();
    let item = IntelItem {
        source: source.name.clone(),
        version,
        date: fetched_at,
        body,
        url: source.url.clone(),
        etag,
        tags: source.tags.clone(),
    };
    Ok(vec![item])
}

/// Turns a GitHub repository URL into its releases API endpoint.
///
/// Inputs that already mention `api.github.com/repos/` are returned unchanged.
/// Otherwise `raw` must be a `github.com` URL whose first two non-empty path
/// segments are the owner and the repository; extra segments (such as
/// `/releases`) are ignored and a clone-style `.git` suffix is dropped.
///
/// # Errors
///
/// Fails when `raw` is not a valid URL, does not point at `github.com`, or
/// lacks a non-empty owner or repository segment.
fn github_releases_api(raw: &str) -> Result<String> {
    if raw.contains("api.github.com/repos/") {
        return Ok(raw.to_string());
    }
    let parsed = url::Url::parse(raw)
        .with_context(|| format!("invalid github_releases source url {raw}"))?;
    if parsed.host_str() != Some("github.com") {
        anyhow::bail!("github_releases source must point at github.com or api.github.com");
    }
    // Skip empty segments so `https://github.com/owner/` is rejected instead
    // of producing `.../repos/owner//releases`.
    let mut segments = parsed
        .path_segments()
        .into_iter()
        .flatten()
        .filter(|segment| !segment.is_empty());
    let owner = segments.next();
    let repo = segments
        .next()
        .map(|name| name.strip_suffix(".git").unwrap_or(name))
        .filter(|name| !name.is_empty());
    match (owner, repo) {
        (Some(owner), Some(repo)) => Ok(format!(
            "https://api.github.com/repos/{owner}/{repo}/releases"
        )),
        _ => anyhow::bail!("github_releases source must include owner/repo"),
    }
}

/// Formats the SHA-256 digest of `body` as lowercase hex with a precision of
/// 12, giving a short content fingerprint.
fn short_hash(body: &str) -> String {
    let digest = Sha256::digest(body.as_bytes());
    format!("{digest:.12x}")
}

/// Picks a title for `item`: its first non-blank body line, shortened via
/// `summarize` with a limit of 90, or the version when every line is blank.
fn digest_title(item: &IntelItem) -> String {
    for line in item.body.lines() {
        if !line.trim().is_empty() {
            return summarize(line, 90);
        }
    }
    item.version.clone()
}

/// Collapses all whitespace runs in `text` to single spaces and shortens the
/// result to at most `max` characters.
///
/// Text that fits is returned as is. Longer text keeps its first
/// `max - 3` characters followed by `"..."` (so for `max < 3` the result is
/// just `"..."`).
///
/// Lengths are measured in characters, not bytes, so non-ASCII text that
/// already fits is never truncated.
fn summarize(text: &str, max: usize) -> String {
    let mut compact = text.split_whitespace().collect::<Vec<_>>().join(" ");
    // No character at index `max` means the text has at most `max` characters.
    if compact.chars().nth(max).is_none() {
        return compact;
    }
    let keep = max.saturating_sub(3);
    // Byte offset where the kept prefix ends; always on a char boundary.
    let cut = compact
        .char_indices()
        .nth(keep)
        .map_or(compact.len(), |(index, _)| index);
    compact.truncate(cut);
    compact.push_str("...");
    compact
}

/// Scores a digest by keyword hits and explains which signals matched.
///
/// The digest's title, summary, source and tags are lower-cased and searched
/// for each keyword as a substring. Every hit adds its weight; the total is
/// capped at 100. The returned reason lists the matched labels once each, in
/// table order, separated by `", "`.
fn score_digest(digest: &IntelDigest) -> (u32, String) {
    // (keyword, weight, label) — several keywords may share one label.
    const SIGNALS: &[(&str, u32, &str)] = &[
        ("agent", 20, "agentic workflow"),
        ("tool", 14, "tooling/API"),
        ("mcp", 18, "MCP compatibility"),
        ("permission", 14, "permissions"),
        ("sandbox", 14, "sandbox safety"),
        ("approval", 12, "approvals"),
        ("checkpoint", 12, "checkpoint/replay"),
        ("replay", 12, "checkpoint/replay"),
        ("webview", 10, "cockpit UI"),
        ("performance", 10, "performance"),
        ("routing", 10, "model routing"),
        ("memory", 8, "memory/context"),
        ("context", 8, "memory/context"),
        ("release", 6, "release intelligence"),
    ];

    let tag_text = digest.tags.join(" ");
    let haystack = [
        digest.title.as_str(),
        digest.summary.as_str(),
        digest.source.as_str(),
        tag_text.as_str(),
    ]
    .join(" ")
    .to_lowercase();

    let mut total: u32 = 0;
    let mut labels: Vec<&str> = Vec::new();
    for &(keyword, weight, label) in SIGNALS {
        if !haystack.contains(keyword) {
            continue;
        }
        total += weight;
        if !labels.iter().any(|seen| *seen == label) {
            labels.push(label);
        }
    }
    (total.min(100), labels.join(", "))
}

/// Returns the default cache database location: `intel.sqlite` inside
/// `state_dir`.
pub fn default_cache_path(state_dir: &Path) -> std::path::PathBuf {
    let mut location = state_dir.to_path_buf();
    location.push("intel.sqlite");
    location
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fixture item with the given body and an `agent` tag.
    fn item(body: &str) -> IntelItem {
        IntelItem {
            source: String::from("test"),
            version: String::from("v1"),
            date: String::from("2026-06-12T00:00:00Z"),
            body: body.to_owned(),
            url: String::from("https://example.test/release"),
            etag: None,
            tags: vec![String::from("agent")],
        }
    }

    /// One stored item yields one digest and one sufficiently scored ticket.
    #[test]
    fn cache_round_trips_digests_and_backlog() {
        let tmp = tempfile::tempdir().unwrap();
        let db_path = tmp.path().join("intel.sqlite");
        let cache = IntelCache::open(db_path).unwrap();
        let fixture = item("Agent tool sandbox release with replay support");
        cache.upsert_items(&[fixture]).unwrap();

        assert_eq!(cache.digests(10).unwrap().len(), 1);

        let tickets = cache.backlog(10).unwrap();
        assert_eq!(tickets.len(), 1);
        assert!(tickets[0].score >= 40);
    }

    /// A one-entry TOML file parses into a single GitHub-releases source.
    #[test]
    fn parses_sources_file() {
        const SOURCES_TOML: &str = r#"
[[source]]
name = "Codex"
kind = "github_releases"
url = "https://github.com/openai/codex"
tags = ["agent", "cli"]
"#;
        let tmp = tempfile::tempdir().unwrap();
        let file = tmp.path().join("sources.toml");
        std::fs::write(&file, SOURCES_TOML).unwrap();

        let loaded = load_sources_file(&file).unwrap();
        assert_eq!(loaded.len(), 1);
        assert_eq!(loaded[0].kind, SourceKind::GithubReleases);
    }

    /// A plain repository URL maps to its releases API endpoint.
    #[test]
    fn github_api_url_is_derived_from_repo_url() {
        let endpoint = github_releases_api("https://github.com/openai/codex").unwrap();
        assert_eq!(
            endpoint,
            "https://api.github.com/repos/openai/codex/releases"
        );
    }
}