Skip to main content

sparrow_intel/
lib.rs

1use anyhow::{Context, Result};
2use chrono::Utc;
3use rusqlite::{Connection, OptionalExtension, params};
4use serde::{Deserialize, Serialize};
5use sha2::{Digest as ShaDigest, Sha256};
6use std::path::Path;
7use std::time::Duration;
8
9#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
10#[serde(rename_all = "snake_case")]
11pub enum SourceKind {
12    GithubReleases,
13    ChangelogUrl,
14    DocsUrl,
15}
16
17#[derive(Debug, Clone, Serialize, Deserialize)]
18pub struct SourceConfig {
19    pub name: String,
20    pub kind: SourceKind,
21    pub url: String,
22    #[serde(default)]
23    pub tags: Vec<String>,
24}
25
26#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct SourcesFile {
28    #[serde(default)]
29    pub source: Vec<SourceConfig>,
30}
31
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct IntelItem {
34    pub source: String,
35    pub version: String,
36    pub date: String,
37    pub body: String,
38    pub url: String,
39    pub etag: Option<String>,
40    pub tags: Vec<String>,
41}
42
43#[derive(Debug, Clone, Serialize, Deserialize)]
44pub struct IntelDigest {
45    pub source: String,
46    pub title: String,
47    pub summary: String,
48    pub version: String,
49    pub date: String,
50    pub url: String,
51    pub tags: Vec<String>,
52}
53
54#[derive(Debug, Clone, Serialize, Deserialize)]
55pub struct BacklogTicket {
56    pub title: String,
57    pub source: String,
58    pub score: u32,
59    pub reason: String,
60    pub url: String,
61    pub tags: Vec<String>,
62}
63
64#[derive(Debug, Clone, Serialize, Deserialize)]
65pub struct ScanReport {
66    pub scanned: usize,
67    pub inserted_or_updated: usize,
68    pub items: Vec<IntelItem>,
69}
70
71pub fn load_sources_file(path: &Path) -> Result<Vec<SourceConfig>> {
72    let raw = std::fs::read_to_string(path)
73        .with_context(|| format!("could not read intel sources file {}", path.display()))?;
74    let parsed: SourcesFile = toml::from_str(&raw)
75        .with_context(|| format!("could not parse intel sources file {}", path.display()))?;
76    Ok(parsed.source)
77}
78
/// Handle to the on-disk SQLite intel cache.
///
/// Only the database path is stored, so the handle is cheap to clone and
/// holds no open file between calls.
#[derive(Debug, Clone)]
pub struct IntelCache {
    /// Location of the SQLite database file.
    path: std::path::PathBuf,
}
82
83impl IntelCache {
84    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
85        let path = path.as_ref().to_path_buf();
86        if let Some(parent) = path.parent() {
87            std::fs::create_dir_all(parent)?;
88        }
89        let cache = Self { path };
90        cache.init()?;
91        Ok(cache)
92    }
93
94    fn conn(&self) -> Result<Connection> {
95        Ok(Connection::open(&self.path)?)
96    }
97
98    fn init(&self) -> Result<()> {
99        let conn = self.conn()?;
100        conn.execute_batch(
101            "CREATE TABLE IF NOT EXISTS intel_items (
102                source TEXT NOT NULL,
103                version TEXT NOT NULL,
104                date TEXT NOT NULL,
105                body TEXT NOT NULL,
106                url TEXT NOT NULL,
107                etag TEXT,
108                tags TEXT NOT NULL DEFAULT '[]',
109                updated_at INTEGER NOT NULL,
110                UNIQUE(source, version, url)
111            );",
112        )?;
113        Ok(())
114    }
115
116    pub fn upsert_items(&self, items: &[IntelItem]) -> Result<usize> {
117        let mut conn = self.conn()?;
118        let tx = conn.transaction()?;
119        let mut changed = 0usize;
120        for item in items {
121            let tags = serde_json::to_string(&item.tags)?;
122            changed += tx.execute(
123                "INSERT INTO intel_items (source, version, date, body, url, etag, tags, updated_at)
124                 VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, strftime('%s','now'))
125                 ON CONFLICT(source, version, url) DO UPDATE SET
126                   date=excluded.date,
127                   body=excluded.body,
128                   etag=excluded.etag,
129                   tags=excluded.tags,
130                   updated_at=excluded.updated_at",
131                params![
132                    item.source,
133                    item.version,
134                    item.date,
135                    item.body,
136                    item.url,
137                    item.etag,
138                    tags
139                ],
140            )?;
141        }
142        tx.commit()?;
143        Ok(changed)
144    }
145
146    pub fn items(&self, limit: usize) -> Result<Vec<IntelItem>> {
147        let conn = self.conn()?;
148        let mut stmt = conn.prepare(
149            "SELECT source, version, date, body, url, etag, tags
150             FROM intel_items
151             ORDER BY date DESC, updated_at DESC
152             LIMIT ?1",
153        )?;
154        let rows = stmt.query_map([limit as i64], row_to_item)?;
155        Ok(rows.filter_map(|r| r.ok()).collect())
156    }
157
158    pub fn digests(&self, limit: usize) -> Result<Vec<IntelDigest>> {
159        Ok(self
160            .items(limit)?
161            .into_iter()
162            .map(|item| IntelDigest {
163                title: digest_title(&item),
164                summary: summarize(&item.body, 260),
165                version: item.version,
166                date: item.date,
167                url: item.url,
168                source: item.source,
169                tags: item.tags,
170            })
171            .collect())
172    }
173
174    pub fn backlog(&self, limit: usize) -> Result<Vec<BacklogTicket>> {
175        let mut tickets: Vec<_> = self
176            .digests(limit.saturating_mul(3).max(limit))?
177            .into_iter()
178            .map(|digest| {
179                let (score, reason) = score_digest(&digest);
180                BacklogTicket {
181                    title: format!("Évaluer {} {}", digest.source, digest.version),
182                    source: digest.source,
183                    score,
184                    reason,
185                    url: digest.url,
186                    tags: digest.tags,
187                }
188            })
189            .filter(|t| t.score > 0)
190            .collect();
191        tickets.sort_by(|a, b| b.score.cmp(&a.score).then_with(|| a.title.cmp(&b.title)));
192        tickets.truncate(limit);
193        Ok(tickets)
194    }
195
196    pub fn etag_for_url(&self, url: &str) -> Result<Option<String>> {
197        let conn = self.conn()?;
198        Ok(conn
199            .query_row(
200                "SELECT etag FROM intel_items WHERE url=?1 AND etag IS NOT NULL LIMIT 1",
201                [url],
202                |row| row.get(0),
203            )
204            .optional()?)
205    }
206}
207
208fn row_to_item(row: &rusqlite::Row<'_>) -> rusqlite::Result<IntelItem> {
209    let tags_raw: String = row.get(6)?;
210    let tags = serde_json::from_str(&tags_raw).unwrap_or_default();
211    Ok(IntelItem {
212        source: row.get(0)?,
213        version: row.get(1)?,
214        date: row.get(2)?,
215        body: row.get(3)?,
216        url: row.get(4)?,
217        etag: row.get(5)?,
218        tags,
219    })
220}
221
222pub async fn scan_sources(
223    sources: &[SourceConfig],
224    cache_path: impl AsRef<Path>,
225    limit_per_source: usize,
226) -> Result<ScanReport> {
227    let cache = IntelCache::open(cache_path)?;
228    let client = reqwest::Client::builder()
229        .timeout(Duration::from_secs(20))
230        .user_agent(format!("sparrow-intel/{}", env!("CARGO_PKG_VERSION")))
231        .build()?;
232    let mut all = Vec::new();
233    for source in sources {
234        let mut items = fetch_source(&client, source, limit_per_source).await?;
235        all.append(&mut items);
236    }
237    let changed = cache.upsert_items(&all)?;
238    Ok(ScanReport {
239        scanned: sources.len(),
240        inserted_or_updated: changed,
241        items: all,
242    })
243}
244
245async fn fetch_source(
246    client: &reqwest::Client,
247    source: &SourceConfig,
248    limit: usize,
249) -> Result<Vec<IntelItem>> {
250    match source.kind {
251        SourceKind::GithubReleases => fetch_github_releases(client, source, limit).await,
252        SourceKind::ChangelogUrl | SourceKind::DocsUrl => fetch_text_url(client, source).await,
253    }
254}
255
256async fn fetch_github_releases(
257    client: &reqwest::Client,
258    source: &SourceConfig,
259    limit: usize,
260) -> Result<Vec<IntelItem>> {
261    let api = github_releases_api(&source.url)?;
262    let resp = client.get(api).send().await?.error_for_status()?;
263    let releases: Vec<serde_json::Value> = resp.json().await?;
264    Ok(releases
265        .into_iter()
266        .take(limit)
267        .map(|release| {
268            let version = release
269                .get("tag_name")
270                .and_then(|v| v.as_str())
271                .unwrap_or("release")
272                .to_string();
273            let name = release
274                .get("name")
275                .and_then(|v| v.as_str())
276                .unwrap_or(&version);
277            let body = release
278                .get("body")
279                .and_then(|v| v.as_str())
280                .unwrap_or("")
281                .to_string();
282            let date = release
283                .get("published_at")
284                .and_then(|v| v.as_str())
285                .unwrap_or("")
286                .to_string();
287            let url = release
288                .get("html_url")
289                .and_then(|v| v.as_str())
290                .unwrap_or(&source.url)
291                .to_string();
292            IntelItem {
293                source: source.name.clone(),
294                version: version.clone(),
295                date,
296                body: format!("{name}\n\n{body}"),
297                url,
298                etag: None,
299                tags: source.tags.clone(),
300            }
301        })
302        .collect())
303}
304
305async fn fetch_text_url(client: &reqwest::Client, source: &SourceConfig) -> Result<Vec<IntelItem>> {
306    let resp = client.get(&source.url).send().await?.error_for_status()?;
307    let etag = resp
308        .headers()
309        .get(reqwest::header::ETAG)
310        .and_then(|h| h.to_str().ok())
311        .map(str::to_string);
312    let body = resp.text().await?;
313    let version = short_hash(&body);
314    let date = Utc::now().to_rfc3339();
315    Ok(vec![IntelItem {
316        source: source.name.clone(),
317        version,
318        date,
319        body,
320        url: source.url.clone(),
321        etag,
322        tags: source.tags.clone(),
323    }])
324}
325
326fn github_releases_api(raw: &str) -> Result<String> {
327    if raw.contains("api.github.com/repos/") {
328        return Ok(raw.to_string());
329    }
330    let parsed = url::Url::parse(raw)?;
331    let host = parsed.host_str().unwrap_or_default();
332    if host != "github.com" {
333        anyhow::bail!("github_releases source must point at github.com or api.github.com");
334    }
335    let parts: Vec<_> = parsed
336        .path_segments()
337        .map(|s| s.collect::<Vec<_>>())
338        .unwrap_or_default();
339    if parts.len() < 2 {
340        anyhow::bail!("github_releases source must include owner/repo");
341    }
342    Ok(format!(
343        "https://api.github.com/repos/{}/{}/releases",
344        parts[0], parts[1]
345    ))
346}
347
348fn short_hash(body: &str) -> String {
349    let mut hasher = Sha256::new();
350    hasher.update(body.as_bytes());
351    format!("{:.12x}", hasher.finalize())
352}
353
354fn digest_title(item: &IntelItem) -> String {
355    item.body
356        .lines()
357        .find(|l| !l.trim().is_empty())
358        .map(|l| summarize(l, 90))
359        .unwrap_or_else(|| item.version.clone())
360}
361
/// Collapses all runs of whitespace in `text` to single spaces and shortens
/// the result to at most `max` characters.
///
/// Text that already fits is returned as is. Longer text is cut to
/// `max - 3` characters and suffixed with `"..."`. Lengths are measured in
/// characters, not bytes, so non-ASCII text that fits is never truncated.
///
/// When `max` is smaller than 3 and the text does not fit, the result is just
/// `"..."`.
fn summarize(text: &str, max: usize) -> String {
    let compact = text.split_whitespace().collect::<Vec<_>>().join(" ");
    // Count characters here because the truncation below is also per
    // character; comparing byte length would cut multi-byte text that fits.
    if compact.chars().count() <= max {
        return compact;
    }
    let mut shortened: String = compact.chars().take(max.saturating_sub(3)).collect();
    shortened.push_str("...");
    shortened
}
376
377fn score_digest(digest: &IntelDigest) -> (u32, String) {
378    const SIGNALS: &[(&str, u32, &str)] = &[
379        ("agent", 20, "agentic workflow"),
380        ("tool", 14, "tooling/API"),
381        ("mcp", 18, "MCP compatibility"),
382        ("permission", 14, "permissions"),
383        ("sandbox", 14, "sandbox safety"),
384        ("approval", 12, "approvals"),
385        ("checkpoint", 12, "checkpoint/replay"),
386        ("replay", 12, "checkpoint/replay"),
387        ("webview", 10, "cockpit UI"),
388        ("performance", 10, "performance"),
389        ("routing", 10, "model routing"),
390        ("memory", 8, "memory/context"),
391        ("context", 8, "memory/context"),
392        ("release", 6, "release intelligence"),
393    ];
394    let hay = format!(
395        "{} {} {} {}",
396        digest.title,
397        digest.summary,
398        digest.source,
399        digest.tags.join(" ")
400    )
401    .to_lowercase();
402    let mut score = 0;
403    let mut reasons = Vec::new();
404    for (needle, weight, reason) in SIGNALS {
405        if hay.contains(needle) {
406            score += *weight;
407            if !reasons.contains(reason) {
408                reasons.push(*reason);
409            }
410        }
411    }
412    (score.min(100), reasons.join(", "))
413}
414
/// Returns the default cache location: `intel.sqlite` inside `state_dir`.
pub fn default_cache_path(state_dir: &Path) -> std::path::PathBuf {
    let mut cache_path = state_dir.to_path_buf();
    cache_path.push("intel.sqlite");
    cache_path
}
418
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a fixture item; only the body varies between tests.
    fn item(body: &str) -> IntelItem {
        IntelItem {
            source: String::from("test"),
            version: String::from("v1"),
            date: String::from("2026-06-12T00:00:00Z"),
            body: body.to_string(),
            url: String::from("https://example.test/release"),
            etag: None,
            tags: vec![String::from("agent")],
        }
    }

    /// An item written to a fresh cache comes back as one digest and one
    /// backlog ticket with a meaningful score.
    #[test]
    fn cache_round_trips_digests_and_backlog() {
        let tmp = tempfile::tempdir().unwrap();
        let db_path = tmp.path().join("intel.sqlite");
        let cache = IntelCache::open(db_path).unwrap();

        let fixture = item("Agent tool sandbox release with replay support");
        cache.upsert_items(&[fixture]).unwrap();

        let digests = cache.digests(10).unwrap();
        assert_eq!(digests.len(), 1);

        let backlog = cache.backlog(10).unwrap();
        assert_eq!(backlog.len(), 1);
        assert!(backlog[0].score >= 40);
    }

    /// A `[[source]]` table in TOML is parsed into a `SourceConfig`.
    #[test]
    fn parses_sources_file() {
        let tmp = tempfile::tempdir().unwrap();
        let sources_path = tmp.path().join("sources.toml");
        let contents = r#"
[[source]]
name = "Codex"
kind = "github_releases"
url = "https://github.com/openai/codex"
tags = ["agent", "cli"]
"#;
        std::fs::write(&sources_path, contents).unwrap();

        let sources = load_sources_file(&sources_path).unwrap();
        assert_eq!(sources.len(), 1);
        assert_eq!(sources[0].kind, SourceKind::GithubReleases);
    }

    /// A repository page URL is mapped to the matching releases endpoint.
    #[test]
    fn github_api_url_is_derived_from_repo_url() {
        let endpoint = github_releases_api("https://github.com/openai/codex").unwrap();
        assert_eq!(
            endpoint,
            "https://api.github.com/repos/openai/codex/releases"
        );
    }
}
474}