Skip to main content

smooth_operator/tools/
github_search.rs

1//! The `github_search` tool — live GitHub code/issue search.
2//!
3//! Where [`KnowledgeSearchTool`](crate::tools::KnowledgeSearchTool) searches the
4//! *indexed snapshot* a `GithubConnector` ingested, `github_search` hits the
5//! **live** GitHub search API so the agent can find code or issues that landed
6//! after the last ingest — fresh lookups beyond the indexed corpus.
7//!
8//! ## Shape
9//!
//! Arguments: `{ "query": string, "kind"?: "code" | "issues", "limit"?: integer }`.
//! The tool runs the query through a pluggable [`GithubSearchBackend`] (default:
//! [`OctocrabGithubSearch`], the real GitHub API) and renders the top results
//! (title, URL, snippet). `limit` defaults to 5 and is capped at 20.
14//!
15//! ## Scope + auth
16//!
17//! The tool is constructed with a [`GithubAuth`] and a default `owner/repo`
18//! scope; the scope is folded into the search query (`repo:owner/name`) so the
19//! agent's lookups stay within the team's repos by default.
20//!
21//! ## Test split (G9)
22//!
23//! The live network is behind the [`OctocrabGithubSearch`] backend, exercised
24//! only by an `#[ignore]` + env-gated (`SMOOTH_AGENT_E2E=1`) test. The tool's
25//! arg-parsing and result-formatting are unit-tested **offline** against a stub
26//! backend, exactly like the `web_search` tool.
27
28use std::sync::Arc;
29
30use async_trait::async_trait;
31use serde::{Deserialize, Serialize};
32
33use smooth_operator_core::tool::ToolSchema;
34use smooth_operator_core::Tool;
35
/// Result count used when the call's arguments carry no usable `limit`.
const DEFAULT_RESULTS: usize = 5;
/// Upper bound applied to any caller-supplied `limit`.
const MAX_RESULTS: usize = 20;
40
/// Credentials the `github_search` tool presents to the GitHub API.
///
/// The shape mirrors the ingestion connector's auth type but is declared here
/// separately so the tool crate does not depend on the ingestion crate:
/// - [`GithubAuth::Token`] — a personal-access token (self-host's simplest path),
/// - [`GithubAuth::AppInstallation`] — Smoo's first-party GitHub App, the way
///   `lom.smoo.ai` wires per-customer access,
/// - [`GithubAuth::Unauthenticated`] — public search at the anonymous rate limit.
///
/// `Debug` is implemented by hand so that secrets are never printed.
#[derive(Clone)]
pub enum GithubAuth {
    /// Authenticate with a personal-access token (PAT).
    Token(String),
    /// Authenticate as a GitHub App installation.
    AppInstallation {
        /// Numeric id of the GitHub App.
        app_id: u64,
        /// RSA private key of the App, PEM-encoded.
        private_key: String,
        /// Id of the installation (the org/user that installed the App).
        installation_id: u64,
    },
    /// Send no credentials (public search; anonymous rate limit).
    Unauthenticated,
}

impl std::fmt::Debug for GithubAuth {
    /// Formats the variant name and non-secret ids; the token and the private
    /// key are always rendered as `***`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The two variants with nothing printable collapse to fixed strings.
        let (app_id, installation_id) = match self {
            Self::Unauthenticated => return f.write_str("GithubAuth::Unauthenticated"),
            Self::Token(_) => return f.write_str("GithubAuth::Token(***)"),
            Self::AppInstallation {
                app_id,
                installation_id,
                private_key: _,
            } => (app_id, installation_id),
        };

        let mut rendered = f.debug_struct("GithubAuth::AppInstallation");
        rendered.field("app_id", app_id);
        rendered.field("installation_id", installation_id);
        // The key itself is never read here; only a placeholder is shown.
        rendered.field("private_key", &"***");
        rendered.finish()
    }
}
84
/// The GitHub search index a query is sent to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GithubSearchKind {
    /// Source-code search (`/search/code`).
    Code,
    /// Issue and pull-request search (`/search/issues`).
    Issues,
}

impl GithubSearchKind {
    /// Interpret the optional `kind` argument.
    ///
    /// `issue`, `issues`, `pr` and `prs` (ASCII case-insensitive) select
    /// [`GithubSearchKind::Issues`]; anything else, including a missing
    /// argument, falls back to [`GithubSearchKind::Code`].
    fn parse(raw: Option<&str>) -> Self {
        const ISSUE_ALIASES: [&str; 4] = ["issue", "issues", "pr", "prs"];

        match raw {
            Some(value)
                if ISSUE_ALIASES
                    .iter()
                    .any(|alias| value.eq_ignore_ascii_case(alias)) =>
            {
                Self::Issues
            }
            // Deliberately lenient: unknown or absent values mean code search.
            _ => Self::Code,
        }
    }

    /// The lowercase word used for this kind in rendered output.
    fn label(self) -> &'static str {
        match self {
            Self::Issues => "issues",
            Self::Code => "code",
        }
    }
}
110
111/// A single GitHub search hit.
112#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
113#[serde(rename_all = "camelCase")]
114pub struct GithubSearchResult {
115    /// The result title (file path for code, issue title for issues).
116    pub title: String,
117    /// The result URL on github.com.
118    pub url: String,
119    /// A short snippet / summary.
120    pub snippet: String,
121}
122
123impl GithubSearchResult {
124    /// Convenience constructor.
125    pub fn new(
126        title: impl Into<String>,
127        url: impl Into<String>,
128        snippet: impl Into<String>,
129    ) -> Self {
130        Self {
131            title: title.into(),
132            url: url.into(),
133            snippet: snippet.into(),
134        }
135    }
136}
137
138/// A pluggable GitHub-search backend.
139///
140/// The default [`OctocrabGithubSearch`] hits the real API. Tests inject a stub
141/// so the tool's arg-parsing + formatting can be exercised offline.
142#[async_trait]
143pub trait GithubSearchBackend: Send + Sync {
144    /// Run a search of `kind` for `query` (already scoped), up to `k` results.
145    ///
146    /// # Errors
147    /// Returns an error if the upstream GitHub call fails (e.g. a 403 rate
148    /// limit).
149    async fn search(
150        &self,
151        query: &str,
152        kind: GithubSearchKind,
153        k: usize,
154    ) -> anyhow::Result<Vec<GithubSearchResult>>;
155}
156
157/// Install the `ring` rustls `CryptoProvider` as the process default, once.
158///
159/// The workspace graph pulls in both `ring` and `aws-lc-rs`, so rustls 0.23
160/// cannot auto-pick a provider and panics on first TLS use. We pin `ring`.
161/// Idempotent — a second call (or a provider already installed elsewhere) is a
162/// no-op.
163fn ensure_crypto_provider() {
164    use std::sync::Once;
165    static INIT: Once = Once::new();
166    INIT.call_once(|| {
167        let _ = rustls::crypto::ring::default_provider().install_default();
168    });
169}
170
171/// The real backend: live GitHub search via `octocrab`.
172pub struct OctocrabGithubSearch {
173    auth: GithubAuth,
174}
175
176impl OctocrabGithubSearch {
177    /// Build the live backend over the given auth.
178    #[must_use]
179    pub fn new(auth: GithubAuth) -> Self {
180        Self { auth }
181    }
182
183    fn client(&self) -> anyhow::Result<octocrab::Octocrab> {
184        ensure_crypto_provider();
185        let mut builder = octocrab::Octocrab::builder();
186        builder = match &self.auth {
187            GithubAuth::Token(token) => builder.personal_token(token.clone()),
188            GithubAuth::AppInstallation {
189                app_id,
190                private_key,
191                ..
192            } => {
193                let key = jsonwebtoken::EncodingKey::from_rsa_pem(private_key.as_bytes())
194                    .map_err(|e| anyhow::anyhow!("GitHub App private key invalid: {e}"))?;
195                builder.app((*app_id).into(), key)
196            }
197            GithubAuth::Unauthenticated => builder,
198        };
199        let client = builder.build()?;
200        if let GithubAuth::AppInstallation {
201            installation_id, ..
202        } = &self.auth
203        {
204            return Ok(client.installation((*installation_id).into())?);
205        }
206        Ok(client)
207    }
208}
209
210#[async_trait]
211impl GithubSearchBackend for OctocrabGithubSearch {
212    async fn search(
213        &self,
214        query: &str,
215        kind: GithubSearchKind,
216        k: usize,
217    ) -> anyhow::Result<Vec<GithubSearchResult>> {
218        let client = self.client()?;
219        match kind {
220            GithubSearchKind::Code => {
221                let page = client
222                    .search()
223                    .code(query)
224                    .per_page(k as u8)
225                    .send()
226                    .await
227                    .map_err(|e| map_github_err(e, "code"))?;
228                Ok(page
229                    .items
230                    .into_iter()
231                    .map(|item| {
232                        GithubSearchResult::new(
233                            item.path.clone(),
234                            item.html_url.to_string(),
235                            format!(
236                                "{} in {}",
237                                item.name,
238                                item.repository.full_name.unwrap_or_default()
239                            ),
240                        )
241                    })
242                    .collect())
243            }
244            GithubSearchKind::Issues => {
245                let page = client
246                    .search()
247                    .issues_and_pull_requests(query)
248                    .per_page(k as u8)
249                    .send()
250                    .await
251                    .map_err(|e| map_github_err(e, "issues"))?;
252                Ok(page
253                    .items
254                    .into_iter()
255                    .map(|item| {
256                        let snippet = item.body.unwrap_or_default();
257                        let snippet: String = snippet.chars().take(200).collect();
258                        GithubSearchResult::new(item.title, item.html_url.to_string(), snippet)
259                    })
260                    .collect())
261            }
262        }
263    }
264}
265
266/// Map an octocrab error into a clearer message, surfacing rate limits.
267fn map_github_err(err: octocrab::Error, what: &str) -> anyhow::Error {
268    let msg = err.to_string();
269    if msg.contains("403") || msg.to_ascii_lowercase().contains("rate limit") {
270        anyhow::anyhow!("GitHub {what} search hit a rate limit (HTTP 403): {msg}")
271    } else {
272        anyhow::anyhow!("GitHub {what} search failed: {msg}")
273    }
274}
275
276/// A [`Tool`] that runs a live GitHub search through a [`GithubSearchBackend`],
277/// scoped to a default `owner/repo`.
278pub struct GithubSearchTool {
279    backend: Arc<dyn GithubSearchBackend>,
280    owner: String,
281    repo: String,
282}
283
284impl GithubSearchTool {
285    /// Build the tool over an auth and a default `owner/repo` scope, using the
286    /// live [`OctocrabGithubSearch`] backend.
287    #[must_use]
288    pub fn new(auth: GithubAuth, owner: impl Into<String>, repo: impl Into<String>) -> Self {
289        Self::with_backend(Arc::new(OctocrabGithubSearch::new(auth)), owner, repo)
290    }
291
292    /// Build the tool over an explicit backend (tests inject a stub).
293    #[must_use]
294    pub fn with_backend(
295        backend: Arc<dyn GithubSearchBackend>,
296        owner: impl Into<String>,
297        repo: impl Into<String>,
298    ) -> Self {
299        Self {
300            backend,
301            owner: owner.into(),
302            repo: repo.into(),
303        }
304    }
305
306    /// Fold the default `repo:owner/name` scope into the user's query (unless
307    /// they already pinned a `repo:`/`org:`/`user:` qualifier). Pure — unit
308    /// tested offline.
309    fn scoped_query(&self, query: &str) -> String {
310        let lower = query.to_ascii_lowercase();
311        if lower.contains("repo:") || lower.contains("org:") || lower.contains("user:") {
312            query.to_string()
313        } else {
314            format!("{query} repo:{}/{}", self.owner, self.repo)
315        }
316    }
317}
318
319#[async_trait]
320impl Tool for GithubSearchTool {
321    fn schema(&self) -> ToolSchema {
322        ToolSchema {
323            name: "github_search".to_string(),
324            description: format!(
325                "Search GitHub live for code or issues — fresh lookups beyond the indexed \
326                 knowledge snapshot (newly-merged code, recent issues/PRs). Defaults to scoping \
327                 results to the {}/{} repository; include a `repo:owner/name` qualifier in the \
328                 query to search elsewhere. Use knowledge_search for already-indexed content; use \
329                 this when you need the current state of the codebase or issue tracker. Returns \
330                 results with title, URL, and snippet.",
331                self.owner, self.repo
332            ),
333            parameters: serde_json::json!({
334                "type": "object",
335                "properties": {
336                    "query": {
337                        "type": "string",
338                        "description": "The GitHub search query (GitHub search qualifiers allowed)."
339                    },
340                    "kind": {
341                        "type": "string",
342                        "enum": ["code", "issues"],
343                        "description": "Search source code ('code') or issues + PRs ('issues'). Defaults to 'code'."
344                    },
345                    "limit": {
346                        "type": "integer",
347                        "description": "Maximum number of results (default 5, max 20).",
348                        "minimum": 1,
349                        "maximum": 20
350                    }
351                },
352                "required": ["query"]
353            }),
354        }
355    }
356
357    async fn execute(&self, arguments: serde_json::Value) -> anyhow::Result<String> {
358        let query = arguments
359            .get("query")
360            .and_then(serde_json::Value::as_str)
361            .ok_or_else(|| anyhow::anyhow!("github_search requires a string 'query' argument"))?;
362
363        let kind =
364            GithubSearchKind::parse(arguments.get("kind").and_then(serde_json::Value::as_str));
365
366        let k = arguments
367            .get("limit")
368            .and_then(serde_json::Value::as_u64)
369            .map_or(DEFAULT_RESULTS, |n| (n as usize).clamp(1, MAX_RESULTS));
370
371        let scoped = self.scoped_query(query);
372        let results = self.backend.search(&scoped, kind, k).await?;
373
374        if results.is_empty() {
375            return Ok(format!(
376                "No GitHub {} results found for {scoped:?}.",
377                kind.label()
378            ));
379        }
380
381        let mut out = format!(
382            "Found {} GitHub {} result(s) for {scoped:?}:\n",
383            results.len(),
384            kind.label()
385        );
386        for (i, r) in results.iter().enumerate() {
387            out.push_str(&format!(
388                "{}. {} — {}\n   {}\n",
389                i + 1,
390                r.title,
391                r.url,
392                r.snippet
393            ));
394        }
395        Ok(out)
396    }
397
398    fn is_read_only(&self) -> bool {
399        true
400    }
401}
402
#[cfg(test)]
mod tests {
    use super::*;

    /// What the stub backend was last asked for: (query, kind, k).
    type Call = (String, GithubSearchKind, usize);

    /// A stub backend recording the query/kind/limit it was called with and
    /// returning canned hits — proves the tool's parse → scope → format path
    /// offline.
    struct StubBackend {
        last: std::sync::Mutex<Option<Call>>,
    }

    impl StubBackend {
        fn new() -> Self {
            Self {
                last: std::sync::Mutex::new(None),
            }
        }

        /// The most recent call; panics if the backend was never invoked.
        fn last_call(&self) -> Call {
            self.last
                .lock()
                .unwrap()
                .clone()
                .expect("backend called")
        }
    }

    #[async_trait]
    impl GithubSearchBackend for StubBackend {
        async fn search(
            &self,
            query: &str,
            kind: GithubSearchKind,
            k: usize,
        ) -> anyhow::Result<Vec<GithubSearchResult>> {
            *self.last.lock().unwrap() = Some((query.to_string(), kind, k));
            // Never more than two canned hits, fewer when `k` is smaller.
            Ok((0..k.min(2))
                .map(|i| {
                    GithubSearchResult::new(
                        format!("result-{i}.rs"),
                        format!("https://github.com/acme/app/blob/main/result-{i}.rs"),
                        format!("snippet {i}"),
                    )
                })
                .collect())
        }
    }

    /// A tool scoped to `acme/app` plus a handle on its stub backend.
    fn tool() -> (GithubSearchTool, Arc<StubBackend>) {
        let backend = Arc::new(StubBackend::new());
        let tool = GithubSearchTool::with_backend(backend.clone(), "acme", "app");
        (tool, backend)
    }

    #[test]
    fn kind_parses_and_defaults_to_code() {
        assert_eq!(GithubSearchKind::parse(None), GithubSearchKind::Code);
        assert_eq!(
            GithubSearchKind::parse(Some("code")),
            GithubSearchKind::Code
        );
        assert_eq!(
            GithubSearchKind::parse(Some("issues")),
            GithubSearchKind::Issues
        );
        assert_eq!(
            GithubSearchKind::parse(Some("issue")),
            GithubSearchKind::Issues
        );
        assert_eq!(
            GithubSearchKind::parse(Some("PRs")),
            GithubSearchKind::Issues
        );
        // Unknown → code.
        assert_eq!(
            GithubSearchKind::parse(Some("nonsense")),
            GithubSearchKind::Code
        );
    }

    #[test]
    fn scoped_query_appends_repo_scope() {
        let (tool, _) = tool();
        assert_eq!(tool.scoped_query("foo bar"), "foo bar repo:acme/app");
    }

    #[test]
    fn scoped_query_respects_explicit_repo_qualifier() {
        let (tool, _) = tool();
        assert_eq!(
            tool.scoped_query("foo repo:other/thing"),
            "foo repo:other/thing"
        );
        assert_eq!(tool.scoped_query("bar org:acme"), "bar org:acme");
    }

    #[tokio::test]
    async fn execute_requires_query() {
        let (tool, _) = tool();
        let err = tool
            .execute(serde_json::json!({ "kind": "code" }))
            .await
            .expect_err("missing query should error");
        assert!(err.to_string().contains("query"));
    }

    #[tokio::test]
    async fn execute_scopes_query_and_formats_results() {
        let (tool, backend) = tool();
        let out = tool
            .execute(serde_json::json!({ "query": "fn main", "limit": 2 }))
            .await
            .expect("execute");

        // The backend saw the scoped query, code kind, k=2.
        let (q, kind, k) = backend.last_call();
        assert_eq!(q, "fn main repo:acme/app");
        assert_eq!(kind, GithubSearchKind::Code);
        assert_eq!(k, 2);

        // The output renders the hits.
        assert!(out.contains("Found 2 GitHub code result(s)"), "got: {out}");
        assert!(out.contains("result-0.rs"), "got: {out}");
        assert!(
            out.contains("https://github.com/acme/app/blob/main/result-1.rs"),
            "got: {out}"
        );
        assert!(tool.is_read_only());
    }

    #[tokio::test]
    async fn execute_routes_issues_kind() {
        let (tool, backend) = tool();
        let out = tool
            .execute(serde_json::json!({ "query": "login broken", "kind": "issues" }))
            .await
            .expect("execute");
        let (_, kind, _) = backend.last_call();
        assert_eq!(kind, GithubSearchKind::Issues);
        assert!(out.contains("GitHub issues result(s)"), "got: {out}");
    }

    #[tokio::test]
    async fn execute_clamps_limit_to_max() {
        let (tool, backend) = tool();
        tool.execute(serde_json::json!({ "query": "x", "limit": 9999 }))
            .await
            .expect("execute");
        let (_, _, k) = backend.last_call();
        assert_eq!(k, MAX_RESULTS);
    }

    #[tokio::test]
    async fn execute_uses_default_limit_when_absent() {
        let (tool, backend) = tool();
        tool.execute(serde_json::json!({ "query": "x" }))
            .await
            .expect("execute");
        let (_, _, k) = backend.last_call();
        assert_eq!(k, DEFAULT_RESULTS);
    }

    #[tokio::test]
    async fn execute_raises_zero_limit_to_one() {
        let (tool, backend) = tool();
        let out = tool
            .execute(serde_json::json!({ "query": "x", "limit": 0 }))
            .await
            .expect("execute");
        let (_, _, k) = backend.last_call();
        assert_eq!(k, 1);
        assert!(out.contains("Found 1 GitHub code result(s)"), "got: {out}");
    }

    #[tokio::test]
    async fn empty_results_render_a_clear_message() {
        struct Empty;
        #[async_trait]
        impl GithubSearchBackend for Empty {
            async fn search(
                &self,
                _q: &str,
                _kind: GithubSearchKind,
                _k: usize,
            ) -> anyhow::Result<Vec<GithubSearchResult>> {
                Ok(vec![])
            }
        }
        let tool = GithubSearchTool::with_backend(Arc::new(Empty), "acme", "app");
        let out = tool
            .execute(serde_json::json!({ "query": "zzz" }))
            .await
            .unwrap();
        assert!(out.contains("No GitHub code results found"), "got: {out}");
    }

    #[test]
    fn auth_debug_never_leaks_secrets() {
        let token = GithubAuth::Token("ghp_secretvalue".to_string());
        assert!(!format!("{token:?}").contains("secretvalue"));

        // The App private key must be redacted in both compact and pretty
        // output, while the non-secret ids stay visible.
        let app = GithubAuth::AppInstallation {
            app_id: 7,
            private_key: "PEM-SECRET-MATERIAL".to_string(),
            installation_id: 42,
        };
        let compact = format!("{app:?}");
        let pretty = format!("{app:#?}");
        assert!(!compact.contains("PEM-SECRET-MATERIAL"), "got: {compact}");
        assert!(!pretty.contains("PEM-SECRET-MATERIAL"), "got: {pretty}");
        assert!(compact.contains("app_id: 7"), "got: {compact}");
        assert!(compact.contains("installation_id: 42"), "got: {compact}");
    }

    /// Live GitHub search — only with `SMOOTH_AGENT_E2E=1` (network). Run:
    /// `SMOOTH_AGENT_E2E=1 cargo test -p smooai-smooth-operator \
    ///    github_search::tests::live_search -- --ignored --nocapture`
    #[tokio::test]
    #[ignore = "network: gated on SMOOTH_AGENT_E2E"]
    async fn live_search() {
        if std::env::var("SMOOTH_AGENT_E2E").as_deref() != Ok("1") {
            eprintln!("skipping live GitHub search: set SMOOTH_AGENT_E2E=1 to run");
            return;
        }
        // Use a token when one is available; otherwise search anonymously.
        let auth = std::env::var("GITHUB_TOKEN")
            .map(GithubAuth::Token)
            .unwrap_or(GithubAuth::Unauthenticated);
        let tool = GithubSearchTool::new(auth, "rust-lang", "rust");
        let out = tool
            .execute(serde_json::json!({ "query": "fn main", "kind": "code", "limit": 3 }))
            .await
            .expect("live search");
        eprintln!("{out}");
        assert!(out.contains("github.com"), "expected GitHub URLs: {out}");
    }
}