Skip to main content

doiget_cli/commands/
search.rs

1//! `doiget search <query>` subcommand.
2//!
3//! Two scopes, one command (ADR-0031 D5):
4//!
5//! - **external** (default) — discovery search over OpenAlex
6//!   `/works?search=` via [`doiget_core::discovery::paper_search`]. Turns
7//!   a topic into ranked candidate papers (title / abstract / year /
8//!   venue / citations / OA status / DOI) for triage *before* any PDF is
9//!   fetched. Tier-1 OA metadata, always-on, ships in the default
10//!   `oa-only` binary; no `DOIGET_ENABLE_OPENALEX` gate.
11//! - **local** (`--local`) — the legacy substring scan over
12//!   `<store-root>/.metadata/*.toml` via
13//!   [`FsStore::search`](doiget_core::store::FsStore). Re-finds papers
14//!   already in the store; offline.
15//!
16//! `--local` and `--external` are mutually exclusive; omitting both means
17//! external. Both scopes share one `--mode json` envelope —
18//! `{ "scope": "external" | "local", "query": "...", "count": N,
19//! "results": [...] }` — with a scope-dependent `results[]` element
20//! schema. The external scope additionally carries `"total_results"` (the
21//! upstream OpenAlex match count, which may exceed `count`).
22
23use std::io::Write;
24
25use anyhow::{Context, Result};
26
27use doiget_core::discovery::{paper_search, PaperSearchQuery, PaperSearchResults, SearchSort};
28use doiget_core::store::{EntryInfo, FsStore, Store};
29use doiget_core::ErrorCode;
30
31use super::fetch::{cli_exit_code, CliExit, FetchHarness};
32use super::output::OutputMode;
33use super::resolve_store_root;
34
/// Row cap applied to **local** scans in Phase 1. Sized for a "small CLI
/// table": roomy enough for an ad-hoc `doiget search foo --local`, tight
/// enough that scanning a pathological store still finishes promptly.
const LOCAL_DEFAULT_LIMIT: usize = 50;

/// `strftime`-style pattern for [`chrono::DateTime`] cells: RFC3339-shaped,
/// UTC (rendered as a literal `Z` suffix), no fractional seconds. Kept
/// identical to the [`list_recent`](super::list_recent) table so downstream
/// pipelines can parse both outputs the same way.
const FETCHED_AT_FMT: &str = "%Y-%m-%dT%H:%M:%SZ";

/// OpenAlex production API origin. `DOIGET_OPENALEX_BASE` overrides it
/// (used to point tests at a wiremock origin), mirroring the `graph`
/// subcommand.
const OPENALEX_DEFAULT_BASE: &str = "https://api.openalex.org";
49
50/// `--sort` choices for external discovery. Maps 1:1 onto
51/// [`SearchSort`]; kept CLI-local so `doiget-core` carries no `clap` dep.
52#[derive(Clone, Debug, PartialEq, Eq, clap::ValueEnum)]
53pub enum SortArg {
54    /// Best textual match first (OpenAlex `relevance_score:desc`).
55    Relevance,
56    /// Most-cited first (`cited_by_count:desc`).
57    Cited,
58    /// Newest first (`publication_date:desc`).
59    Recent,
60}
61
62impl From<SortArg> for SearchSort {
63    fn from(s: SortArg) -> Self {
64        match s {
65            SortArg::Relevance => SearchSort::Relevance,
66            SortArg::Cited => SearchSort::Cited,
67            SortArg::Recent => SearchSort::Recent,
68        }
69    }
70}
71
72/// External-discovery flag bundle (everything except `query` / `mode`).
73/// Bundled so the `main.rs` dispatch arm and [`run`] stay readable.
74#[derive(Debug, Clone)]
75pub struct ExternalArgs {
76    /// Max results; validated to `1..=200` (OpenAlex `per-page` ceiling) by
77    /// `PaperSearchQuery::validate` — an out-of-range value is rejected,
78    /// not silently clamped.
79    pub limit: usize,
80    /// Inclusive lower publication-year bound.
81    pub from_year: Option<i32>,
82    /// Inclusive upper publication-year bound.
83    pub to_year: Option<i32>,
84    /// Restrict to open-access works.
85    pub oa_only: bool,
86    /// Only works cited strictly more than this many times.
87    pub min_citations: Option<u64>,
88    /// Author name to filter by (resolved to an OpenAlex author ID).
89    pub author: Option<String>,
90    /// Venue / journal name to filter by (resolved to an OpenAlex source ID).
91    pub venue: Option<String>,
92    /// Publisher name to filter by (resolved to an OpenAlex publisher ID).
93    pub publisher: Option<String>,
94    /// Result ordering.
95    pub sort: SortArg,
96}
97
/// Emit one human-readable error line (`docs/ERRORS.md` §3) on stderr.
/// Same helper shape as `print_err` in `commands::fetch` /
/// `commands::graph`.
// Writing to stderr is the whole point of this helper, hence the allow.
#[allow(clippy::print_stderr)]
fn print_err(args: std::fmt::Arguments<'_>) {
    eprintln!("{args}");
}
104
105/// Run the `search` subcommand.
106///
107/// `local` selects the store scan; otherwise external discovery runs
108/// (the default; `--external` is its explicit form and is already
109/// resolved away by clap's `conflicts_with`). `ext` carries the
110/// external-only flags and is ignored on the local path.
111///
112/// # Errors
113///
114/// Propagates store-open / scan failures (local) or surfaces a typed
115/// [`ErrorCode`] as a process exit code (external); an empty query is a
116/// usage error.
117pub async fn run(query: String, local: bool, ext: ExternalArgs, mode: OutputMode) -> Result<()> {
118    if query.trim().is_empty() {
119        anyhow::bail!("search query is empty");
120    }
121    if local {
122        run_local(&query, mode)
123    } else {
124        run_external(&query, ext, mode).await
125    }
126}
127
128/// Local-store substring scan (legacy behaviour, now behind `--local`).
129fn run_local(query: &str, mode: OutputMode) -> Result<()> {
130    let store_root = resolve_store_root()?;
131    let store = FsStore::new(store_root)?;
132    let entries = store
133        .search(query, LOCAL_DEFAULT_LIMIT)
134        .with_context(|| format!("search failed for query {query:?}"))?;
135
136    if mode == OutputMode::Quiet {
137        return Ok(());
138    }
139
140    let stdout = std::io::stdout();
141    let mut out = stdout.lock();
142    if mode == OutputMode::Json {
143        write_json(&mut out, &local_envelope(query, &entries))?;
144        return Ok(());
145    }
146    writeln!(out, "safekey\tyear\ttitle\tfetched_at")
147        .context("failed to write search header to stdout")?;
148    for e in entries {
149        let year = dash_or(e.year);
150        let fetched = e
151            .fetched_at
152            .map(|t| t.format(FETCHED_AT_FMT).to_string())
153            .unwrap_or_else(|| "-".into());
154        writeln!(
155            out,
156            "{}\t{}\t{}\t{}",
157            e.safekey.as_str(),
158            year,
159            e.title,
160            fetched
161        )
162        .context("failed to write search row to stdout")?;
163    }
164    Ok(())
165}
166
167/// External OpenAlex discovery search (the default scope).
168async fn run_external(query: &str, ext: ExternalArgs, mode: OutputMode) -> Result<()> {
169    let q = PaperSearchQuery {
170        query: query.to_string(),
171        limit: ext.limit,
172        from_year: ext.from_year,
173        to_year: ext.to_year,
174        oa_only: ext.oa_only,
175        min_citations: ext.min_citations,
176        author: ext.author,
177        venue: ext.venue,
178        publisher: ext.publisher,
179        sort: ext.sort.into(),
180    };
181    // Boundary validation (limit range, inverted year range) lives in
182    // `PaperSearchQuery::validate` so the CLI and the MCP tool cannot drift.
183    q.validate().map_err(|m| anyhow::anyhow!("{m}"))?;
184
185    let base = resolve_openalex_base()?;
186    // Leave `mailto` unset when no contact email is configured: send a real
187    // address (polite pool) or none, never a non-routable placeholder. The
188    // empty string is skipped by `build_search_url` / `resolve_entity_id`.
189    let contact_email = std::env::var("DOIGET_CONTACT_EMAIL").unwrap_or_default();
190
191    let harness = FetchHarness::from_env().context("building fetch harness")?;
192    harness
193        .log_session_start(Some(query))
194        .context("logging session start")?;
195    let ctx = harness.fetch_context();
196
197    let outcome = paper_search(&base, &contact_email, &q, &ctx).await;
198    harness.log_session_end(outcome.is_ok(), Some(query));
199
200    let results = match outcome {
201        Ok(r) => r,
202        Err(e) => {
203            let code = ErrorCode::from(&e);
204            print_err(format_args!("error[{}]: {e}", code.as_wire()));
205            return Err(anyhow::Error::new(CliExit(cli_exit_code(code))));
206        }
207    };
208
209    if mode == OutputMode::Quiet {
210        return Ok(());
211    }
212
213    let stdout = std::io::stdout();
214    let mut out = stdout.lock();
215    if mode == OutputMode::Json {
216        write_json(&mut out, &external_envelope(query, &results))?;
217        return Ok(());
218    }
219
220    // Human table: surface "interesting" signals (citations) first, then
221    // year / OA / DOI / title. Tab-separated, `cut(1)`-compatible.
222    writeln!(out, "cited_by\tyear\toa\tdoi\ttitle")
223        .context("failed to write search header to stdout")?;
224    for hit in &results.results {
225        let year = dash_or(hit.year);
226        let oa = hit.oa_status.as_deref().unwrap_or("-");
227        let doi = hit.doi.as_deref().unwrap_or("-");
228        writeln!(
229            out,
230            "{}\t{}\t{}\t{}\t{}",
231            hit.cited_by_count, year, oa, doi, hit.title
232        )
233        .context("failed to write search row to stdout")?;
234    }
235    Ok(())
236}
237
238/// Resolve the OpenAlex base URL: `DOIGET_OPENALEX_BASE` override (tests)
239/// or the production default.
240fn resolve_openalex_base() -> Result<url::Url> {
241    let raw =
242        std::env::var("DOIGET_OPENALEX_BASE").unwrap_or_else(|_| OPENALEX_DEFAULT_BASE.to_string());
243    url::Url::parse(&raw).with_context(|| format!("DOIGET_OPENALEX_BASE is not a URL: {raw}"))
244}
245
246/// Build the local-scan `--mode json` envelope (ADR-0031 D5):
247/// `{ scope: "local", query, count, results }`. The `results[]` element is
248/// the legacy `EntryInfo` shape, unchanged.
249fn local_envelope(query: &str, entries: &[EntryInfo]) -> serde_json::Value {
250    serde_json::json!({
251        "scope": "local",
252        "query": query,
253        "count": entries.len(),
254        "results": entries,
255    })
256}
257
258/// Build the external-discovery `--mode json` envelope (ADR-0031 D5):
259/// `{ scope, query, total_results, count, results }`. Extracted as a pure
260/// function so the wire shape is unit-testable without capturing stdout.
261fn external_envelope(query: &str, results: &PaperSearchResults) -> serde_json::Value {
262    serde_json::json!({
263        "scope": "external",
264        "query": query,
265        "total_results": results.total_results,
266        "count": results.results.len(),
267        "results": results.results,
268    })
269}
270
271/// Pretty-serialize a JSON value and write it as one line to `out`. Shared
272/// by the local and external `--mode json` paths.
273fn write_json(out: &mut impl Write, value: &serde_json::Value) -> Result<()> {
274    let s = serde_json::to_string_pretty(value).context("failed to serialize search JSON")?;
275    writeln!(out, "{s}").context("failed to write search JSON to stdout")
276}
277
/// Human-table cell for an optional value: the value's `Display` text when
/// present, a lone `-` when absent.
fn dash_or<T: std::fmt::Display>(v: Option<T>) -> String {
    v.map_or_else(|| String::from("-"), |present| present.to_string())
}
282
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use doiget_core::discovery::{DiscoverySource, PaperHit};

    /// One fully-populated hit, shared by the envelope tests.
    fn hit() -> PaperHit {
        PaperHit {
            doi: Some("10.1234/x".to_string()),
            openalex_id: "W1".to_string(),
            arxiv: None,
            title: "T".to_string(),
            authors: vec!["A".to_string()],
            year: Some(2024),
            venue: Some("V".to_string()),
            abstract_: Some("abs".to_string()),
            cited_by_count: 3,
            oa_status: Some("gold".to_string()),
            source: DiscoverySource::OpenAlex,
        }
    }

    /// `ExternalArgs` with the given limit / year bounds and every other
    /// filter left at its neutral value.
    fn ext(limit: usize, from_year: Option<i32>, to_year: Option<i32>) -> ExternalArgs {
        ExternalArgs {
            limit,
            from_year,
            to_year,
            oa_only: false,
            min_citations: None,
            author: None,
            venue: None,
            publisher: None,
            sort: SortArg::Relevance,
        }
    }

    /// Run a quiet external search that is expected to fail and hand back
    /// the rendered error message. `why` is the panic text used if the
    /// call unexpectedly succeeds.
    async fn rejection_message(args: ExternalArgs, why: &str) -> String {
        run("q".into(), false, args, OutputMode::Quiet)
            .await
            .expect_err(why)
            .to_string()
    }

    #[test]
    fn sort_arg_lowers_to_core() {
        assert_eq!(SearchSort::from(SortArg::Relevance), SearchSort::Relevance);
        assert_eq!(SearchSort::from(SortArg::Cited), SearchSort::Cited);
        assert_eq!(SearchSort::from(SortArg::Recent), SearchSort::Recent);
    }

    #[test]
    fn external_envelope_has_scope_total_and_results() {
        let found = PaperSearchResults {
            results: vec![hit()],
            total_results: Some(4012),
        };
        let envelope = external_envelope("spin glass", &found);
        assert_eq!(envelope["scope"], "external");
        assert_eq!(envelope["query"], "spin glass");
        assert_eq!(envelope["total_results"], 4012);
        assert_eq!(envelope["count"], 1);
        assert_eq!(envelope["results"][0]["openalex_id"], "W1");
        assert_eq!(envelope["results"][0]["abstract"], "abs");
    }

    #[test]
    fn local_envelope_has_local_scope_and_count() {
        let envelope = local_envelope("quantum", &[]);
        assert_eq!(envelope["scope"], "local");
        assert_eq!(envelope["query"], "quantum");
        assert_eq!(envelope["count"], 0);
        assert!(envelope["results"]
            .as_array()
            .expect("results array")
            .is_empty());
        // The local envelope must NOT carry the external-only field.
        assert!(envelope.get("total_results").is_none());
    }

    // The validations below fire BEFORE any network / harness construction,
    // so these calls error without touching the filesystem or OpenAlex.

    #[tokio::test]
    async fn external_rejects_limit_below_1() {
        let msg = rejection_message(ext(0, None, None), "limit 0 must be rejected").await;
        assert!(msg.contains("limit"), "got: {msg}");
    }

    #[tokio::test]
    async fn external_rejects_limit_above_200() {
        let msg = rejection_message(ext(201, None, None), "limit 201 must be rejected").await;
        assert!(msg.contains("limit"), "got: {msg}");
    }

    #[tokio::test]
    async fn external_rejects_inverted_year_range() {
        let msg = rejection_message(
            ext(25, Some(2025), Some(2010)),
            "from_year > to_year must be rejected",
        )
        .await;
        assert!(msg.contains("is after"), "got: {msg}");
    }
}