Skip to main content

doiget_cli/commands/
search.rs

//! `doiget search <query>` subcommand.
//!
//! Two scopes, one command (ADR-0031 D5):
//!
//! - **external** (default) — discovery search over OpenAlex
//!   `/works?search=` via [`doiget_core::discovery::paper_search`]. Turns
//!   a topic into ranked candidate papers (title / abstract / year /
//!   venue / citations / OA status / DOI) for triage *before* any PDF is
//!   fetched. Tier-1 OA metadata, always-on, ships in the default
//!   `oa-only` binary; no `DOIGET_ENABLE_OPENALEX` gate.
//! - **local** (`--local`) — the legacy substring scan over
//!   `<store-root>/.metadata/*.toml` via
//!   [`FsStore::search`](doiget_core::store::FsStore). Re-finds papers
//!   already in the store; offline.
//!
//! `--local` and `--external` are mutually exclusive; omitting both means
//! external. Both scopes share one `--mode json` envelope —
//! `{ "scope": "external" | "local", "query": "...", "count": N,
//! "results": [...] }` — with a scope-dependent `results[]` element
//! schema. The external scope additionally carries `"total_results"` (the
//! upstream OpenAlex match count, which may exceed `count`).
22
23use std::io::Write;
24
25use anyhow::{Context, Result};
26
27use doiget_core::discovery::{paper_search, PaperSearchQuery, PaperSearchResults, SearchSort};
28use doiget_core::store::{EntryInfo, FsStore, Store};
29use doiget_core::ErrorCode;
30
31use super::fetch::{cli_exit_code, CliExit, FetchHarness};
32use super::output::OutputMode;
33use super::resolve_store_root;
34
/// Phase 1 default cap on the number of returned **local** rows.
///
/// Passed as the limit argument to `FsStore::search` by `run_local`; the
/// local path takes no limit flag, so this value always applies there.
/// Picked to match the "small CLI table" feel — large enough to be useful
/// for an ad-hoc `doiget search foo --local`, small enough that a scan over
/// a pathological store still terminates promptly.
///
/// NOTE(review): whether the cap bounds the *scan work* (and not only the
/// number of rows returned) depends on `FsStore::search` — confirm there.
const LOCAL_DEFAULT_LIMIT: usize = 50;
40
/// Format string for [`chrono::DateTime`] columns. RFC3339-shaped, UTC, no
/// fractional seconds — identical to the [`list_recent`](super::list_recent)
/// table so downstream pipelines can treat both outputs uniformly.
///
/// Used by `run_local` to render the `fetched_at` column. The trailing `Z`
/// is a literal character in the pattern, not a timezone conversion, so the
/// timestamp is assumed to already be in UTC — TODO confirm against the type
/// of `EntryInfo::fetched_at`.
const FETCHED_AT_FMT: &str = "%Y-%m-%dT%H:%M:%SZ";
45
/// Production OpenAlex API base. Overridable via `DOIGET_OPENALEX_BASE`
/// (test wiremock origin), mirroring the `graph` subcommand.
///
/// `resolve_openalex_base` falls back to this value whenever the
/// environment variable cannot be read (unset, or not valid Unicode).
const OPENALEX_DEFAULT_BASE: &str = "https://api.openalex.org";
49
/// `--sort` choices for external discovery. Maps 1:1 onto
/// [`SearchSort`]; kept CLI-local so `doiget-core` carries no `clap` dep.
///
/// `SortArg::default()` is [`SortArg::Relevance`] (the `#[default]`
/// variant below).
#[derive(Clone, Debug, Default, PartialEq, Eq, clap::ValueEnum)]
pub enum SortArg {
    /// Best textual match first (OpenAlex `relevance_score:desc`).
    ///
    /// The only sort: `cited` / `recent` were removed (#290) — over
    /// OpenAlex's loose free-text match they float off-topic papers to the
    /// top. Use `--min-fwci` / `--min-percentile` / `--from-year` to
    /// surface "important / recent" results as FILTERS instead.
    #[default]
    Relevance,
}
63
64impl From<SortArg> for SearchSort {
65    fn from(s: SortArg) -> Self {
66        match s {
67            SortArg::Relevance => SearchSort::Relevance,
68        }
69    }
70}
71
/// External-discovery flag bundle (everything except `query` / `mode`).
///
/// Bundled so the `main.rs` dispatch arm and [`run`] stay readable. The
/// external path copies every field into a [`PaperSearchQuery`] (with `sort`
/// converted to [`SearchSort`]); the local path never receives the bundle.
#[derive(Debug, Clone)]
pub struct ExternalArgs {
    /// Max results; validated to `1..=200` (OpenAlex `per-page` ceiling) by
    /// `PaperSearchQuery::validate` — an out-of-range value is rejected,
    /// not silently clamped.
    pub limit: usize,
    /// Inclusive lower publication-year bound.
    pub from_year: Option<i32>,
    /// Inclusive upper publication-year bound.
    pub to_year: Option<i32>,
    /// Restrict to open-access works.
    pub oa_only: bool,
    /// Only works cited strictly more than this many times.
    pub min_citations: Option<u64>,
    /// Minimum field-and-year-normalized impact (FWCI) floor (#290).
    pub min_fwci: Option<f64>,
    /// Minimum within-cohort citation percentile, 0–100 (#290).
    pub min_percentile: Option<u8>,
    /// Author name to filter by (resolved to an OpenAlex author ID).
    pub author: Option<String>,
    /// Venue / journal name to filter by (resolved to an OpenAlex source ID).
    pub venue: Option<String>,
    /// Publisher name to filter by (resolved to an OpenAlex publisher ID).
    pub publisher: Option<String>,
    /// Result ordering.
    pub sort: SortArg,
}
101
/// Stderr sink for `docs/ERRORS.md` §3 human-error lines (mirrors the
/// `print_err` helper in `commands::fetch` / `commands::graph`).
///
/// Writes the pre-formatted `args` to stderr followed by a newline.
// The `print_stderr` lint is silenced on this function only: it is the
// deliberate stderr sink that error-reporting call sites go through.
#[allow(clippy::print_stderr)]
fn print_err(args: std::fmt::Arguments<'_>) {
    eprintln!("{args}");
}
108
109/// Run the `search` subcommand.
110///
111/// `local` selects the store scan; otherwise external discovery runs
112/// (the default; `--external` is its explicit form and is already
113/// resolved away by clap's `conflicts_with`). `ext` carries the
114/// external-only flags and is ignored on the local path.
115///
116/// # Errors
117///
118/// Propagates store-open / scan failures (local) or surfaces a typed
119/// [`ErrorCode`] as a process exit code (external); an empty query is a
120/// usage error.
121pub async fn run(
122    query: String,
123    local: bool,
124    ext: ExternalArgs,
125    mode: OutputMode,
126    quiet_was_explicit: bool,
127) -> Result<()> {
128    if query.trim().is_empty() {
129        anyhow::bail!("search query is empty");
130    }
131    if local {
132        run_local(&query, mode, quiet_was_explicit)
133    } else {
134        run_external(&query, ext, mode, quiet_was_explicit).await
135    }
136}
137
138/// Local-store substring scan (legacy behaviour, now behind `--local`).
139fn run_local(query: &str, mode: OutputMode, quiet_was_explicit: bool) -> Result<()> {
140    let store_root = resolve_store_root()?;
141    let store = FsStore::new(store_root)?;
142    let entries = store
143        .search(query, LOCAL_DEFAULT_LIMIT)
144        .with_context(|| format!("search failed for query {query:?}"))?;
145
146    // Artifact-class (ADR-0017 Amendment 2 / #301): suppress only on
147    // explicit Quiet; the non-TTY implicit fallback still emits.
148    if mode == OutputMode::Quiet && quiet_was_explicit {
149        return Ok(());
150    }
151
152    let stdout = std::io::stdout();
153    let mut out = stdout.lock();
154    if mode == OutputMode::Json {
155        write_json(&mut out, &local_envelope(query, &entries))?;
156        return Ok(());
157    }
158    writeln!(out, "safekey\tyear\ttitle\tfetched_at")
159        .context("failed to write search header to stdout")?;
160    for e in entries {
161        let year = dash_or(e.year);
162        let fetched = e
163            .fetched_at
164            .map(|t| t.format(FETCHED_AT_FMT).to_string())
165            .unwrap_or_else(|| "-".into());
166        writeln!(
167            out,
168            "{}\t{}\t{}\t{}",
169            e.safekey.as_str(),
170            year,
171            e.title,
172            fetched
173        )
174        .context("failed to write search row to stdout")?;
175    }
176    Ok(())
177}
178
179/// External OpenAlex discovery search (the default scope).
180async fn run_external(
181    query: &str,
182    ext: ExternalArgs,
183    mode: OutputMode,
184    quiet_was_explicit: bool,
185) -> Result<()> {
186    let q = PaperSearchQuery {
187        query: query.to_string(),
188        limit: ext.limit,
189        from_year: ext.from_year,
190        to_year: ext.to_year,
191        oa_only: ext.oa_only,
192        min_citations: ext.min_citations,
193        min_fwci: ext.min_fwci,
194        min_percentile: ext.min_percentile,
195        author: ext.author,
196        venue: ext.venue,
197        publisher: ext.publisher,
198        sort: ext.sort.into(),
199    };
200    // Boundary validation (limit range, inverted year range) lives in
201    // `PaperSearchQuery::validate` so the CLI and the MCP tool cannot drift.
202    q.validate().map_err(|m| anyhow::anyhow!("{m}"))?;
203
204    let base = resolve_openalex_base()?;
205    // Leave `mailto` unset when no contact email is configured: send a real
206    // address (polite pool) or none, never a non-routable placeholder. The
207    // empty string is skipped by `build_search_url` / `resolve_entity_id`.
208    let contact_email = std::env::var("DOIGET_CONTACT_EMAIL").unwrap_or_default();
209
210    let harness = FetchHarness::from_env().context("building fetch harness")?;
211    harness
212        .log_session_start(Some(query))
213        .context("logging session start")?;
214    let ctx = harness.fetch_context();
215
216    let outcome = paper_search(&base, &contact_email, &q, &ctx).await;
217    harness.log_session_end(outcome.is_ok(), Some(query));
218
219    let results = match outcome {
220        Ok(r) => r,
221        Err(e) => {
222            let code = ErrorCode::from(&e);
223            print_err(format_args!("error[{}]: {e}", code.as_wire()));
224            return Err(anyhow::Error::new(CliExit(cli_exit_code(code))));
225        }
226    };
227
228    // Artifact-class (ADR-0017 Amendment 2 / #301): suppress only on
229    // explicit Quiet; the non-TTY implicit fallback still emits.
230    if mode == OutputMode::Quiet && quiet_was_explicit {
231        return Ok(());
232    }
233
234    let stdout = std::io::stdout();
235    let mut out = stdout.lock();
236    if mode == OutputMode::Json {
237        write_json(&mut out, &external_envelope(query, &results))?;
238        return Ok(());
239    }
240
241    // Human table: surface "interesting" signals (citations) first, then
242    // year / OA / DOI / title. Tab-separated, `cut(1)`-compatible.
243    writeln!(out, "cited_by\tyear\toa\tdoi\ttitle")
244        .context("failed to write search header to stdout")?;
245    for hit in &results.results {
246        let year = dash_or(hit.year);
247        let oa = hit.oa_status.as_deref().unwrap_or("-");
248        let doi = hit.doi.as_deref().unwrap_or("-");
249        writeln!(
250            out,
251            "{}\t{}\t{}\t{}\t{}",
252            hit.cited_by_count, year, oa, doi, hit.title
253        )
254        .context("failed to write search row to stdout")?;
255    }
256    Ok(())
257}
258
259/// Resolve the OpenAlex base URL: `DOIGET_OPENALEX_BASE` override (tests)
260/// or the production default.
261fn resolve_openalex_base() -> Result<url::Url> {
262    let raw =
263        std::env::var("DOIGET_OPENALEX_BASE").unwrap_or_else(|_| OPENALEX_DEFAULT_BASE.to_string());
264    url::Url::parse(&raw).with_context(|| format!("DOIGET_OPENALEX_BASE is not a URL: {raw}"))
265}
266
267/// Build the local-scan `--mode json` envelope (ADR-0031 D5):
268/// `{ scope: "local", query, count, results }`. The `results[]` element is
269/// the legacy `EntryInfo` shape, unchanged.
270fn local_envelope(query: &str, entries: &[EntryInfo]) -> serde_json::Value {
271    serde_json::json!({
272        "scope": "local",
273        "query": query,
274        "count": entries.len(),
275        "results": entries,
276    })
277}
278
279/// Build the external-discovery `--mode json` envelope (ADR-0031 D5):
280/// `{ scope, query, total_results, count, results }`. Extracted as a pure
281/// function so the wire shape is unit-testable without capturing stdout.
282fn external_envelope(query: &str, results: &PaperSearchResults) -> serde_json::Value {
283    serde_json::json!({
284        "scope": "external",
285        "query": query,
286        "total_results": results.total_results,
287        "count": results.results.len(),
288        "results": results.results,
289    })
290}
291
292/// Pretty-serialize a JSON value and write it as one line to `out`. Shared
293/// by the local and external `--mode json` paths.
294fn write_json(out: &mut impl Write, value: &serde_json::Value) -> Result<()> {
295    let s = serde_json::to_string_pretty(value).context("failed to serialize search JSON")?;
296    writeln!(out, "{s}").context("failed to write search JSON to stdout")
297}
298
/// Human-table cell for an optional value: the value's `Display` output
/// when present, a lone `-` when absent.
fn dash_or<T: std::fmt::Display>(v: Option<T>) -> String {
    match v {
        Some(present) => present.to_string(),
        None => String::from("-"),
    }
}
303
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use doiget_core::discovery::{DiscoverySource, PaperHit};

    /// A fully-populated hit used to pin the external wire shape.
    fn hit() -> PaperHit {
        PaperHit {
            doi: Some("10.1234/x".to_string()),
            openalex_id: "W1".to_string(),
            arxiv: None,
            title: "T".to_string(),
            authors: vec!["A".to_string()],
            year: Some(2024),
            venue: Some("V".to_string()),
            abstract_: Some("abs".to_string()),
            cited_by_count: 3,
            oa_status: Some("gold".to_string()),
            source: DiscoverySource::OpenAlex,
        }
    }

    #[test]
    fn external_envelope_has_scope_total_and_results() {
        let results = PaperSearchResults {
            results: vec![hit()],
            total_results: Some(4012),
        };
        let v = external_envelope("spin glass", &results);
        assert_eq!(v["scope"], "external");
        assert_eq!(v["query"], "spin glass");
        assert_eq!(v["total_results"], 4012);
        assert_eq!(v["count"], 1);
        assert_eq!(v["results"][0]["openalex_id"], "W1");
        assert_eq!(v["results"][0]["abstract"], "abs");
    }

    #[test]
    fn sort_arg_lowers_to_core() {
        // Relevance is the only sort (#290); `cited` / `recent` were removed.
        assert_eq!(SearchSort::from(SortArg::Relevance), SearchSort::Relevance);
    }

    #[test]
    fn local_envelope_has_local_scope_and_count() {
        let v = local_envelope("quantum", &[]);
        assert_eq!(v["scope"], "local");
        assert_eq!(v["query"], "quantum");
        assert_eq!(v["count"], 0);
        assert!(v["results"].as_array().expect("results array").is_empty());
        // The local envelope must NOT carry the external-only field.
        assert!(v.get("total_results").is_none());
    }

    #[test]
    fn dash_or_renders_value_or_dash() {
        assert_eq!(dash_or(Some(2024)), "2024");
        assert_eq!(dash_or(Some("gold")), "gold");
        assert_eq!(dash_or::<i32>(None), "-");
    }

    #[test]
    fn write_json_emits_parseable_newline_terminated_document() {
        let value = local_envelope("quantum", &[]);
        let mut buf = Vec::new();
        write_json(&mut buf, &value).expect("writing to a Vec cannot fail");
        let text = String::from_utf8(buf).expect("JSON output is UTF-8");
        assert!(text.ends_with('\n'), "got: {text:?}");
        // Round-trips: whatever the layout, the document must parse back to
        // the value that was written.
        let parsed: serde_json::Value =
            serde_json::from_str(&text).expect("output parses as JSON");
        assert_eq!(parsed, value);
    }

    /// `ExternalArgs` with defaults; override per test.
    fn ext(limit: usize, from_year: Option<i32>, to_year: Option<i32>) -> ExternalArgs {
        ExternalArgs {
            limit,
            from_year,
            to_year,
            oa_only: false,
            min_citations: None,
            min_fwci: None,
            min_percentile: None,
            author: None,
            venue: None,
            publisher: None,
            sort: SortArg::Relevance,
        }
    }

    // These validations fire BEFORE any network / harness construction, so
    // the calls error without touching the filesystem or OpenAlex.

    #[tokio::test]
    async fn rejects_blank_query_in_both_scopes() {
        // The guard runs ahead of the scope dispatch, so neither the store
        // nor the network is reached for either value of `local`.
        for local in [false, true] {
            let err = run(
                "   ".into(),
                local,
                ext(25, None, None),
                OutputMode::Quiet,
                true,
            )
            .await
            .expect_err("blank query must be rejected");
            assert!(err.to_string().contains("query is empty"), "got: {err}");
        }
    }

    #[tokio::test]
    async fn external_rejects_limit_below_1() {
        let err = run(
            "q".into(),
            false,
            ext(0, None, None),
            OutputMode::Quiet,
            true,
        )
        .await
        .expect_err("limit 0 must be rejected");
        assert!(err.to_string().contains("limit"), "got: {err}");
    }

    #[tokio::test]
    async fn external_rejects_limit_above_200() {
        let err = run(
            "q".into(),
            false,
            ext(201, None, None),
            OutputMode::Quiet,
            true,
        )
        .await
        .expect_err("limit 201 must be rejected");
        assert!(err.to_string().contains("limit"), "got: {err}");
    }

    #[tokio::test]
    async fn external_rejects_inverted_year_range() {
        let err = run(
            "q".into(),
            false,
            ext(25, Some(2025), Some(2010)),
            OutputMode::Quiet,
            true,
        )
        .await
        .expect_err("from_year > to_year must be rejected");
        assert!(err.to_string().contains("is after"), "got: {err}");
    }
}
419}