doiget-core 0.7.0

//! External literature **discovery search** over OpenAlex `/works` (the
//! topic is sent as a `title_and_abstract.search` filter, not `?search=`).
//!
//! This is the front half of the #281 research loop (`search → triage →
//! expand → fetch → read → map`). Unlike [`FsStore::search`](crate::store::FsStore)
//! (which re-finds papers already in the local store) and unlike the
//! citation `graph` walker, this module turns a free-text *topic* into a
//! ranked list of candidate papers — each carrying enough metadata
//! (title / abstract / year / venue / citation count / OA status / DOI)
//! for an agent to triage *before* any PDF is fetched.
//!
//! ## Capability tier (ADR-0031)
//!
//! Discovery search is **Tier 1 OA metadata, always-on**: there is no
//! `DOIGET_ENABLE_OPENALEX` gate and no Cargo-feature gate. It ships in
//! the default `oa-only` binary. The justification (ADR-0031 D1) is that
//! a bounded OpenAlex query is the same network-surface risk class as the
//! Crossref / Unpaywall calls Tier 1 already makes on every fetch:
//! read-only OA metadata, never paywalled, never a PDF.
//!
//! This is deliberately **distinct** from `crate::sources::openalex`
//! (the `#[cfg(feature = "metadata")]` enrichment / `referenced_works[]`
//! source used by `graph`, which stays Tier 2 behind
//! `DOIGET_ENABLE_OPENALEX`). The `Source` trait is `ref → FetchResult`;
//! search is `query → list`, so it does not fit that trait and lives here
//! as a free function reusing only the shared [`HttpClient`], rate
//! limiter, and provenance log via [`FetchContext`].
//!
//! ## Author / venue / publisher filters (ADR-0031 D5)
//!
//! OpenAlex filters authors / sources (venues) / publishers by **entity
//! ID**, not free text. So `paper_search` first resolves a supplied
//! `--author` / `--venue` / `--publisher` *name* to its OpenAlex ID via a
//! `?search=` lookup against `/authors`, `/sources`, `/publishers`, then
//! filters `/works` by `authorships.author.id` /
//! `primary_location.source.id` /
//! `primary_location.source.publisher_lineage`. The top hit is NOT taken
//! blindly: `select_entity` resolves only an unambiguous name (a single
//! hit, an exact case-insensitive name match, or a top hit that clearly
//! out-scores the runner-up); a name matching several entities with no
//! clear winner is a typed [`FetchError::Ambiguous`] listing the
//! candidates, and a name matching nothing is [`FetchError::NotFound`].
//! The filter is never silently dropped.
//!
//! ## Metadata-only contract (ADR-0031 D3)
//!
//! Every call here uses [`HttpClient::fetch_bytes`] (a JSON body),
//! **never** `fetch_pdf`, and never follows an OA URL. The abstract is
//! reconstructed from OpenAlex's `abstract_inverted_index`.
//!
//! [`HttpClient`]: crate::http::HttpClient
//! [`HttpClient::fetch_bytes`]: crate::http::HttpClient::fetch_bytes

use serde::Serialize;
use url::Url;

use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
use crate::source::{FetchContext, FetchError};

/// Source key used for the per-source HTTP client + redirect allowlist.
///
/// Within this module it is the key handed to `ctx.rate_limiter.acquire`
/// and `ctx.http.fetch_bytes`, and the `source` value recorded on every
/// provenance row written by `openalex_get`.
///
/// Shares the `"openalex"` key with `crate::sources::openalex` so that
/// `crate::http::discovery_allowlist` (always compiled) and
/// `tier_2_allowlist` (always compiled, but only *called* by the CLI
/// under `#[cfg(feature = "citation")]`) register the same
/// `api.openalex.org` host under one key (an idempotent overwrite — see
/// ADR-0031 D2).
const SOURCE_KEY: &str = "openalex";

/// OpenAlex `select=` field list, sent on the `/works` request only (the
/// entity lookups deliberately omit `select=`). Bounds the response
/// payload to the top-level fields [`PaperHit`] is built from; every entry
/// here is a top-level Work field (nested selection is not used).
///
/// `title` and `display_name` are both requested because `work_to_hit`
/// falls back from the former to the latter; `locations` feeds the
/// best-effort arXiv id extraction.
///
/// NOTE(review): `fwci` and `cited_by_percentile_year` are requested but
/// are not read by `work_to_hit` in this file — confirm whether another
/// consumer needs them or whether they can be dropped from the list.
const SELECT_FIELDS: &str = "id,doi,title,display_name,publication_year,\
cited_by_count,fwci,cited_by_percentile_year,abstract_inverted_index,authorships,\
primary_location,open_access,locations";

/// OpenAlex caps `per-page` at 200; requests above that are rejected by
/// the API. `build_search_url` clamps to this as defense-in-depth, but
/// the CLI rejects an out-of-range `--limit` up front (so the user is not
/// silently given fewer results than asked).
///
/// Also the upper bound enforced by [`PaperSearchQuery::validate`]
/// (`limit` must lie in `1..=MAX_PER_PAGE`).
pub const MAX_PER_PAGE: usize = 200;

/// Default page size when the caller does not specify `--limit`; this is
/// the `limit` that [`PaperSearchQuery::new`] fills in.
pub const DEFAULT_LIMIT: usize = 25;

/// How the discovery result set is ordered.
///
/// **Relevance is the only ordering offered** (issue #290). Checked
/// against live OpenAlex: any other sort key (`cited_by_count`, `fwci`,
/// `publication_date`) applied to OpenAlex's loose full-text match pushes
/// high-scoring but *off-topic* papers to the top, overriding the single
/// signal that keeps results on topic. "Important / recent /
/// high-quality" is therefore expressed through **filters** (`min_fwci` /
/// `min_percentile` / `from_year`), which shrink the candidate set while
/// leaving the relevance ordering intact. (A non-relevance sort is only
/// safe over a set that is already topically constrained — not over
/// free-text search.)
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum SearchSort {
    /// OpenAlex `relevance_score:desc`: the best textual match for the
    /// query comes first. Meaningful only because a search term is always
    /// part of the request.
    #[default]
    Relevance,
}

impl SearchSort {
    /// Render this ordering as the value of the OpenAlex `sort=` query
    /// parameter.
    #[must_use]
    pub fn as_openalex(self) -> &'static str {
        // Kept as an exhaustive `match` so adding a variant forces this
        // mapping to be revisited.
        match self {
            Self::Relevance => "relevance_score:desc",
        }
    }
}

/// A discovery-search request: the free-text query plus triage filters.
///
/// Construct directly (all fields are public) or start from
/// [`PaperSearchQuery::new`]; the CLI maps its flags onto this. Call
/// [`validate`](Self::validate) before searching to get a typed rejection
/// of out-of-range values.
#[derive(Debug, Clone)]
pub struct PaperSearchQuery {
    /// Free-text topic query (e.g. "tropical tensor networks for spin
    /// glasses"). Must be non-empty; the caller is expected to reject
    /// empty input. It is sent as a `title_and_abstract.search` filter
    /// clause, so any comma in it is replaced by a space when the URL is
    /// built.
    pub query: String,
    /// Maximum number of results to return, bounded to `1..=200` (OpenAlex
    /// `per-page` ceiling). [`validate`](Self::validate) **rejects** an
    /// out-of-range value; `paper_search` itself only clamps it as
    /// defense-in-depth (see the function's caller-side-validation note).
    pub limit: usize,
    /// Inclusive lower bound on publication year (maps to OpenAlex
    /// `from_publication_date:<year>-01-01`).
    pub from_year: Option<i32>,
    /// Inclusive upper bound on publication year (maps to OpenAlex
    /// `to_publication_date:<year>-12-31`).
    pub to_year: Option<i32>,
    /// When `true`, restrict to open-access works (`is_oa:true`); when
    /// `false`, no open-access clause is emitted.
    pub oa_only: bool,
    /// Minimum citation count. Maps to OpenAlex `cited_by_count:>{n}`
    /// ("more than n"); the off-by-one versus "at least n" is documented
    /// on the CLI flag.
    pub min_citations: Option<u64>,
    /// Minimum field-and-year-normalized citation impact (FWCI). Maps to
    /// OpenAlex `fwci:>{f}` — an impact floor that, unlike sorting by
    /// citations, narrows the set without overriding relevance (#290).
    /// [`validate`](Self::validate) rejects a negative or non-finite value.
    pub min_fwci: Option<f64>,
    /// Minimum within-cohort citation percentile (0–100). Maps to OpenAlex
    /// `cited_by_percentile_year.min:{p}` — "top-X% among same-year works";
    /// combined with `from_year` this is the "recent × already standing
    /// out" set (#290). [`validate`](Self::validate) rejects values above
    /// 100.
    pub min_percentile: Option<u8>,
    /// Author name to filter by. Resolved to an OpenAlex author ID via
    /// `/authors?search=` then applied as `authorships.author.id`. A blank
    /// (whitespace-only) name is treated as unset.
    pub author: Option<String>,
    /// Venue / journal name to filter by. Resolved to an OpenAlex source
    /// ID via `/sources?search=` then applied as
    /// `primary_location.source.id`. A blank name is treated as unset.
    pub venue: Option<String>,
    /// Publisher name to filter by. Resolved to an OpenAlex publisher ID
    /// via `/publishers?search=` then applied as
    /// `primary_location.source.publisher_lineage`. A blank name is
    /// treated as unset.
    pub publisher: Option<String>,
    /// Result ordering (currently always relevance — see [`SearchSort`]).
    pub sort: SearchSort,
}

impl PaperSearchQuery {
    /// Build a request for `query` with [`DEFAULT_LIMIT`] results, every
    /// filter unset, and relevance ordering.
    #[must_use]
    pub fn new(query: impl Into<String>) -> Self {
        Self {
            // The only caller-supplied piece.
            query: query.into(),
            limit: DEFAULT_LIMIT,
            sort: SearchSort::Relevance,
            // Everything below is "no filter".
            oa_only: false,
            from_year: None,
            to_year: None,
            min_citations: None,
            min_fwci: None,
            min_percentile: None,
            author: None,
            venue: None,
            publisher: None,
        }
    }

    /// Check the request shape and report the first problem found as a
    /// human-readable message.
    ///
    /// This is the one place where the boundary validation shared by the
    /// CLI and the MCP tool lives (`paper_search` itself is permissive —
    /// see its docs), so the two surfaces cannot drift apart.
    ///
    /// Checked, in order: `query` is not blank; `limit` lies in
    /// `1..=`[`MAX_PER_PAGE`]; `from_year` is not after `to_year`;
    /// `min_fwci` is finite and non-negative; `min_percentile` is at most
    /// 100.
    ///
    /// # Errors
    ///
    /// `Err(msg)` naming the first invalid field; `msg` can be shown to a
    /// user / agent as-is.
    pub fn validate(&self) -> Result<(), String> {
        if self.query.trim().is_empty() {
            return Err("search query is empty".to_string());
        }

        if self.limit < 1 || self.limit > MAX_PER_PAGE {
            return Err(format!(
                "limit must be between 1 and {MAX_PER_PAGE} (got {})",
                self.limit
            ));
        }

        // Only a range with both ends present can be inverted.
        match (self.from_year, self.to_year) {
            (Some(from), Some(to)) if from > to => {
                return Err(format!("from_year ({from}) is after to_year ({to})"));
            }
            _ => {}
        }

        // `min_fwci` is interpolated verbatim into a `fwci:>{f}` clause. A
        // negative, NaN or infinite value would produce a malformed
        // OpenAlex filter that the API rejects (or silently ignores), so
        // it is refused here alongside the year range (#290).
        match self.min_fwci {
            Some(f) if !(f.is_finite() && f >= 0.0) => {
                return Err(format!(
                    "min_fwci must be a finite, non-negative number (got {f})"
                ));
            }
            _ => {}
        }

        // A percentile is a 0–100 cohort rank. `u8` rules out negatives,
        // but 101–255 would yield a `cited_by_percentile_year.min` clause
        // that OpenAlex can never satisfy (empty result, no error).
        if let Some(p) = self.min_percentile.filter(|&p| p > 100) {
            return Err(format!(
                "min_percentile must be between 0 and 100 (got {p})"
            ));
        }

        Ok(())
    }
}

/// Discovery backend that produced a [`PaperHit`].
///
/// PR1 has a single source; `#[non_exhaustive]` reserves room for future
/// Tier-1 discovery backends (e.g. Semantic Scholar) without a breaking
/// change, while the wire form stays the lowercase source name (so the
/// JSON shape is unchanged from the previous `&'static str` field).
#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum DiscoverySource {
    /// OpenAlex `/works` discovery search (the query travels as a
    /// `title_and_abstract.search` filter clause). Serializes to
    /// `"openalex"`.
    OpenAlex,
}

/// One candidate paper returned by discovery search.
///
/// All fields except `openalex_id` / `title` / `authors` /
/// `cited_by_count` / `source` are `Option` because OpenAlex omits them
/// for some records (e.g. no DOI for a dataset, no abstract for an
/// Elsevier-gated abstract). Absent fields serialize to JSON `null` (not
/// skipped) so the wire shape is stable for agents.
///
/// Field order below is the serialized key order; do not reorder.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct PaperHit {
    /// Bare DOI (lower-cased, `https://doi.org/` prefix stripped), or
    /// `None` when the record has no DOI.
    pub doi: Option<String>,
    /// OpenAlex Work ID (`W…`, `https://openalex.org/` prefix stripped).
    /// An empty string signals a malformed upstream record (the `id` field
    /// was absent) that was kept rather than dropped so one bad record does
    /// not sink the page — do NOT use `""` as a fetchable id.
    pub openalex_id: String,
    /// arXiv id, best-effort extracted from a `locations[].*url`
    /// containing `arxiv.org/abs/<id>`; `None` if no arXiv location.
    pub arxiv: Option<String>,
    /// Work title: OpenAlex `title`, falling back to `display_name`, and
    /// the empty string when neither is a string.
    pub title: String,
    /// Author display names, in OpenAlex authorship order. Authorship
    /// entries without an `author.display_name` are skipped; the list is
    /// empty when the record has no `authorships` array.
    pub authors: Vec<String>,
    /// Publication year, or `None` if absent (or outside the `i32` range).
    pub year: Option<i32>,
    /// Primary venue display name (journal / repository), or `None`.
    pub venue: Option<String>,
    /// Reconstructed abstract text (words joined by single spaces in
    /// position order), or `None` when OpenAlex has no
    /// `abstract_inverted_index` for the record. Serialized under the key
    /// `abstract`.
    #[serde(rename = "abstract")]
    pub abstract_: Option<String>,
    /// OpenAlex `cited_by_count`; `0` when the field is absent.
    pub cited_by_count: u64,
    /// OpenAlex open-access status (`gold` / `green` / `hybrid` /
    /// `bronze` / `closed`), or `None`.
    pub oa_status: Option<String>,
    /// Discovery backend that produced this hit (PR1: always
    /// [`DiscoverySource::OpenAlex`]). Serializes to `"openalex"`.
    pub source: DiscoverySource,
}

/// The result of a discovery search: the hits plus the upstream total.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct PaperSearchResults {
    /// The candidate papers (length ≤ `query.limit`), one per element of
    /// the response's `results` array, in the order OpenAlex returned them.
    pub results: Vec<PaperHit>,
    /// OpenAlex `meta.count` — the total number of matching works
    /// upstream (usually far larger than `results.len()`), or `None` if
    /// the response omitted it (or it was not an unsigned integer). Lets
    /// an agent see "showing 25 of 4012".
    pub total_results: Option<u64>,
}

/// OpenAlex entity IDs resolved from the `--author` / `--venue` /
/// `--publisher` name filters (each `None` when the filter is unset or
/// blank). Filled in by `paper_search` and consumed by
/// `build_search_url`, which emits one filter clause per `Some` field.
#[derive(Debug, Default)]
struct ResolvedIds {
    /// Author ID (`A…`) for `authorships.author.id`.
    author: Option<String>,
    /// Source ID (`S…`) for `primary_location.source.id`.
    source: Option<String>,
    /// Publisher ID (`P…`) for `primary_location.source.publisher_lineage`.
    publisher: Option<String>,
}

/// Query OpenAlex for works matching `query` and return them as ranked
/// candidates.
///
/// `base` is the OpenAlex API origin (production
/// `https://api.openalex.org`; tests pass a wiremock origin, mirroring the
/// `DOIGET_OPENALEX_BASE` override the CLI honors). A non-empty
/// `contact_email` is forwarded as `?mailto=` to join the polite pool.
///
/// If `query.author` / `query.venue` / `query.publisher` are set, each is
/// first resolved — author, then venue, then publisher, one `?search=`
/// lookup apiece against `/authors` / `/sources` / `/publishers` — to an
/// OpenAlex ID, and `/works` is then filtered by those IDs. All requests
/// go through `ctx.http` (allowlisted, HTTPS-only in production),
/// `ctx.rate_limiter`, and `ctx.log` (one `Metadata`/`Fetch` provenance
/// row per request). No PDF is ever fetched (ADR-0031 D3).
///
/// ## Caller-side validation
///
/// The `query` shape is accepted permissively — validating it is the
/// caller's responsibility (the CLI does; an MCP tool should too). In
/// particular `query.limit` is **clamped** to `1..=200` rather than
/// rejected, and an inverted year range (`from_year > to_year`) is sent
/// as-is and produces an **empty** result set, not an error. Callers that
/// want a typed error for either should pre-validate.
///
/// # Errors
///
/// Returns [`FetchError::Http`] for transport / allowlist failures,
/// [`FetchError::NotFound`] when an author/venue/publisher name resolves
/// to nothing, [`FetchError::Ambiguous`] when such a name matches several
/// entities with no clear winner (carries a candidate listing),
/// [`FetchError::SourceSchema`] when a response is not a JSON object
/// carrying a `results` array, and propagates a provenance-log append
/// failure (fail-closed).
pub async fn paper_search(
    base: &Url,
    contact_email: &str,
    query: &PaperSearchQuery,
    ctx: &FetchContext,
) -> Result<PaperSearchResults, FetchError> {
    // Name → ID resolution happens before the works request, one lookup
    // per supplied filter; the first failure aborts the whole search.
    let author = resolve_optional(base, contact_email, "authors", &query.author, ctx).await?;
    let source = resolve_optional(base, contact_email, "sources", &query.venue, ctx).await?;
    let publisher =
        resolve_optional(base, contact_email, "publishers", &query.publisher, ctx).await?;
    let ids = ResolvedIds {
        author,
        source,
        publisher,
    };

    let works_url = build_search_url(base, contact_email, query, &ids)?;
    let (payload, _size) = openalex_get(&works_url, ctx).await?;

    // A body without a `results` array is a schema failure, not "no hits".
    let works = payload
        .get("results")
        .and_then(serde_json::Value::as_array)
        .ok_or_else(|| missing_results_array("search", &payload))?;

    Ok(PaperSearchResults {
        results: works.iter().map(work_to_hit).collect(),
        // `meta.count` is optional; anything but an unsigned integer is
        // reported as unknown.
        total_results: payload
            .pointer("/meta/count")
            .and_then(serde_json::Value::as_u64),
    })
}

/// Perform a single OpenAlex GET: wait on the rate limiter, download the
/// JSON body, parse it, then append the `Metadata`/`Fetch` provenance
/// row. Yields the parsed document together with the body's byte length
/// (callers only use the document; the length is returned so the size
/// accounting lives in one spot).
///
/// A body that fails to parse returns [`FetchError::SourceSchema`]
/// *before* any provenance row is written.
async fn openalex_get(
    url: &Url,
    ctx: &FetchContext,
) -> Result<(serde_json::Value, usize), FetchError> {
    // Politeness gate — the same limiter channel every source goes
    // through. The binding keeps whatever `acquire` returned alive until
    // this function exits.
    let _slot = ctx.rate_limiter.acquire(SOURCE_KEY).await;

    // JSON download (`select=` / `per-page=` keep the payload small).
    let (raw, _landed_at) = ctx.http.fetch_bytes(SOURCE_KEY, url.clone()).await?;
    let byte_len = raw.len();

    let parsed = match serde_json::from_slice::<serde_json::Value>(&raw) {
        Ok(document) => document,
        Err(e) => {
            return Err(FetchError::SourceSchema {
                hint: format!("openalex returned non-JSON: {e}"),
            });
        }
    };

    // Provenance: a Tier-1 metadata read. The request is a query, not a
    // single ref, so `ref_` / `canonical_digest` stay null per
    // docs/PROVENANCE_LOG.md.
    ctx.log.append(RowInput {
        event: LogEvent::Fetch,
        result: LogResult::Ok,
        capability: Capability::Metadata,
        ref_: None,
        source: Some(SOURCE_KEY),
        error_code: None,
        size_bytes: Some(byte_len as u64),
        license: None,
        store_path: None,
        canonical_digest: None,
    })?;

    Ok((parsed, byte_len))
}

/// Turn an optional name filter into an OpenAlex entity ID.
///
/// Yields `Ok(None)` without touching the network when `name` is `None`
/// or contains only whitespace; otherwise defers to `resolve_entity_id`
/// (passing the name exactly as given) and wraps its ID in `Some`.
async fn resolve_optional(
    base: &Url,
    contact_email: &str,
    entity_path: &str,
    name: &Option<String>,
    ctx: &FetchContext,
) -> Result<Option<String>, FetchError> {
    match name.as_deref().filter(|given| !given.trim().is_empty()) {
        None => Ok(None),
        Some(given) => resolve_entity_id(base, contact_email, entity_path, given, ctx)
            .await
            .map(Some),
    }
}

/// Look up `name` under `entity_path` (`authors` / `sources` /
/// `publishers`) with `?search=` and reduce the hits to one OpenAlex
/// entity ID.
///
/// OpenAlex `?search=` matches partially / fuzzily and ranks by
/// relevance, so even a vague name returns something. To avoid filtering
/// by the wrong entity without saying so, the top few candidates are
/// fetched and handed to [`select_entity`]: an unambiguous name (one hit,
/// an exact-name match, or a clearly dominant top hit) resolves, an
/// ambiguous one becomes a typed [`FetchError::Ambiguous`] listing the
/// candidates so the caller can narrow it, and a name with no hits is
/// [`FetchError::NotFound`]. The filter is never silently dropped.
async fn resolve_entity_id(
    base: &Url,
    contact_email: &str,
    entity_path: &str,
    name: &str,
    ctx: &FetchContext,
) -> Result<String, FetchError> {
    let endpoint = format!("/{entity_path}");
    let mut url = base
        .join(&endpoint)
        .map_err(|e| FetchError::SourceSchema {
            hint: format!("openalex {entity_path} URL construction failed: {e}"),
        })?;
    {
        let mut pairs = url.query_pairs_mut();
        // Ask for a handful of candidates so an ambiguous name can be
        // reported with alternatives rather than resolving to the first
        // hit. `select=` is left out on purpose: that way OpenAlex includes
        // `relevance_score` (search responses only) next to
        // `display_name` / `works_count`.
        pairs
            .append_pair("search", name)
            .append_pair("per-page", "5");
        if !contact_email.is_empty() {
            pairs.append_pair("mailto", contact_email);
        }
    }

    let (payload, _size) = openalex_get(&url, ctx).await?;

    // Valid JSON lacking a `results` array (e.g. an OpenAlex error
    // envelope for a rate limit / bad filter) is a schema failure, NOT an
    // empty match set — same rule as the `/works` path. Treating it as
    // "zero hits" would report a misleading "no <entity> matched"
    // NotFound and silently lose the user's filter.
    let hits = payload
        .get("results")
        .and_then(serde_json::Value::as_array)
        .ok_or_else(|| missing_results_array(&endpoint, &payload))?;

    // Entries without a string `id` are discarded by `from_value`.
    let mut ranked: Vec<Candidate> = hits.iter().filter_map(Candidate::from_value).collect();

    // Search hits already arrive relevance-sorted; sorting again (highest
    // score first, incomparable scores treated as equal) makes the
    // dominance check independent of that.
    ranked.sort_by(|lhs, rhs| {
        rhs.relevance
            .partial_cmp(&lhs.relevance)
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    select_entity(entity_path, name, &ranked)
}

/// One OpenAlex entity-search candidate (author / source / publisher),
/// parsed from a single element of a lookup response's `results` array.
struct Candidate {
    /// Bare OpenAlex ID (`A…` / `S…` / `P…`), i.e. with everything up to
    /// the last `/` removed.
    id: String,
    /// Entity display name (used for the exact-match check + listings);
    /// empty when the record carried none.
    display_name: String,
    /// Number of works attributed to the entity (shown in the ambiguity
    /// listing so the caller can spot the prolific / canonical match);
    /// `0` when absent.
    works_count: u64,
    /// OpenAlex `relevance_score` for the search query (0.0 if absent).
    relevance: f64,
}

impl Candidate {
    fn from_value(v: &serde_json::Value) -> Option<Self> {
        let id = v
            .get("id")
            .and_then(serde_json::Value::as_str)
            .map(strip_openalex_prefix)?;
        Some(Self {
            id,
            display_name: v
                .get("display_name")
                .and_then(serde_json::Value::as_str)
                .unwrap_or("")
                .to_string(),
            works_count: v
                .get("works_count")
                .and_then(serde_json::Value::as_u64)
                .unwrap_or(0),
            relevance: v
                .get("relevance_score")
                .and_then(serde_json::Value::as_f64)
                .unwrap_or(0.0),
        })
    }
}

/// Relevance-dominance ratio: with no exact-name match, the top hit must
/// out-score the runner-up by at least this factor to be auto-selected;
/// otherwise the name is treated as ambiguous.
///
/// `select_entity` only applies the ratio when both scores are strictly
/// positive, so a missing `relevance_score` (parsed as 0.0) can never make
/// a hit "dominant".
const DOMINANCE_RATIO: f64 = 2.0;

/// Pick a single entity from relevance-sorted search `candidates`, or
/// report ambiguity.
///
/// Resolution order:
///
/// 1. no candidates → [`FetchError::NotFound`];
/// 2. a single candidate → that candidate;
/// 3. exactly one candidate whose trimmed display name equals the trimmed
///    `name` case-insensitively → that candidate. The comparison is
///    Unicode-aware (`str::to_lowercase`), so names that differ only in
///    the case of a non-ASCII letter ("émile" vs "Émile") count as exact;
/// 4. no exact match at all, and the top hit out-scores the runner-up by
///    [`DOMINANCE_RATIO`] (both scores present) → the top hit;
/// 5. anything else (including several exact matches) →
///    [`FetchError::Ambiguous`] listing the candidates.
///
/// `entity_path` is only used to pick the human label for error hints.
/// `candidates` must already be sorted by descending relevance.
///
/// # Errors
///
/// [`FetchError::NotFound`] for an empty candidate list and
/// [`FetchError::Ambiguous`] when no rule above singles out one entity.
fn select_entity(
    entity_path: &str,
    name: &str,
    candidates: &[Candidate],
) -> Result<String, FetchError> {
    let label = entity_label(entity_path);

    // Slice patterns settle the 0- and 1-candidate cases and hand back the
    // two leading hits without any panicking index.
    let (top, second) = match candidates {
        [] => {
            return Err(FetchError::NotFound {
                hint: format!("no OpenAlex {label} matched '{name}'"),
            });
        }
        [only] => return Ok(only.id.clone()),
        [top, second, ..] => (top, second),
    };

    // Full Unicode case folding rather than ASCII-only: author and venue
    // names routinely contain accented letters, and an ASCII-only compare
    // would miss an otherwise exact match on them.
    let wanted = name.trim().to_lowercase();
    let mut exact = candidates
        .iter()
        .filter(|c| c.display_name.trim().to_lowercase() == wanted);

    match (exact.next(), exact.next()) {
        // Exactly one exact-name match wins outright.
        (Some(only), None) => return Ok(only.id.clone()),
        // No exact match: fall back to relevance dominance.
        (None, _) => {
            // Both scores must be present (> 0.0): a runner-up with an
            // absent `relevance_score` (defaulted to 0.0) would otherwise
            // make `top >= RATIO * 0.0` trivially true and silently
            // auto-select the top hit, defeating the ambiguity guard. When
            // the runner-up has no score dominance cannot be judged, so
            // the name is treated as ambiguous.
            if top.relevance > 0.0
                && second.relevance > 0.0
                && top.relevance >= DOMINANCE_RATIO * second.relevance
            {
                return Ok(top.id.clone());
            }
        }
        // Several exact matches (namesakes): never guess between them.
        (Some(_), Some(_)) => {}
    }

    Err(FetchError::Ambiguous {
        hint: format_ambiguous(label, name, candidates),
    })
}

/// Map an OpenAlex entity path to the singular noun used in user-facing
/// messages (`sources` is shown as "venue"). An unrecognised path is
/// returned unchanged.
fn entity_label(entity_path: &str) -> &str {
    /// `(entity path, label)` pairs.
    const LABELS: [(&str, &str); 3] = [
        ("authors", "author"),
        ("sources", "venue"),
        ("publishers", "publisher"),
    ];

    LABELS
        .iter()
        .find(|(path, _)| *path == entity_path)
        .map_or(entity_path, |&(_, label)| label)
}

/// Render the ambiguity error: the query plus the candidate listing
/// (display name, id, works count) so the caller can narrow the name.
fn format_ambiguous(label: &str, name: &str, candidates: &[Candidate]) -> String {
    let mut s = format!(
        "ambiguous {label} '{name}' — {} candidates; narrow the name \
         (add a first name / fuller title) and retry:",
        candidates.len()
    );
    for c in candidates.iter().take(5) {
        s.push_str(&format!(
            "\n  {} ({}, {} works)",
            c.display_name, c.id, c.works_count
        ));
    }
    s
}

/// Assemble the `/works?per-page=&sort=&select=&filter=&mailto=` request
/// URL for `query`, using the already-resolved entity `ids`.
///
/// The free-text query is carried inside `filter=` as a
/// `title_and_abstract.search` clause; `mailto` is appended only for a
/// non-empty `contact_email`; `per-page` is `query.limit` clamped to
/// `1..=`[`MAX_PER_PAGE`].
///
/// # Errors
///
/// [`FetchError::SourceSchema`] if `/works` cannot be joined onto `base`.
fn build_search_url(
    base: &Url,
    contact_email: &str,
    query: &PaperSearchQuery,
    ids: &ResolvedIds,
) -> Result<Url, FetchError> {
    let mut url = match base.join("/works") {
        Ok(joined) => joined,
        Err(e) => {
            return Err(FetchError::SourceSchema {
                hint: format!("openalex search URL construction failed: {e}"),
            });
        }
    };

    let per_page = query.limit.max(1).min(MAX_PER_PAGE);

    // The topic is matched against title + abstract only, as a FILTER and
    // not through the loose `search=` parameter (#290): `search=` also
    // covers full text, which admits off-topic hits, whereas
    // `title_and_abstract.search` is the precision form. Commas would
    // split the comma-joined filter list, so they are turned into spaces
    // (they carry no search meaning here).
    let topic = format!(
        "title_and_abstract.search:{}",
        query.query.replace(',', " ")
    );

    // One optional clause per filter, in a fixed order; unset filters
    // contribute nothing. OpenAlex ANDs the comma-separated clauses of a
    // single `filter` parameter.
    let clauses: Vec<String> = vec![
        Some(topic),
        query
            .from_year
            .map(|from| format!("from_publication_date:{from}-01-01")),
        query
            .to_year
            .map(|to| format!("to_publication_date:{to}-12-31")),
        query.oa_only.then(|| "is_oa:true".to_string()),
        // `cited_by_count:>{n}` means strictly more than `n` citations;
        // the off-by-one versus "at least n" is documented on the CLI
        // flag.
        query
            .min_citations
            .map(|min| format!("cited_by_count:>{min}")),
        // Field-and-year-normalized impact floor (#290): narrows the set
        // without overriding relevance, unlike a `sort=fwci`.
        query.min_fwci.map(|f| format!("fwci:>{f}")),
        // Top-X% within the same-year cohort (#290).
        query
            .min_percentile
            .map(|p| format!("cited_by_percentile_year.min:{p}")),
        ids.author
            .as_ref()
            .map(|author_id| format!("authorships.author.id:{author_id}")),
        ids.source
            .as_ref()
            .map(|source_id| format!("primary_location.source.id:{source_id}")),
        ids.publisher.as_ref().map(|publisher_id| {
            format!("primary_location.source.publisher_lineage:{publisher_id}")
        }),
    ]
    .into_iter()
    .flatten()
    .collect();

    {
        let mut pairs = url.query_pairs_mut();
        // No `search=` pair: the query lives in the filter clause above
        // (#290). `clauses` always holds at least that clause, so
        // `filter` is never empty.
        pairs
            .append_pair("per-page", &per_page.to_string())
            .append_pair("sort", query.sort.as_openalex())
            .append_pair("select", SELECT_FIELDS)
            .append_pair("filter", &clauses.join(","));
        if !contact_email.is_empty() {
            pairs.append_pair("mailto", contact_email);
        }
    }

    Ok(url)
}

/// Map one OpenAlex Work JSON object to a [`PaperHit`].
///
/// Tolerant of missing fields: anything absent becomes `None` / empty
/// rather than failing the whole search (one malformed record should not
/// sink the page).
fn work_to_hit(work: &serde_json::Value) -> PaperHit {
    let openalex_id = work
        .get("id")
        .and_then(serde_json::Value::as_str)
        .map(strip_openalex_prefix)
        .unwrap_or_default();

    let doi = work
        .get("doi")
        .and_then(serde_json::Value::as_str)
        .map(strip_doi_prefix);

    let title = work
        .get("title")
        .and_then(serde_json::Value::as_str)
        .or_else(|| work.get("display_name").and_then(serde_json::Value::as_str))
        .unwrap_or("")
        .to_string();

    let authors = work
        .get("authorships")
        .and_then(serde_json::Value::as_array)
        .map(|arr| {
            arr.iter()
                .filter_map(|a| {
                    a.get("author")
                        .and_then(|au| au.get("display_name"))
                        .and_then(serde_json::Value::as_str)
                        .map(str::to_string)
                })
                .collect()
        })
        .unwrap_or_default();

    let year = work
        .get("publication_year")
        .and_then(serde_json::Value::as_i64)
        .and_then(|y| i32::try_from(y).ok());

    let venue = work
        .get("primary_location")
        .and_then(|loc| loc.get("source"))
        .and_then(|src| src.get("display_name"))
        .and_then(serde_json::Value::as_str)
        .map(str::to_string);

    let abstract_ = work
        .get("abstract_inverted_index")
        .and_then(reconstruct_abstract);

    let cited_by_count = work
        .get("cited_by_count")
        .and_then(serde_json::Value::as_u64)
        .unwrap_or(0);

    let oa_status = work
        .get("open_access")
        .and_then(|oa| oa.get("oa_status"))
        .and_then(serde_json::Value::as_str)
        .map(str::to_string);

    let arxiv = work
        .get("locations")
        .and_then(serde_json::Value::as_array)
        .and_then(|locs| locs.iter().find_map(extract_arxiv_from_location));

    PaperHit {
        doi,
        openalex_id,
        arxiv,
        title,
        authors,
        year,
        venue,
        abstract_,
        cited_by_count,
        oa_status,
        source: DiscoverySource::OpenAlex,
    }
}

/// Rebuild plain abstract text from OpenAlex's
/// `abstract_inverted_index` (`{ word: [positions...] }`): every
/// `(position, word)` occurrence is collected, ordered by position, and
/// the words are joined with single spaces.
///
/// Returns `None` when the value is not an object or yields no usable
/// occurrence (null, empty object, or only malformed entries). Position
/// lists that are not arrays and positions that are not unsigned integers
/// are skipped.
fn reconstruct_abstract(inv: &serde_json::Value) -> Option<String> {
    let index = inv.as_object()?;

    let mut occurrences: Vec<(u64, &str)> = index
        .iter()
        .filter_map(|(word, slots)| Some((word.as_str(), slots.as_array()?)))
        .flat_map(|(word, slots)| {
            slots
                .iter()
                .filter_map(move |slot| Some((slot.as_u64()?, word)))
        })
        .collect();

    if occurrences.is_empty() {
        return None;
    }

    // Stable sort: occurrences sharing a position keep collection order.
    occurrences.sort_by_key(|&(at, _)| at);

    let ordered: Vec<&str> = occurrences.iter().map(|&(_, word)| word).collect();
    Some(ordered.join(" "))
}

/// Best-effort arXiv id extraction from a single OpenAlex location's
/// `landing_page_url` / `pdf_url` (inspected in that order).
///
/// Recognises `arxiv.org/abs/<id>` and `arxiv.org/pdf/<id>[.pdf]` and
/// returns the bare `<id>`. A trailing `vN` version is kept — the
/// downstream parser accepts it. Legacy (pre-2007) identifiers of the form
/// `<archive>/<7 digits>` (e.g. `hep-th/9901001`) are returned whole rather
/// than being cut at the `/` inside the identifier.
///
/// Returns `None` when neither URL is a string containing a recognisable
/// arXiv id.
fn extract_arxiv_from_location(loc: &serde_json::Value) -> Option<String> {
    ["landing_page_url", "pdf_url"]
        .iter()
        .filter_map(|key| loc.get(*key).and_then(serde_json::Value::as_str))
        .find_map(arxiv_id_from_url)
}

/// Pull the arXiv id out of one URL string, trying the `/abs/` form first
/// and the `/pdf/` form second. Returns `None` if neither form yields a
/// non-empty id.
fn arxiv_id_from_url(url: &str) -> Option<String> {
    const MARKERS: [&str; 2] = ["arxiv.org/abs/", "arxiv.org/pdf/"];

    MARKERS.iter().find_map(|marker| {
        let tail = &url[url.find(*marker)? + marker.len()..];

        // Drop any query string, fragment, or text following a space.
        let path = tail
            .split(|c: char| matches!(c, '?' | '#' | ' '))
            .next()
            .unwrap_or_default();

        let mut segments = path.split('/');
        let first = segments.next().unwrap_or_default();
        let id = match segments.next() {
            // Legacy ids contain exactly one `/`; keep both halves.
            Some(second) if is_legacy_arxiv_pair(first, second) => {
                &path[..first.len() + 1 + second.len()]
            }
            // Modern ids (and anything unrecognised) end at the first `/`.
            _ => first,
        };

        // `/pdf/` links frequently carry a `.pdf` extension that is not
        // part of the identifier.
        let id = id.strip_suffix(".pdf").unwrap_or(id);
        (!id.is_empty()).then(|| id.to_string())
    })
}

/// Whether `archive` / `number` look like the two halves of a legacy arXiv
/// id: an archive name starting with an ASCII letter, followed by a
/// segment that opens with seven ASCII digits (`YYMMNNN`).
fn is_legacy_arxiv_pair(archive: &str, number: &str) -> bool {
    archive.starts_with(|c: char| c.is_ascii_alphabetic())
        && number.len() >= 7
        && number.bytes().take(7).all(|b| b.is_ascii_digit())
}

/// Reduce an OpenAlex entity URL (`https://openalex.org/W…`) to its bare
/// `W…` / `A…` / `S…` / `P…` id by keeping only what follows the last `/`.
/// An input without any `/` is returned unchanged.
fn strip_openalex_prefix(id: &str) -> String {
    let bare = match id.rsplit_once('/') {
        Some((_, tail)) => tail,
        None => id,
    };
    bare.to_owned()
}

/// Turn a DOI URL into the bare, lower-cased DOI.
///
/// The input is ASCII-lower-cased first (DOIs are case-insensitive and
/// lower-case is the canonical store form), then a leading
/// `https://doi.org/` or `http://doi.org/` is removed. Input carrying
/// neither prefix is returned lower-cased but otherwise untouched.
fn strip_doi_prefix(doi_url: &str) -> String {
    const RESOLVER_PREFIXES: [&str; 2] = ["https://doi.org/", "http://doi.org/"];

    let canonical = doi_url.to_ascii_lowercase();
    let bare = RESOLVER_PREFIXES
        .iter()
        .find_map(|prefix| canonical.strip_prefix(*prefix))
        .unwrap_or(canonical.as_str());
    bare.to_owned()
}

/// Shorten a response body to at most 200 characters for use in an error
/// hint, so a multi-KB malformed payload does not flood a single log line.
///
/// The body is decoded lossily as UTF-8. The cut is made on a `char`
/// boundary (never mid-code-point), so multi-byte characters straddling
/// the cap — common in OpenAlex error payloads, which embed `…`/curly
/// quotes — cannot cause a slicing panic. A truncated result ends in `…`;
/// a body that already fits is returned unchanged.
fn truncate_for_hint(body: &[u8]) -> String {
    const MAX: usize = 200;

    let text = String::from_utf8_lossy(body);
    // Byte offset of the (MAX + 1)-th character, if the text is that long.
    let cut = text.char_indices().nth(MAX).map(|(offset, _)| offset);
    match cut {
        Some(offset) => format!("{}…", &text[..offset]),
        None => text.into_owned(),
    }
}

/// Construct the `SourceSchema` error reported when an OpenAlex response
/// parses as JSON but has no `results` array (e.g. an error envelope).
///
/// Both the `/works` search path and the entity-resolution path go through
/// this helper, so the "do NOT collapse to an empty Vec" contract lives in
/// one place. `context` names the endpoint for the hint (`"search"` or
/// `"/authors"`); the offending payload is serialised and truncated into
/// the hint.
fn missing_results_array(context: &str, value: &serde_json::Value) -> FetchError {
    let rendered = value.to_string();
    let snippet = truncate_for_hint(rendered.as_bytes());
    let hint = format!(
        "openalex {context} response missing `results` array — likely an \
         error payload (got: {snippet})"
    );
    FetchError::SourceSchema { hint }
}

// ---------------------------------------------------------------------------
// DOI ↔ arXiv linking (#281 item 5)
// ---------------------------------------------------------------------------

/// The cross-identifier "identity cluster" for a single work: its DOI, its
/// arXiv preprint id (when one exists), the OpenAlex Work id, and the
/// title.
///
/// This is the primitive behind #281 item 5 (arXiv ↔ published-DOI
/// linking & dedup): given a published DOI, an agent can discover whether a
/// free arXiv preprint of the **same work** exists (to read its full text,
/// or to avoid fetching the preprint and the journal version twice).
///
/// Values are produced by `work_to_links` from one OpenAlex Work object and
/// returned to callers by [`resolve_links_for_doi`]. The type is
/// serialisable (`Serialize`) and comparable (`PartialEq` / `Eq`).
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct PaperLinks {
    /// Bare DOI (resolver prefix stripped, lower-cased), or `None` if
    /// OpenAlex has none for the work.
    pub doi: Option<String>,
    /// arXiv id of the preprint of this work, or `None` when no arXiv
    /// location is recorded. A trailing version (`v2`) is kept.
    pub arxiv: Option<String>,
    /// OpenAlex Work id in bare form (`W…`, without the
    /// `https://openalex.org/` prefix).
    pub openalex_id: String,
    /// Work title; taken from `title`, falling back to `display_name`, and
    /// empty when the record carries neither.
    pub title: String,
}

/// Look up a **DOI** on OpenAlex (`/works?filter=doi:<doi>`) and return
/// the matching work's [`PaperLinks`] identity cluster — most usefully,
/// whether the work has an arXiv preprint.
///
/// `base` / `contact_email` / `ctx` play the same role as in
/// [`paper_search`] (Tier-1 OA metadata, always-on; a single bounded
/// `/works` query; `HttpClient::fetch_bytes`, never a PDF). Only the first
/// result is considered. The arXiv id comes from the work's `locations[]` /
/// `primary_location` / `best_oa_location` URLs (e.g.
/// `arxiv.org/abs/<id>`), using the same extraction as discovery search.
///
/// # Errors
///
/// * [`FetchError::NotFound`] — no OpenAlex work matches the DOI.
/// * [`FetchError::SourceSchema`] — the response is not a JSON object with
///   a `results` array, or the matched work carries no `id`.
/// * [`FetchError::Http`] — transport failures.
/// * A provenance-log append failure is propagated (fail-closed).
pub async fn resolve_links_for_doi(
    base: &Url,
    contact_email: &str,
    doi: &str,
    ctx: &FetchContext,
) -> Result<PaperLinks, FetchError> {
    let lookup_url = build_doi_lookup_url(base, contact_email, doi)?;
    let (payload, _raw) = openalex_get(&lookup_url, ctx).await?;

    // No `results` array means an error envelope, not "zero matches".
    let works = match payload.get("results").and_then(serde_json::Value::as_array) {
        Some(works) => works,
        None => return Err(missing_results_array("doi-lookup", &payload)),
    };

    let work = match works.first() {
        Some(work) => work,
        None => {
            return Err(FetchError::NotFound {
                hint: format!("no OpenAlex work matched doi '{doi}'"),
            })
        }
    };

    let links = work_to_links(work);
    // A matched work always carries an `id`; a blank one signals a
    // malformed record, which is reported as a schema error instead of
    // handing back a cluster with an empty `openalex_id` (review #287).
    if links.openalex_id.is_empty() {
        Err(FetchError::SourceSchema {
            hint: format!("openalex work for doi '{doi}' has no id"),
        })
    } else {
        Ok(links)
    }
}

/// Assemble the single-work DOI lookup URL:
/// `/works?filter=doi:<doi>&per-page=1&select=…[&mailto=…]`.
///
/// The `filter` value goes through `query_pairs_mut`, which URL-encodes
/// it, so a DOI's `/` and `:` travel safely (a `/works/doi:<doi>` path
/// form would instead have its path split by the suffix `/`). `mailto` is
/// appended only when `contact_email` is non-empty.
///
/// # Errors
///
/// [`FetchError::SourceSchema`] if `/works` cannot be joined onto `base`.
fn build_doi_lookup_url(base: &Url, contact_email: &str, doi: &str) -> Result<Url, FetchError> {
    const SELECT_FIELDS: &str =
        "id,doi,title,display_name,locations,primary_location,best_oa_location";

    let mut lookup = match base.join("/works") {
        Ok(joined) => joined,
        Err(e) => {
            return Err(FetchError::SourceSchema {
                hint: format!("openalex doi-lookup URL construction failed: {e}"),
            })
        }
    };

    let doi_filter = format!("doi:{doi}");
    {
        // The serializer writes back into `lookup` when it is dropped at
        // the end of this scope.
        let mut pairs = lookup.query_pairs_mut();
        pairs
            .append_pair("filter", &doi_filter)
            .append_pair("per-page", "1")
            .append_pair("select", SELECT_FIELDS);
        if !contact_email.is_empty() {
            pairs.append_pair("mailto", contact_email);
        }
    }
    Ok(lookup)
}

/// Convert one OpenAlex Work JSON object into a [`PaperLinks`].
///
/// * `openalex_id` — `id` with the URL prefix stripped; empty if absent.
/// * `doi` — `doi` with the resolver prefix stripped and lower-cased.
/// * `title` — `title`, else `display_name`, else the empty string.
/// * `arxiv` — the first arXiv id found when scanning every entry of
///   `locations[]`, then `primary_location`, then `best_oa_location`.
fn work_to_links(work: &serde_json::Value) -> PaperLinks {
    let text_field = |key: &str| work.get(key).and_then(serde_json::Value::as_str);

    let openalex_id = text_field("id").map_or_else(String::new, strip_openalex_prefix);
    let doi = text_field("doi").map(strip_doi_prefix);
    let title = text_field("title")
        .or_else(|| text_field("display_name"))
        .unwrap_or_default()
        .to_owned();

    // Candidate locations in priority order; the search stops at the first
    // one that yields an arXiv id.
    let arxiv = work
        .get("locations")
        .and_then(serde_json::Value::as_array)
        .into_iter()
        .flatten()
        .chain(work.get("primary_location"))
        .chain(work.get("best_oa_location"))
        .find_map(extract_arxiv_from_location);

    PaperLinks {
        doi,
        arxiv,
        openalex_id,
        title,
    }
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    use std::sync::Arc;

    use camino::Utf8PathBuf;
    use tempfile::TempDir;
    use wiremock::matchers::{method, path, query_param};
    use wiremock::{Mock, MockServer, ResponseTemplate};

    use crate::http::HttpClient;
    use crate::provenance::ProvenanceLog;
    use crate::rate_limiter::RateLimiter;
    use crate::RateLimits;

    /// Hand-crafted (not a snapshot) OpenAlex `/works` search response.
    /// Synthetic to avoid third-party redistribution concerns.
    ///
    /// Holds two records: the first populates every `PaperHit` field
    /// (including an inverted-index abstract and an arXiv landing page);
    /// the second is sparse — `doi` and `abstract_inverted_index` are
    /// `null`, `authorships` is empty, and `primary_location` /
    /// `locations` are absent altogether. `meta.count` is 4012.
    const SAMPLE_SEARCH: &str = r#"{
        "meta": { "count": 4012, "per_page": 25 },
        "results": [
            {
                "id": "https://openalex.org/W123",
                "doi": "https://doi.org/10.1234/Example",
                "title": "Tropical Tensor Networks",
                "display_name": "Tropical Tensor Networks",
                "publication_year": 2021,
                "cited_by_count": 42,
                "abstract_inverted_index": { "Tropical": [0], "tensor": [1], "networks": [2] },
                "authorships": [
                    { "author": { "display_name": "Ada Lovelace" } },
                    { "author": { "display_name": "Alan Turing" } }
                ],
                "primary_location": { "source": { "display_name": "Phys. Rev. B" } },
                "open_access": { "oa_status": "green", "is_oa": true },
                "locations": [
                    { "landing_page_url": "https://arxiv.org/abs/2101.12345v2" }
                ]
            },
            {
                "id": "https://openalex.org/W456",
                "doi": null,
                "title": "Second Paper",
                "publication_year": 2019,
                "cited_by_count": 7,
                "abstract_inverted_index": null,
                "authorships": [],
                "open_access": { "oa_status": "closed" }
            }
        ]
    }"#;

    /// Assemble a [`FetchContext`] for tests: an HTTP client created with
    /// `new_for_tests_allow_http` for `wiremock_host`, the hard-coded rate
    /// limits, and a provenance log written to `test.jsonl` inside a fresh
    /// temp dir. The [`TempDir`] is returned alongside so the caller keeps
    /// it alive for the duration of the test.
    fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
        let tmp = TempDir::new().expect("tempdir");
        let log_path = Utf8PathBuf::try_from(tmp.path().to_path_buf())
            .expect("temp dir path must be UTF-8")
            .join("test.jsonl");
        let session_id = String::from("01J0000000000000000000TEST");

        // Fields are evaluated top to bottom, so the id is cloned for the
        // log before it is moved into the context.
        let ctx = FetchContext {
            http: Arc::new(HttpClient::new_for_tests_allow_http(
                "openalex",
                wiremock_host,
            )),
            rate_limiter: Arc::new(RateLimiter::new(RateLimits::HARD_CODED)),
            log: Arc::new(
                ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
            ),
            session_id,
            cache_root: None,
        };
        (tmp, ctx)
    }

    /// End-to-end mapping check: a mocked `/works` endpoint serves
    /// `SAMPLE_SEARCH`, and every `PaperHit` field of both returned records
    /// is asserted — the fully-populated first record and the sparse
    /// second one.
    #[tokio::test]
    async fn search_maps_works_to_hits() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path("/works"))
            // #290: the query is a `title_and_abstract.search` filter clause.
            .and(query_param(
                "filter",
                "title_and_abstract.search:tropical tensor networks",
            ))
            .and(query_param("mailto", "doiget@localhost"))
            .respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_SEARCH))
            .mount(&server)
            .await;

        let (_td, ctx) = build_test_context(&server.uri());
        let base = Url::parse(&server.uri()).expect("wiremock URI parses");
        let q = PaperSearchQuery::new("tropical tensor networks");

        let out = paper_search(&base, "doiget@localhost", &q, &ctx)
            .await
            .expect("search ok");

        assert_eq!(out.total_results, Some(4012));
        assert_eq!(out.results.len(), 2);

        // First record: every field is populated.
        let first = &out.results[0];
        assert_eq!(first.openalex_id, "W123");
        assert_eq!(first.doi.as_deref(), Some("10.1234/example")); // lower-cased
        assert_eq!(first.title, "Tropical Tensor Networks");
        assert_eq!(first.year, Some(2021));
        assert_eq!(first.cited_by_count, 42);
        assert_eq!(first.abstract_.as_deref(), Some("Tropical tensor networks"));
        assert_eq!(first.authors, vec!["Ada Lovelace", "Alan Turing"]);
        assert_eq!(first.venue.as_deref(), Some("Phys. Rev. B"));
        assert_eq!(first.oa_status.as_deref(), Some("green"));
        assert_eq!(first.arxiv.as_deref(), Some("2101.12345v2"));
        assert_eq!(first.source, DiscoverySource::OpenAlex);

        // Second record: null / absent optional fields map to `None` / empty.
        let second = &out.results[1];
        assert_eq!(second.openalex_id, "W456");
        assert_eq!(second.doi, None);
        assert_eq!(second.abstract_, None);
        assert_eq!(second.venue, None);
        assert!(second.authors.is_empty());
        assert_eq!(second.oa_status.as_deref(), Some("closed"));
        assert_eq!(second.arxiv, None);
    }

    /// Checks that year / OA / citation filters, the relevance sort, and
    /// the page size all reach the wire: the mock only matches when the
    /// exact `sort`, `filter`, and `per-page` query parameters are present,
    /// so a successful (empty) result proves the URL was composed as
    /// expected.
    #[tokio::test]
    async fn search_filters_and_sort_land_on_the_url() {
        let server = MockServer::start().await;
        // Assert the composed filter + sort params reach the wire.
        Mock::given(method("GET"))
            .and(path("/works"))
            // #290: relevance is the only sort; the query is the leading
            // `title_and_abstract.search` filter clause.
            .and(query_param("sort", "relevance_score:desc"))
            .and(query_param(
                "filter",
                "title_and_abstract.search:spin glass,from_publication_date:2020-01-01,is_oa:true,cited_by_count:>10",
            ))
            .and(query_param("per-page", "5"))
            .respond_with(
                ResponseTemplate::new(200)
                    .set_body_string(r#"{ "meta": { "count": 0 }, "results": [] }"#),
            )
            .mount(&server)
            .await;

        let (_td, ctx) = build_test_context(&server.uri());
        let base = Url::parse(&server.uri()).expect("wiremock URI parses");
        let q = PaperSearchQuery {
            query: "spin glass".to_string(),
            limit: 5,
            from_year: Some(2020),
            to_year: None,
            oa_only: true,
            min_citations: Some(10),
            min_fwci: None,
            min_percentile: None,
            author: None,
            venue: None,
            publisher: None,
            sort: SearchSort::Relevance,
        };

        let out = paper_search(&base, "doiget@localhost", &q, &ctx)
            .await
            .expect("search ok");
        assert_eq!(out.total_results, Some(0));
        assert!(out.results.is_empty());
    }

    /// A 200 response whose JSON body has no `results` key (an error
    /// envelope) must surface as `FetchError::SourceSchema` rather than as
    /// an empty result list.
    #[tokio::test]
    async fn search_error_payload_is_source_schema() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path("/works"))
            .respond_with(
                ResponseTemplate::new(200)
                    .set_body_string(r#"{"error":"Invalid query parameters"}"#),
            )
            .mount(&server)
            .await;

        let (_td, ctx) = build_test_context(&server.uri());
        let base = Url::parse(&server.uri()).expect("wiremock URI parses");
        let q = PaperSearchQuery::new("anything");

        let err = paper_search(&base, "", &q, &ctx)
            .await
            .expect_err("missing `results` must surface as SourceSchema");
        assert!(matches!(err, FetchError::SourceSchema { .. }));
    }

    /// Pre-resolved author / source / publisher ids must each appear as
    /// their own clause in the `filter` parameter, and an empty contact
    /// email must leave `mailto` off the URL.
    #[test]
    fn name_filters_compose_into_resolved_ids() {
        let resolved = ResolvedIds {
            author: Some(String::from("A1")),
            source: Some(String::from("S2")),
            publisher: Some(String::from("P3")),
        };
        let api_root = Url::parse("https://api.openalex.org").expect("base parses");
        let topic = PaperSearchQuery::new("topic");

        let url = build_search_url(&api_root, "", &topic, &resolved).expect("url builds");
        let filter = param(&url, "filter").expect("filter param present");

        for clause in [
            "authorships.author.id:A1",
            "primary_location.source.id:S2",
            "primary_location.source.publisher_lineage:P3",
        ] {
            assert!(filter.contains(clause), "got {filter}");
        }

        // With no contact email the `mailto` parameter is left out
        // entirely — a placeholder is never sent.
        let mailto = param(&url, "mailto");
        assert!(mailto.is_none(), "empty contact email must omit mailto");
    }

    /// Two-leg venue filter: the venue *name* is first looked up on
    /// `/sources`, and the id of the single hit (`S99`) must then appear as
    /// a `primary_location.source.id` clause in the `/works` filter.
    #[tokio::test]
    async fn venue_name_resolves_to_source_id_then_filters_works() {
        let server = MockServer::start().await;
        // First leg: /sources?search=... → top hit S99.
        Mock::given(method("GET"))
            .and(path("/sources"))
            .and(query_param("search", "Physical Review B"))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"{ "results": [ { "id": "https://openalex.org/S99", "display_name": "Physical Review B" } ] }"#,
            ))
            .mount(&server)
            .await;
        // Second leg: /works filtered by the resolved source id.
        Mock::given(method("GET"))
            .and(path("/works"))
            .and(query_param(
                "filter",
                "title_and_abstract.search:spin glass,primary_location.source.id:S99",
            ))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"{ "meta": { "count": 1 }, "results": [ { "id": "https://openalex.org/W1", "title": "In PRB" } ] }"#,
            ))
            .mount(&server)
            .await;

        let (_td, ctx) = build_test_context(&server.uri());
        let base = Url::parse(&server.uri()).expect("wiremock URI parses");
        let mut q = PaperSearchQuery::new("spin glass");
        q.venue = Some("Physical Review B".to_string());

        let out = paper_search(&base, "", &q, &ctx)
            .await
            .expect("venue-filtered search ok");
        assert_eq!(out.total_results, Some(1));
        assert_eq!(out.results.len(), 1);
        assert_eq!(out.results[0].openalex_id, "W1");
    }

    /// When `/sources` returns an empty `results` array for the venue name,
    /// the search must fail with `FetchError::NotFound` instead of silently
    /// running without the venue filter.
    #[tokio::test]
    async fn unresolvable_venue_name_is_not_found() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path("/sources"))
            .respond_with(ResponseTemplate::new(200).set_body_string(r#"{ "results": [] }"#))
            .mount(&server)
            .await;

        let (_td, ctx) = build_test_context(&server.uri());
        let base = Url::parse(&server.uri()).expect("wiremock URI parses");
        let mut q = PaperSearchQuery::new("spin glass");
        q.venue = Some("No Such Journal".to_string());

        let err = paper_search(&base, "", &q, &ctx)
            .await
            .expect_err("an unresolvable venue name must error, not silently drop the filter");
        assert!(matches!(err, FetchError::NotFound { .. }), "got {err:?}");
    }

    /// The entity-resolution leg distinguishes "no `results` key" from "no
    /// match": an error envelope from `/authors` must be reported as
    /// `FetchError::SourceSchema`, not as `NotFound`.
    #[tokio::test]
    async fn entity_error_envelope_is_source_schema_not_not_found() {
        let server = MockServer::start().await;
        // A valid JSON object with NO `results` key (an OpenAlex error
        // envelope, e.g. a rate limit) must surface as SourceSchema, NOT a
        // misleading "no author matched" NotFound that drops the filter.
        Mock::given(method("GET"))
            .and(path("/authors"))
            .respond_with(
                ResponseTemplate::new(200).set_body_string(r#"{"error":"rate limit exceeded"}"#),
            )
            .mount(&server)
            .await;

        let (_td, ctx) = build_test_context(&server.uri());
        let base = Url::parse(&server.uri()).expect("wiremock URI parses");
        let mut q = PaperSearchQuery::new("x");
        q.author = Some("Parisi".to_string());

        let err = paper_search(&base, "", &q, &ctx)
            .await
            .expect_err("an entity error envelope must be SourceSchema, not NotFound");
        assert!(
            matches!(err, FetchError::SourceSchema { .. }),
            "got {err:?}"
        );
    }

    /// Disambiguation, exact-match case: of three `/sources` candidates
    /// only `S1` has a display name equal to the requested venue, and the
    /// `/works` mock only matches a filter on `S1` — so a successful search
    /// proves the exact match was the one selected.
    #[tokio::test]
    async fn exact_name_match_resolves_amid_namesakes() {
        let server = MockServer::start().await;
        // Three sources match the search; only one is an exact name match.
        Mock::given(method("GET"))
            .and(path("/sources"))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"{ "results": [
                    { "id": "https://openalex.org/S1", "display_name": "Physical Review B", "works_count": 50000, "relevance_score": 80.0 },
                    { "id": "https://openalex.org/S2", "display_name": "Physical Review B: Condensed Matter", "works_count": 1000, "relevance_score": 78.0 },
                    { "id": "https://openalex.org/S3", "display_name": "Reviews of Physics", "works_count": 200, "relevance_score": 70.0 }
                ] }"#,
            ))
            .mount(&server)
            .await;
        Mock::given(method("GET"))
            .and(path("/works"))
            .and(query_param(
                "filter",
                "title_and_abstract.search:spin glass,primary_location.source.id:S1",
            ))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"{ "meta": { "count": 1 }, "results": [ { "id": "https://openalex.org/W1", "title": "x" } ] }"#,
            ))
            .mount(&server)
            .await;

        let (_td, ctx) = build_test_context(&server.uri());
        let base = Url::parse(&server.uri()).expect("wiremock URI parses");
        let mut q = PaperSearchQuery::new("spin glass");
        q.venue = Some("Physical Review B".to_string());

        let out = paper_search(&base, "", &q, &ctx)
            .await
            .expect("exact venue name must resolve to S1 amid namesakes");
        assert_eq!(out.results[0].openalex_id, "W1");
    }

    /// Disambiguation, dominance case: neither `/authors` candidate is an
    /// exact match for "parisi", but the first one's relevance score
    /// (100.0) is well over twice the runner-up's (20.0), so it is taken
    /// and the `/works` filter is expected to carry its id `A1`.
    #[tokio::test]
    async fn dominant_top_hit_resolves_for_vague_name() {
        let server = MockServer::start().await;
        // No exact match for "parisi", but the top hit dominates (>=2x).
        Mock::given(method("GET"))
            .and(path("/authors"))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"{ "results": [
                    { "id": "https://openalex.org/A1", "display_name": "Giorgio Parisi", "works_count": 400, "relevance_score": 100.0 },
                    { "id": "https://openalex.org/A2", "display_name": "M. Parisi", "works_count": 10, "relevance_score": 20.0 }
                ] }"#,
            ))
            .mount(&server)
            .await;
        Mock::given(method("GET"))
            .and(path("/works"))
            .and(query_param(
                "filter",
                "title_and_abstract.search:replica symmetry breaking,authorships.author.id:A1",
            ))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"{ "meta": { "count": 1 }, "results": [ { "id": "https://openalex.org/W9", "title": "y" } ] }"#,
            ))
            .mount(&server)
            .await;

        let (_td, ctx) = build_test_context(&server.uri());
        let base = Url::parse(&server.uri()).expect("wiremock URI parses");
        let mut q = PaperSearchQuery::new("replica symmetry breaking");
        q.author = Some("parisi".to_string());

        let out = paper_search(&base, "", &q, &ctx)
            .await
            .expect("a dominant top hit must resolve a vague name");
        assert_eq!(out.results[0].openalex_id, "W9");
    }

    /// Disambiguation, ambiguous case: two non-exact `/authors` candidates
    /// with close relevance scores (50.0 vs 45.0) must produce
    /// `FetchError::Ambiguous`, and the hint must name both candidates. No
    /// `/works` mock is mounted for this test.
    #[tokio::test]
    async fn ambiguous_name_errors_with_candidate_listing() {
        let server = MockServer::start().await;
        // Two close, non-exact matches → ambiguous; no /works call.
        Mock::given(method("GET"))
            .and(path("/authors"))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"{ "results": [
                    { "id": "https://openalex.org/A1", "display_name": "John Smith", "works_count": 300, "relevance_score": 50.0 },
                    { "id": "https://openalex.org/A2", "display_name": "Jane Smith", "works_count": 280, "relevance_score": 45.0 }
                ] }"#,
            ))
            .mount(&server)
            .await;

        let (_td, ctx) = build_test_context(&server.uri());
        let base = Url::parse(&server.uri()).expect("wiremock URI parses");
        let mut q = PaperSearchQuery::new("electrons");
        q.author = Some("Smith".to_string());

        let err = paper_search(&base, "", &q, &ctx)
            .await
            .expect_err("a close, non-exact multi-match must be reported as ambiguous");
        match err {
            FetchError::Ambiguous { hint } => {
                assert!(hint.contains("John Smith"), "hint lists candidates: {hint}");
                assert!(hint.contains("Jane Smith"), "hint lists candidates: {hint}");
            }
            other => panic!("expected Ambiguous, got {other:?}"),
        }
    }

    /// Words of an inverted index are emitted in position order regardless
    /// of key order, and `null` / `{}` inputs produce no abstract.
    #[test]
    fn abstract_reconstruction_orders_by_position() {
        // Keys are deliberately out of order; positions run
        // 0=hello, 1=world, 2=hello2, 3=again.
        let shuffled = serde_json::json!({
            "world": [1],
            "hello": [0],
            "again": [3],
            "hello2": [2]
        });
        let rebuilt = reconstruct_abstract(&shuffled);
        assert_eq!(rebuilt.as_deref(), Some("hello world hello2 again"));

        for degenerate in [serde_json::Value::Null, serde_json::json!({})] {
            assert_eq!(reconstruct_abstract(&degenerate), None);
        }
    }

    /// The DOI helper drops the resolver prefix and lower-cases; the
    /// OpenAlex helper keeps only the trailing entity id.
    #[test]
    fn doi_and_openalex_prefixes_are_stripped() {
        let bare_doi = strip_doi_prefix("https://doi.org/10.1234/ABC");
        assert_eq!(bare_doi, "10.1234/abc");

        let bare_id = strip_openalex_prefix("https://openalex.org/W999");
        assert_eq!(bare_id, "W999");
    }

    // ---- resolve_links_for_doi (#281 item 5) -----------------------------

    /// A DOI lookup whose matched work lists a journal location followed by
    /// an arXiv location must return the arXiv id (version suffix kept),
    /// the bare OpenAlex id, the lower-cased DOI, and the title.
    #[tokio::test]
    async fn doi_lookup_extracts_arxiv_preprint() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path("/works"))
            .and(query_param("filter", "doi:10.1103/physrevb.1"))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"{ "meta": { "count": 1 }, "results": [ {
                    "id": "https://openalex.org/W55",
                    "doi": "https://doi.org/10.1103/PhysRevB.1",
                    "title": "Published Version",
                    "locations": [
                        { "landing_page_url": "https://journals.aps.org/prb/abstract/x" },
                        { "pdf_url": "https://arxiv.org/abs/2101.54321v2" }
                    ]
                } ] }"#,
            ))
            .mount(&server)
            .await;

        let (_td, ctx) = build_test_context(&server.uri());
        let base = Url::parse(&server.uri()).expect("wiremock URI parses");
        let links = resolve_links_for_doi(&base, "", "10.1103/physrevb.1", &ctx)
            .await
            .expect("doi lookup ok");
        assert_eq!(links.openalex_id, "W55");
        assert_eq!(links.doi.as_deref(), Some("10.1103/physrevb.1")); // lower-cased
        assert_eq!(links.arxiv.as_deref(), Some("2101.54321v2"));
        assert_eq!(links.title, "Published Version");
    }

    /// A matched work whose only location is a non-arXiv landing page must
    /// still resolve, with `arxiv` left as `None`.
    #[tokio::test]
    async fn doi_lookup_without_arxiv_location_is_none() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path("/works"))
            .respond_with(ResponseTemplate::new(200).set_body_string(
                r#"{ "meta": { "count": 1 }, "results": [ {
                    "id": "https://openalex.org/W7",
                    "doi": "https://doi.org/10.1234/closed",
                    "title": "No Preprint",
                    "locations": [ { "landing_page_url": "https://example.com/x" } ]
                } ] }"#,
            ))
            .mount(&server)
            .await;

        let (_td, ctx) = build_test_context(&server.uri());
        let base = Url::parse(&server.uri()).expect("uri");
        let links = resolve_links_for_doi(&base, "", "10.1234/closed", &ctx)
            .await
            .expect("ok");
        assert_eq!(links.arxiv, None);
        assert_eq!(links.openalex_id, "W7");
    }

    /// An empty `results` array for the DOI filter means no work matched
    /// and must be reported as `FetchError::NotFound`.
    #[tokio::test]
    async fn doi_lookup_unknown_doi_is_not_found() {
        let server = MockServer::start().await;
        Mock::given(method("GET"))
            .and(path("/works"))
            .respond_with(
                ResponseTemplate::new(200)
                    .set_body_string(r#"{ "meta": { "count": 0 }, "results": [] }"#),
            )
            .mount(&server)
            .await;

        let (_td, ctx) = build_test_context(&server.uri());
        let base = Url::parse(&server.uri()).expect("uri");
        let err = resolve_links_for_doi(&base, "", "10.0000/nope", &ctx)
            .await
            .expect_err("an unmatched doi must be NotFound");
        assert!(matches!(err, FetchError::NotFound { .. }), "got {err:?}");
    }

    /// Pins that the lookup URL carries the DOI exactly as given, without
    /// case-folding.
    #[test]
    fn doi_lookup_url_preserves_input_doi_case() {
        // `link` relies on "Doi::parse does not lower-case, OpenAlex is
        // case-insensitive": a real `doiget link 10.1103/PhysRevB.1` has to
        // put the DOI into the filter verbatim (review #287).
        let api_root = Url::parse("https://api.openalex.org").expect("base");
        let lookup = build_doi_lookup_url(&api_root, "", "10.1103/PhysRevB.1").expect("url");
        let filter = param(&lookup, "filter");
        assert_eq!(
            filter.as_deref(),
            Some("doi:10.1103/PhysRevB.1"),
            "the input DOI case must be carried through verbatim"
        );
    }

    /// The lookup URL must carry the decoded `doi:` filter, a `select`
    /// list that includes `locations`, and a page size of one.
    #[test]
    fn doi_lookup_url_carries_filter_and_select() {
        let api_root = Url::parse("https://api.openalex.org").expect("base");
        let lookup = build_doi_lookup_url(&api_root, "", "10.1/x").expect("url");

        let filter = param(&lookup, "filter");
        assert_eq!(
            filter.as_deref(),
            Some("doi:10.1/x"),
            "doi filter must be url-encoded into the query"
        );

        let select = param(&lookup, "select").unwrap_or_default();
        assert!(select.contains("locations"));

        let per_page = param(&lookup, "per-page");
        assert_eq!(per_page.as_deref(), Some("1"));
    }

    // ---- build_search_url branch coverage --------------------------------

    /// Return the decoded value of the first query parameter named `key`
    /// on `u`, or `None` if the URL has no such parameter.
    fn param(u: &Url, key: &str) -> Option<String> {
        for (name, value) in u.query_pairs() {
            if name == key {
                return Some(value.into_owned());
            }
        }
        None
    }

    /// `limit` is clamped into `1..=200` before it becomes `per-page`.
    #[test]
    fn per_page_clamps_to_floor_and_ceiling() {
        let api_root = Url::parse("https://api.openalex.org").expect("base");
        let mut q = PaperSearchQuery::new("x");

        q.limit = 0;
        let below = build_search_url(&api_root, "", &q, &ResolvedIds::default()).expect("url");
        let floor = param(&below, "per-page");
        assert_eq!(floor.as_deref(), Some("1"), "limit 0 -> 1");

        q.limit = 201;
        let above = build_search_url(&api_root, "", &q, &ResolvedIds::default()).expect("url");
        let ceiling = param(&above, "per-page");
        assert_eq!(ceiling.as_deref(), Some("200"), "limit 201 -> 200");
    }

    /// `to_year` becomes an end-of-year `to_publication_date` clause, and
    /// the sort parameter is always relevance-descending.
    #[test]
    fn to_year_filter_and_relevance_only_sort_land_on_url() {
        let api_root = Url::parse("https://api.openalex.org").expect("base");
        let mut q = PaperSearchQuery::new("x");
        q.to_year = Some(2023);
        let url = build_search_url(&api_root, "", &q, &ResolvedIds::default()).expect("url");

        // Relevance is the only sort (#290).
        let sort = param(&url, "sort");
        assert_eq!(sort.as_deref(), Some("relevance_score:desc"));

        let filter = param(&url, "filter").unwrap_or_default();
        assert!(
            filter.contains("to_publication_date:2023-12-31"),
            "to_year must map to to_publication_date:<y>-12-31"
        );
    }

    #[test]
    fn query_is_a_title_and_abstract_filter_not_search_param() {
        // #290: the free-text query travels as a `title_and_abstract.search`
        // filter clause next to the impact filters, never as `search=`.
        let root = Url::parse("https://api.openalex.org").expect("base");
        let mut query = PaperSearchQuery::new("classical shadows");
        query.min_fwci = Some(5.0);
        query.min_percentile = Some(90);
        let url = build_search_url(&root, "", &query, &ResolvedIds::default()).expect("url");

        // The query is a filter clause now; no top-level `search=` param.
        let loose_search = param(&url, "search");
        assert_eq!(loose_search, None, "no loose `search=` param");

        let filter = param(&url, "filter").unwrap_or_default();
        let has_query_clause = filter.contains("title_and_abstract.search:classical shadows");
        assert!(
            has_query_clause,
            "query must be a title_and_abstract.search filter: {filter}"
        );
        let has_fwci_clause = filter.contains("fwci:>5");
        assert!(has_fwci_clause, "min_fwci filter: {filter}");
        let has_percentile_clause = filter.contains("cited_by_percentile_year.min:90");
        assert!(has_percentile_clause, "min_percentile filter: {filter}");
    }

    // ---- select_entity disambiguation boundaries -------------------------

    // Test helper: builds a `Candidate` from borrowed strings plus its
    // works count and relevance score.
    fn cand(id: &str, name: &str, works: u64, rel: f64) -> Candidate {
        let id = id.to_owned();
        let display_name = name.to_owned();
        Candidate {
            id,
            display_name,
            works_count: works,
            relevance: rel,
        }
    }

    #[test]
    fn dominance_at_exactly_2x_resolves_top() {
        // Top relevance is exactly twice the runner-up's: the top id wins.
        let candidates = vec![cand("A1", "x", 1, 2.0), cand("A2", "y", 1, 1.0)];
        let resolved = select_entity("authors", "q", &candidates).expect("resolves");
        assert_eq!(resolved, "A1");
    }

    #[test]
    fn dominance_just_below_2x_is_ambiguous() {
        // 1.9 vs 1.0 falls just short of a 2x lead, so nothing is picked.
        let candidates = vec![cand("A1", "x", 1, 1.9), cand("A2", "y", 1, 1.0)];
        let outcome = select_entity("authors", "q", &candidates);
        assert!(matches!(outcome, Err(FetchError::Ambiguous { .. })));
    }

    #[test]
    fn zero_relevance_runner_up_is_ambiguous_not_auto_top() {
        // A runner-up whose relevance is absent (0.0) must NOT hand the
        // win to the top hit by default: the dominance guard requires
        // second > 0.0.
        let candidates = vec![cand("A1", "x", 1, 5.0), cand("A2", "y", 1, 0.0)];
        let outcome = select_entity("authors", "q", &candidates);
        assert!(matches!(outcome, Err(FetchError::Ambiguous { .. })));
    }

    #[test]
    fn multiple_exact_name_matches_are_ambiguous() {
        // Both entities carry the exact display name being searched for,
        // so the result is ambiguous even though the first one would
        // otherwise dominate on relevance.
        let candidates = vec![cand("S1", "Dup", 9, 5.0), cand("S2", "Dup", 1, 1.0)];
        let outcome = select_entity("sources", "Dup", &candidates);
        assert!(matches!(outcome, Err(FetchError::Ambiguous { .. })));
    }

    // ---- extract_arxiv_from_location edge cases --------------------------

    #[test]
    fn arxiv_extracted_from_pdf_url_when_landing_absent() {
        // Only `pdf_url` is present; the versioned id must still be found.
        let location = serde_json::json!({ "pdf_url": "https://arxiv.org/abs/2302.00001v3" });
        let arxiv_id = extract_arxiv_from_location(&location);
        assert_eq!(arxiv_id.as_deref(), Some("2302.00001v3"));
    }

    #[test]
    fn arxiv_id_stops_at_query_string() {
        // A trailing `?utm=x` must not leak into the extracted id.
        let location =
            serde_json::json!({ "landing_page_url": "https://arxiv.org/abs/2101.12345?utm=x" });
        let arxiv_id = extract_arxiv_from_location(&location);
        assert_eq!(arxiv_id.as_deref(), Some("2101.12345"));
    }

    #[test]
    fn truncate_for_hint_is_char_boundary_safe() {
        // 300 multi-byte chars: must not panic on a byte-slice boundary.
        // The result ends with an ellipsis and keeps 200 of the input chars.
        let input = "あ".repeat(300);
        let hint = truncate_for_hint(input.as_bytes());
        assert!(hint.ends_with('…'));
        let kept = hint.matches('あ').count();
        assert_eq!(kept, 200);
    }

    #[test]
    fn ambiguous_has_its_own_wire_code() {
        // Distinct from NOT_FOUND so agents can branch (ADR-0031 D5).
        let ambiguous = FetchError::Ambiguous { hint: "x".into() };

        let code = crate::ErrorCode::from(&ambiguous);
        assert_eq!(code, crate::ErrorCode::Ambiguous);

        let wire = crate::ErrorCode::Ambiguous.as_wire();
        assert_eq!(wire, "AMBIGUOUS");
    }

    #[test]
    fn validate_rejects_bad_shape_and_accepts_good() {
        // A freshly constructed query validates as-is.
        let mut text_case = PaperSearchQuery::new("topic");
        assert!(text_case.validate().is_ok());

        // A whitespace-only query string is reported as empty.
        text_case.query = String::from("  ");
        let err = text_case.validate().unwrap_err();
        assert!(err.contains("empty"));

        // Limits below the floor or above the ceiling are both rejected.
        let mut limit_case = PaperSearchQuery::new("topic");
        for bad_limit in [0, MAX_PER_PAGE + 1] {
            limit_case.limit = bad_limit;
            let err = limit_case.validate().unwrap_err();
            assert!(err.contains("limit"));
        }

        // An inverted year range is rejected.
        let mut year_case = PaperSearchQuery::new("topic");
        year_case.from_year = Some(2025);
        year_case.to_year = Some(2010);
        let err = year_case.validate().unwrap_err();
        assert!(err.contains("after"));

        // Equal bounds are valid (inclusive range).
        year_case.to_year = Some(2025);
        assert!(year_case.validate().is_ok());
    }

    #[test]
    fn validate_rejects_out_of_range_impact_filters() {
        // #290 / review #318: a negative or non-finite `min_fwci`, or a
        // percentile above 100, would compose a malformed OpenAlex filter
        // clause. `validate()` must reject them at the boundary instead.
        let mut fwci_case = PaperSearchQuery::new("topic");
        for bad_floor in [-1.0, f64::NAN, f64::INFINITY] {
            fwci_case.min_fwci = Some(bad_floor);
            let err = fwci_case.validate().unwrap_err();
            assert!(err.contains("min_fwci"));
        }
        // A valid floor passes.
        fwci_case.min_fwci = Some(2.5);
        assert!(fwci_case.validate().is_ok());

        let mut percentile_case = PaperSearchQuery::new("topic");
        percentile_case.min_percentile = Some(101);
        let err = percentile_case.validate().unwrap_err();
        assert!(err.contains("min_percentile"));

        // Boundary value 100 is valid (top 0%, i.e. the single best cohort
        // rank); 0 is valid (no floor).
        for accepted in [100, 0] {
            percentile_case.min_percentile = Some(accepted);
            assert!(percentile_case.validate().is_ok());
        }
    }
}