use serde::Serialize;
use url::Url;
use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
use crate::source::{FetchContext, FetchError};
/// Key identifying this source; it is handed to the rate limiter, the HTTP client,
/// and recorded as the `source` of each provenance log row.
const SOURCE_KEY: &str = "openalex";
/// Comma-separated field list sent as the `select` query parameter of the
/// `/works` search request built by `build_search_url`.
const SELECT_FIELDS: &str = "id,doi,title,display_name,publication_year,\
cited_by_count,abstract_inverted_index,authorships,primary_location,\
open_access,locations";
/// Largest accepted `limit`: `PaperSearchQuery::validate` rejects anything above it
/// and `build_search_url` clamps `per-page` to it.
pub const MAX_PER_PAGE: usize = 200;
/// The `limit` a query created with `PaperSearchQuery::new` starts with.
pub const DEFAULT_LIMIT: usize = 25;
/// Ordering requested for search results.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SearchSort {
    /// Sort by OpenAlex relevance score, highest first.
    Relevance,
    /// Sort by citation count, highest first.
    Cited,
    /// Sort by publication date, newest first.
    Recent,
}

impl SearchSort {
    /// Returns the string sent as the OpenAlex `sort` query parameter for this ordering.
    #[must_use]
    pub fn as_openalex(self) -> &'static str {
        const BY_RELEVANCE: &str = "relevance_score:desc";
        const BY_CITATIONS: &str = "cited_by_count:desc";
        const BY_DATE: &str = "publication_date:desc";

        match self {
            Self::Relevance => BY_RELEVANCE,
            Self::Cited => BY_CITATIONS,
            Self::Recent => BY_DATE,
        }
    }
}
/// Parameters of one paper search.
#[derive(Debug, Clone)]
pub struct PaperSearchQuery {
    /// Free-text search string; must not be blank (see [`PaperSearchQuery::validate`]).
    pub query: String,
    /// Number of results requested; sent as `per-page`.
    pub limit: usize,
    /// Earliest publication year to include, if any.
    pub from_year: Option<i32>,
    /// Latest publication year to include, if any.
    pub to_year: Option<i32>,
    /// When `true`, an `is_oa:true` filter is added to the request.
    pub oa_only: bool,
    /// Citation-count threshold, if any.
    pub min_citations: Option<u64>,
    /// Author name to resolve to an OpenAlex id and filter on.
    pub author: Option<String>,
    /// Venue name to resolve to an OpenAlex source id and filter on.
    pub venue: Option<String>,
    /// Publisher name to resolve to an OpenAlex id and filter on.
    pub publisher: Option<String>,
    /// Result ordering.
    pub sort: SearchSort,
}

impl PaperSearchQuery {
    /// Creates a query for `query` with no filters, the default limit, and relevance ordering.
    #[must_use]
    pub fn new(query: impl Into<String>) -> Self {
        let text: String = query.into();
        Self {
            query: text,
            limit: DEFAULT_LIMIT,
            sort: SearchSort::Relevance,
            oa_only: false,
            from_year: None,
            to_year: None,
            min_citations: None,
            author: None,
            venue: None,
            publisher: None,
        }
    }

    /// Checks the query's shape before it is sent.
    ///
    /// # Errors
    ///
    /// Returns a human-readable message when the search text is blank, when `limit`
    /// lies outside `1..=MAX_PER_PAGE`, or when `from_year` is later than `to_year`.
    pub fn validate(&self) -> Result<(), String> {
        if self.query.trim().is_empty() {
            return Err(String::from("search query is empty"));
        }

        match self.limit {
            1..=MAX_PER_PAGE => {}
            got => {
                return Err(format!(
                    "limit must be between 1 and {MAX_PER_PAGE} (got {got})"
                ));
            }
        }

        match (self.from_year, self.to_year) {
            (Some(from), Some(to)) if from > to => {
                Err(format!("from_year ({from}) is after to_year ({to})"))
            }
            _ => Ok(()),
        }
    }
}
/// Which discovery backend produced a hit.
///
/// Serialized with lowercase variant names (`rename_all = "lowercase"`), and marked
/// `#[non_exhaustive]` so further variants can be added later.
#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum DiscoverySource {
/// The hit was built from an OpenAlex work (set by `work_to_hit`).
OpenAlex,
}
/// One search result, as mapped from an OpenAlex work by `work_to_hit`.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct PaperHit {
/// DOI with the `doi.org` URL prefix removed and lowercased; `None` when the work has no DOI string.
pub doi: Option<String>,
/// Last path segment of the work's `id` URL; empty when the work carries no `id` string.
pub openalex_id: String,
/// arXiv identifier taken from the first location whose URL contains `arxiv.org/abs/`.
pub arxiv: Option<String>,
/// The work's `title`, falling back to `display_name`, then to the empty string.
pub title: String,
/// Author display names, in the order of the work's `authorships` array.
pub authors: Vec<String>,
/// `publication_year`, when present and representable as `i32`.
pub year: Option<i32>,
/// `primary_location.source.display_name`, when present.
pub venue: Option<String>,
/// Abstract text rebuilt from `abstract_inverted_index`; serialized under the key `abstract`.
#[serde(rename = "abstract")]
pub abstract_: Option<String>,
/// `cited_by_count`, defaulting to 0 when absent.
pub cited_by_count: u64,
/// `open_access.oa_status`, when present.
pub oa_status: Option<String>,
/// Backend that produced this hit.
pub source: DiscoverySource,
}
/// Result of `paper_search`: the mapped hits plus the reported total.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct PaperSearchResults {
/// Hits in the order the response listed them.
pub results: Vec<PaperHit>,
/// `meta.count` from the response, when present and an unsigned integer.
pub total_results: Option<u64>,
}
/// OpenAlex ids that the name filters of a query resolved to; `None` means the
/// corresponding filter is not applied by `build_search_url`.
#[derive(Debug, Default)]
struct ResolvedIds {
// Used for the `authorships.author.id` filter.
author: Option<String>,
// Used for the `primary_location.source.id` filter.
source: Option<String>,
// Used for the `primary_location.source.publisher_lineage` filter.
publisher: Option<String>,
}
/// Runs a paper search against the OpenAlex API rooted at `base`.
///
/// Any author / venue / publisher name on `query` is first resolved to an OpenAlex
/// id (one request each, in that order); the `/works` search is then issued with
/// those ids as filters. `contact_email`, when non-empty, is sent as `mailto`.
///
/// # Errors
///
/// Propagates failures from name resolution, URL construction and the HTTP fetch,
/// and returns `FetchError::SourceSchema` when the response has no `results` array.
pub async fn paper_search(
    base: &Url,
    contact_email: &str,
    query: &PaperSearchQuery,
    ctx: &FetchContext,
) -> Result<PaperSearchResults, FetchError> {
    // Resolve the name filters sequentially; the first failure aborts the search.
    let author = resolve_optional(base, contact_email, "authors", &query.author, ctx).await?;
    let source = resolve_optional(base, contact_email, "sources", &query.venue, ctx).await?;
    let publisher =
        resolve_optional(base, contact_email, "publishers", &query.publisher, ctx).await?;
    let ids = ResolvedIds {
        author,
        source,
        publisher,
    };

    let url = build_search_url(base, contact_email, query, &ids)?;
    let (payload, _bytes) = openalex_get(&url, ctx).await?;

    let works = match payload
        .get("results")
        .and_then(serde_json::Value::as_array)
    {
        Some(works) => works,
        None => return Err(missing_results_array("search", &payload)),
    };

    let total_results = payload
        .get("meta")
        .and_then(|meta| meta.get("count"))
        .and_then(serde_json::Value::as_u64);

    Ok(PaperSearchResults {
        results: works.iter().map(work_to_hit).collect(),
        total_results,
    })
}
/// Fetches `url`, parses the body as JSON, and appends a provenance log row.
///
/// Returns the parsed value together with the body length in bytes. The log row
/// is written only after the body parsed successfully.
///
/// # Errors
///
/// Propagates fetch and log-append failures; a body that is not valid JSON yields
/// `FetchError::SourceSchema`.
async fn openalex_get(
    url: &Url,
    ctx: &FetchContext,
) -> Result<(serde_json::Value, usize), FetchError> {
    // Bound to a named local so whatever `acquire` returns stays alive until the
    // function returns.
    let _permit = ctx.rate_limiter.acquire(SOURCE_KEY).await;
    let (body, _final_url) = ctx.http.fetch_bytes(SOURCE_KEY, url.clone()).await?;
    let byte_len = body.len();

    let parsed = match serde_json::from_slice::<serde_json::Value>(&body) {
        Ok(parsed) => parsed,
        Err(e) => {
            return Err(FetchError::SourceSchema {
                hint: format!("openalex returned non-JSON: {e}"),
            });
        }
    };

    let row = RowInput {
        event: LogEvent::Fetch,
        result: LogResult::Ok,
        capability: Capability::Metadata,
        ref_: None,
        source: Some(SOURCE_KEY),
        error_code: None,
        size_bytes: Some(byte_len as u64),
        license: None,
        store_path: None,
        canonical_digest: None,
    };
    ctx.log.append(row)?;

    Ok((parsed, byte_len))
}
/// Resolves an optional entity name to an OpenAlex id.
///
/// A missing name, or one that is blank after trimming, resolves to `Ok(None)`
/// without issuing a request; any other name is passed (untrimmed) to
/// `resolve_entity_id`.
///
/// # Errors
///
/// Propagates whatever `resolve_entity_id` returns.
async fn resolve_optional(
    base: &Url,
    contact_email: &str,
    entity_path: &str,
    name: &Option<String>,
    ctx: &FetchContext,
) -> Result<Option<String>, FetchError> {
    let requested = name
        .as_deref()
        .filter(|candidate| !candidate.trim().is_empty());

    match requested {
        None => Ok(None),
        Some(wanted) => {
            let id = resolve_entity_id(base, contact_email, entity_path, wanted, ctx).await?;
            Ok(Some(id))
        }
    }
}
/// Looks `name` up under `/{entity_path}` and picks the matching entity id.
///
/// Requests up to five candidates, orders them by descending relevance score
/// (stable, so ties keep response order), and delegates the choice to
/// `select_entity`.
///
/// # Errors
///
/// Returns `FetchError::SourceSchema` when the URL cannot be built or the
/// response has no `results` array, and propagates fetch and selection errors.
async fn resolve_entity_id(
    base: &Url,
    contact_email: &str,
    entity_path: &str,
    name: &str,
    ctx: &FetchContext,
) -> Result<String, FetchError> {
    let endpoint = format!("/{entity_path}");
    let mut url = match base.join(&endpoint) {
        Ok(url) => url,
        Err(e) => {
            return Err(FetchError::SourceSchema {
                hint: format!("openalex {entity_path} URL construction failed: {e}"),
            });
        }
    };
    {
        let mut pairs = url.query_pairs_mut();
        pairs
            .append_pair("search", name)
            .append_pair("per-page", "5");
        if !contact_email.is_empty() {
            pairs.append_pair("mailto", contact_email);
        }
    }

    let (payload, _len) = openalex_get(&url, ctx).await?;
    let entries = match payload
        .get("results")
        .and_then(serde_json::Value::as_array)
    {
        Some(entries) => entries,
        None => return Err(missing_results_array(&endpoint, &payload)),
    };

    let mut ranked: Vec<Candidate> = entries.iter().filter_map(Candidate::from_value).collect();
    // Highest relevance first; incomparable scores are treated as equal.
    ranked.sort_by(|lhs, rhs| {
        lhs.relevance
            .partial_cmp(&rhs.relevance)
            .map_or(std::cmp::Ordering::Equal, std::cmp::Ordering::reverse)
    });

    select_entity(entity_path, name, &ranked)
}
struct Candidate {
id: String,
display_name: String,
works_count: u64,
relevance: f64,
}
impl Candidate {
fn from_value(v: &serde_json::Value) -> Option<Self> {
let id = v
.get("id")
.and_then(serde_json::Value::as_str)
.map(strip_openalex_prefix)?;
Some(Self {
id,
display_name: v
.get("display_name")
.and_then(serde_json::Value::as_str)
.unwrap_or("")
.to_string(),
works_count: v
.get("works_count")
.and_then(serde_json::Value::as_u64)
.unwrap_or(0),
relevance: v
.get("relevance_score")
.and_then(serde_json::Value::as_f64)
.unwrap_or(0.0),
})
}
}
/// How many times more relevant the top candidate must be than the runner-up
/// before it is accepted without an exact name match.
const DOMINANCE_RATIO: f64 = 2.0;

/// Chooses one candidate id for `name`, or explains why none can be chosen.
///
/// Rules, in order:
/// 1. no candidates: `NotFound`;
/// 2. exactly one candidate: that candidate;
/// 3. exactly one candidate whose trimmed display name equals the trimmed `name`
///    (ASCII case-insensitive): that candidate;
/// 4. no exact match, both leading scores positive, and the first candidate's
///    relevance is at least `DOMINANCE_RATIO` times the second's: the first candidate;
/// 5. anything else: `Ambiguous`, with a listing of the candidates.
///
/// `candidates` is expected to arrive ordered by descending relevance.
fn select_entity(
    entity_path: &str,
    name: &str,
    candidates: &[Candidate],
) -> Result<String, FetchError> {
    let label = entity_label(entity_path);
    let ambiguous = || FetchError::Ambiguous {
        hint: format_ambiguous(label, name, candidates),
    };

    let (top, runner_up) = match candidates {
        [] => {
            return Err(FetchError::NotFound {
                hint: format!("no OpenAlex {label} matched '{name}'"),
            });
        }
        [only] => return Ok(only.id.clone()),
        [top, runner_up, ..] => (top, runner_up),
    };

    let wanted = name.trim();
    let mut exact = candidates
        .iter()
        .filter(|c| c.display_name.trim().eq_ignore_ascii_case(wanted));

    match (exact.next(), exact.next()) {
        // A single exact name match wins outright.
        (Some(hit), None) => Ok(hit.id.clone()),
        // Several exact matches cannot be told apart.
        (Some(_), Some(_)) => Err(ambiguous()),
        // No exact match: accept the top hit only if it clearly dominates.
        (None, _) => {
            let dominates = top.relevance > 0.0
                && runner_up.relevance > 0.0
                && top.relevance >= DOMINANCE_RATIO * runner_up.relevance;
            if dominates {
                Ok(top.id.clone())
            } else {
                Err(ambiguous())
            }
        }
    }
}
/// Maps an OpenAlex entity path to the singular noun used in error messages.
///
/// Unknown paths are returned unchanged.
fn entity_label(entity_path: &str) -> &str {
    const SINGULAR: [(&str, &str); 3] = [
        ("authors", "author"),
        ("sources", "venue"),
        ("publishers", "publisher"),
    ];

    SINGULAR
        .iter()
        .find(|&&(plural, _)| plural == entity_path)
        .map_or(entity_path, |&(_, singular)| singular)
}
/// Renders the hint carried by an `Ambiguous` error.
///
/// The first line states the total candidate count; it is followed by one line
/// per candidate (display name, id, works count) for at most the first five.
fn format_ambiguous(label: &str, name: &str, candidates: &[Candidate]) -> String {
    let header = format!(
        "ambiguous {label} '{name}' — {} candidates; narrow the name \
         (add a first name / fuller title) and retry:",
        candidates.len()
    );

    let listing: String = candidates
        .iter()
        .take(5)
        .map(|c| format!("\n {} ({}, {} works)", c.display_name, c.id, c.works_count))
        .collect();

    header + &listing
}
/// Builds the `/works` search URL for `query`.
///
/// Query parameters are appended in the order `search`, `per-page`, `sort`,
/// `select`, then `filter` (only when at least one filter applies) and `mailto`
/// (only when `contact_email` is non-empty). `per-page` is `query.limit`
/// clamped to `1..=MAX_PER_PAGE`. Filter clauses are joined with commas in the
/// order: from-date, to-date, open access, citations, author, source, publisher.
///
/// # Errors
///
/// Returns `FetchError::SourceSchema` when `/works` cannot be joined onto `base`.
fn build_search_url(
    base: &Url,
    contact_email: &str,
    query: &PaperSearchQuery,
    ids: &ResolvedIds,
) -> Result<Url, FetchError> {
    let mut url = match base.join("/works") {
        Ok(url) => url,
        Err(e) => {
            return Err(FetchError::SourceSchema {
                hint: format!("openalex search URL construction failed: {e}"),
            });
        }
    };

    // Each entry is `Some(clause)` when the corresponding filter is active.
    let clauses: Vec<String> = vec![
        query
            .from_year
            .map(|from| format!("from_publication_date:{from}-01-01")),
        query
            .to_year
            .map(|to| format!("to_publication_date:{to}-12-31")),
        query.oa_only.then(|| "is_oa:true".to_string()),
        query
            .min_citations
            .map(|min| format!("cited_by_count:>{min}")),
        ids.author
            .as_ref()
            .map(|author_id| format!("authorships.author.id:{author_id}")),
        ids.source
            .as_ref()
            .map(|source_id| format!("primary_location.source.id:{source_id}")),
        ids.publisher.as_ref().map(|publisher_id| {
            format!("primary_location.source.publisher_lineage:{publisher_id}")
        }),
    ]
    .into_iter()
    .flatten()
    .collect();

    let page_size = query.limit.max(1).min(MAX_PER_PAGE).to_string();
    {
        let mut pairs = url.query_pairs_mut();
        pairs
            .append_pair("search", &query.query)
            .append_pair("per-page", &page_size)
            .append_pair("sort", query.sort.as_openalex())
            .append_pair("select", SELECT_FIELDS);
        if !clauses.is_empty() {
            pairs.append_pair("filter", &clauses.join(","));
        }
        if !contact_email.is_empty() {
            pairs.append_pair("mailto", contact_email);
        }
    }

    Ok(url)
}
/// Maps one OpenAlex work object to a [`PaperHit`].
///
/// Every field is optional in the input: missing strings become `None` (or the
/// empty string for `openalex_id` and `title`), a missing `authorships` array
/// becomes an empty author list, and a missing `cited_by_count` becomes 0.
fn work_to_hit(work: &serde_json::Value) -> PaperHit {
    /// Reads `node[key]` as a string slice, when present and a JSON string.
    fn text<'a>(node: &'a serde_json::Value, key: &str) -> Option<&'a str> {
        node.get(key).and_then(serde_json::Value::as_str)
    }

    // `title` wins; `display_name` is only the fallback.
    let title = text(work, "title")
        .or_else(|| text(work, "display_name"))
        .unwrap_or("")
        .to_string();

    let authors: Vec<String> = match work
        .get("authorships")
        .and_then(serde_json::Value::as_array)
    {
        Some(entries) => entries
            .iter()
            .filter_map(|entry| {
                entry
                    .get("author")
                    .and_then(|author| text(author, "display_name"))
            })
            .map(str::to_string)
            .collect(),
        None => Vec::new(),
    };

    let year = work
        .get("publication_year")
        .and_then(serde_json::Value::as_i64)
        .and_then(|raw| i32::try_from(raw).ok());

    let venue = work
        .get("primary_location")
        .and_then(|location| location.get("source"))
        .and_then(|source| text(source, "display_name"))
        .map(str::to_string);

    let oa_status = work
        .get("open_access")
        .and_then(|oa| text(oa, "oa_status"))
        .map(str::to_string);

    // First location that carries an arXiv abstract URL, if any.
    let arxiv = work
        .get("locations")
        .and_then(serde_json::Value::as_array)
        .and_then(|locations| locations.iter().find_map(extract_arxiv_from_location));

    PaperHit {
        doi: text(work, "doi").map(strip_doi_prefix),
        openalex_id: text(work, "id")
            .map(strip_openalex_prefix)
            .unwrap_or_default(),
        arxiv,
        title,
        authors,
        year,
        venue,
        abstract_: work
            .get("abstract_inverted_index")
            .and_then(reconstruct_abstract),
        cited_by_count: work
            .get("cited_by_count")
            .and_then(serde_json::Value::as_u64)
            .unwrap_or(0),
        oa_status,
        source: DiscoverySource::OpenAlex,
    }
}
/// Rebuilds abstract text from an inverted index of the form `{ word: [positions] }`.
///
/// Words are emitted once per listed position, ordered by position and joined
/// with single spaces. Returns `None` when `inv` is not an object or contributes
/// no usable (word, position) pair. Entries whose value is not an array, and
/// positions that are not unsigned integers, are ignored.
fn reconstruct_abstract(inv: &serde_json::Value) -> Option<String> {
    let index = inv.as_object()?;

    let mut placed: Vec<(u64, &str)> = index
        .iter()
        .flat_map(|(word, positions)| {
            positions
                .as_array()
                .into_iter()
                .flatten()
                .filter_map(move |slot| slot.as_u64().map(|at| (at, word.as_str())))
        })
        .collect();

    if placed.is_empty() {
        return None;
    }

    // Stable sort: words sharing a position keep the index's iteration order.
    placed.sort_by_key(|&(at, _)| at);
    let ordered: Vec<&str> = placed.iter().map(|&(_, word)| word).collect();
    Some(ordered.join(" "))
}
/// Extracts an arXiv identifier from a location object.
///
/// `landing_page_url` is inspected first, then `pdf_url`; the first of them that
/// contains `arxiv.org/abs/` followed by a non-empty identifier wins. The
/// identifier ends at the first `?`, `#` or space.
///
/// Both identifier schemes are recognised:
/// - new style (`2101.12345v2`): the single path segment after `/abs/`;
/// - old style (`hep-th/9901001`): an archive name followed by a number. The
///   slash is part of the identifier here, so both segments are kept. Cutting at
///   the first `/` would return only the archive name.
///
/// Returns `None` when neither URL yields an identifier.
fn extract_arxiv_from_location(loc: &serde_json::Value) -> Option<String> {
    const MARKER: &str = "arxiv.org/abs/";
    let leads_with_digit = |segment: &str| segment.starts_with(|c: char| c.is_ascii_digit());

    for key in ["landing_page_url", "pdf_url"] {
        let tail = match loc
            .get(key)
            .and_then(serde_json::Value::as_str)
            .and_then(|url| url.split_once(MARKER))
        {
            Some((_, tail)) => tail,
            None => continue,
        };

        // Drop any query string, fragment, or trailing junk after a space.
        let path = tail
            .split(|c: char| matches!(c, '?' | '#' | ' '))
            .next()
            .unwrap_or("");

        let mut segments = path.split('/');
        let first = segments.next().unwrap_or("");
        if first.is_empty() {
            // Nothing usable in this URL; try the next key.
            continue;
        }

        return Some(match segments.next() {
            // Old-style id: `<archive>/<number>`; the archive never starts with a digit.
            Some(number) if !leads_with_digit(first) && leads_with_digit(number) => {
                format!("{first}/{number}")
            }
            _ => first.to_string(),
        });
    }

    None
}
/// Returns the part of `id` after its last `/`, or all of `id` when it has none.
fn strip_openalex_prefix(id: &str) -> String {
    let short = match id.rfind('/') {
        Some(slash) => &id[slash + 1..],
        None => id,
    };
    short.to_string()
}
/// Lowercases `doi_url` (ASCII only) and removes a leading `https://doi.org/` or
/// `http://doi.org/`, if present.
fn strip_doi_prefix(doi_url: &str) -> String {
    let lowered = doi_url.to_ascii_lowercase();
    for scheme_prefix in ["https://doi.org/", "http://doi.org/"] {
        if let Some(bare) = lowered.strip_prefix(scheme_prefix) {
            return bare.to_string();
        }
    }
    lowered
}
/// Renders `body` as text for an error hint, keeping at most 200 characters.
///
/// Invalid UTF-8 is replaced lossily. When the text is longer than the limit,
/// the first 200 characters are kept and `…` is appended; the cut always falls
/// on a character boundary.
fn truncate_for_hint(body: &[u8]) -> String {
    const LIMIT: usize = 200;
    let text = String::from_utf8_lossy(body);

    // Byte offset of the first character beyond the limit, if there is one.
    let cut = text.char_indices().nth(LIMIT).map(|(offset, _)| offset);
    match cut {
        None => text.into_owned(),
        Some(offset) => format!("{}…", &text[..offset]),
    }
}
/// Builds the `SourceSchema` error reported when a response lacks a `results` array.
///
/// `context` names the request; the hint also quotes the response, serialized
/// back to JSON and shortened by `truncate_for_hint`.
fn missing_results_array(context: &str, value: &serde_json::Value) -> FetchError {
    let rendered = value.to_string();
    let excerpt = truncate_for_hint(rendered.as_bytes());
    let hint = format!(
        "openalex {context} response missing `results` array — likely an \
         error payload (got: {excerpt})"
    );
    FetchError::SourceSchema { hint }
}
/// Identifiers of one work, as mapped from an OpenAlex work by `work_to_links`.
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct PaperLinks {
/// DOI with the `doi.org` URL prefix removed and lowercased; `None` when the work has no DOI string.
pub doi: Option<String>,
/// arXiv identifier found in `locations`, `primary_location` or `best_oa_location`, in that order.
pub arxiv: Option<String>,
/// Last path segment of the work's `id` URL.
pub openalex_id: String,
/// The work's `title`, falling back to `display_name`, then to the empty string.
pub title: String,
}
/// Looks up the work with the given DOI and returns its identifiers.
///
/// Only the first element of the response's `results` array is used.
///
/// # Errors
///
/// - `FetchError::SourceSchema` when the URL cannot be built, the response has
///   no `results` array, or the matched work has no id;
/// - `FetchError::NotFound` when `results` is empty;
/// - fetch errors are propagated.
pub async fn resolve_links_for_doi(
    base: &Url,
    contact_email: &str,
    doi: &str,
    ctx: &FetchContext,
) -> Result<PaperLinks, FetchError> {
    let url = build_doi_lookup_url(base, contact_email, doi)?;
    let (payload, _bytes) = openalex_get(&url, ctx).await?;

    let works = match payload
        .get("results")
        .and_then(serde_json::Value::as_array)
    {
        Some(works) => works,
        None => return Err(missing_results_array("doi-lookup", &payload)),
    };

    let first = match works.first() {
        Some(first) => first,
        None => {
            return Err(FetchError::NotFound {
                hint: format!("no OpenAlex work matched doi '{doi}'"),
            });
        }
    };

    let links = work_to_links(first);
    if links.openalex_id.is_empty() {
        Err(FetchError::SourceSchema {
            hint: format!("openalex work for doi '{doi}' has no id"),
        })
    } else {
        Ok(links)
    }
}
/// Builds the `/works` URL that looks a single DOI up.
///
/// The DOI is placed verbatim (case preserved) into a `doi:` filter; one result
/// is requested, limited to the fields `work_to_links` reads. `mailto` is added
/// only when `contact_email` is non-empty.
///
/// # Errors
///
/// Returns `FetchError::SourceSchema` when `/works` cannot be joined onto `base`.
fn build_doi_lookup_url(base: &Url, contact_email: &str, doi: &str) -> Result<Url, FetchError> {
    let mut url = match base.join("/works") {
        Ok(url) => url,
        Err(e) => {
            return Err(FetchError::SourceSchema {
                hint: format!("openalex doi-lookup URL construction failed: {e}"),
            });
        }
    };

    let doi_filter = format!("doi:{doi}");
    {
        let mut pairs = url.query_pairs_mut();
        pairs
            .append_pair("filter", &doi_filter)
            .append_pair("per-page", "1")
            .append_pair(
                "select",
                "id,doi,title,display_name,locations,primary_location,best_oa_location",
            );
        if !contact_email.is_empty() {
            pairs.append_pair("mailto", contact_email);
        }
    }

    Ok(url)
}
/// Maps one OpenAlex work object to a [`PaperLinks`].
///
/// The arXiv id is searched for in the `locations` array first, then in
/// `primary_location`, then in `best_oa_location`.
fn work_to_links(work: &serde_json::Value) -> PaperLinks {
    /// Reads `node[key]` as a string slice, when present and a JSON string.
    fn text<'a>(node: &'a serde_json::Value, key: &str) -> Option<&'a str> {
        node.get(key).and_then(serde_json::Value::as_str)
    }

    let from_locations = work
        .get("locations")
        .and_then(serde_json::Value::as_array)
        .and_then(|locations| locations.iter().find_map(extract_arxiv_from_location));

    // Fall back to the single-location fields only when the array gave nothing.
    let arxiv = from_locations.or_else(|| {
        ["primary_location", "best_oa_location"]
            .iter()
            .find_map(|key| work.get(*key).and_then(extract_arxiv_from_location))
    });

    // `title` wins; `display_name` is only the fallback.
    let title = text(work, "title")
        .or_else(|| text(work, "display_name"))
        .unwrap_or("")
        .to_string();

    PaperLinks {
        doi: text(work, "doi").map(strip_doi_prefix),
        arxiv,
        openalex_id: text(work, "id")
            .map(strip_openalex_prefix)
            .unwrap_or_default(),
        title,
    }
}
// Tests for the OpenAlex search and DOI-lookup helpers. The async tests run
// against a local wiremock server; the plain `#[test]`s exercise the pure helpers.
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
use super::*;
use std::sync::Arc;
use camino::Utf8PathBuf;
use tempfile::TempDir;
use wiremock::matchers::{method, path, query_param};
use wiremock::{Mock, MockServer, ResponseTemplate};
use crate::http::HttpClient;
use crate::provenance::ProvenanceLog;
use crate::rate_limiter::RateLimiter;
use crate::RateLimits;
// Two-work search response: the first work has every mapped field populated,
// the second omits or nulls most of them.
const SAMPLE_SEARCH: &str = r#"{
"meta": { "count": 4012, "per_page": 25 },
"results": [
{
"id": "https://openalex.org/W123",
"doi": "https://doi.org/10.1234/Example",
"title": "Tropical Tensor Networks",
"display_name": "Tropical Tensor Networks",
"publication_year": 2021,
"cited_by_count": 42,
"abstract_inverted_index": { "Tropical": [0], "tensor": [1], "networks": [2] },
"authorships": [
{ "author": { "display_name": "Ada Lovelace" } },
{ "author": { "display_name": "Alan Turing" } }
],
"primary_location": { "source": { "display_name": "Phys. Rev. B" } },
"open_access": { "oa_status": "green", "is_oa": true },
"locations": [
{ "landing_page_url": "https://arxiv.org/abs/2101.12345v2" }
]
},
{
"id": "https://openalex.org/W456",
"doi": null,
"title": "Second Paper",
"publication_year": 2019,
"cited_by_count": 7,
"abstract_inverted_index": null,
"authorships": [],
"open_access": { "oa_status": "closed" }
}
]
}"#;
// Builds a `FetchContext` whose HTTP client targets `wiremock_host` and whose
// provenance log lives in a fresh temp dir. The `TempDir` is returned so the
// caller keeps the directory alive for the duration of the test.
fn build_test_context(wiremock_host: &str) -> (TempDir, FetchContext) {
let td = TempDir::new().expect("tempdir");
let log_dir =
Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
let log_path = log_dir.join("test.jsonl");
let http = Arc::new(HttpClient::new_for_tests_allow_http(
"openalex",
wiremock_host,
));
let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
let session_id = "01J0000000000000000000TEST".to_string();
let log = Arc::new(
ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
);
let ctx = FetchContext {
http,
rate_limiter,
log,
session_id,
cache_root: None,
};
(td, ctx)
}
// Every field of both sample works is mapped as expected, including the
// lowercased DOI, the rebuilt abstract, and the arXiv id.
#[tokio::test]
async fn search_maps_works_to_hits() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/works"))
.and(query_param("search", "tropical tensor networks"))
.and(query_param("mailto", "doiget@localhost"))
.respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_SEARCH))
.mount(&server)
.await;
let (_td, ctx) = build_test_context(&server.uri());
let base = Url::parse(&server.uri()).expect("wiremock URI parses");
let q = PaperSearchQuery::new("tropical tensor networks");
let out = paper_search(&base, "doiget@localhost", &q, &ctx)
.await
.expect("search ok");
assert_eq!(out.total_results, Some(4012));
assert_eq!(out.results.len(), 2);
let first = &out.results[0];
assert_eq!(first.openalex_id, "W123");
assert_eq!(first.doi.as_deref(), Some("10.1234/example")); assert_eq!(first.title, "Tropical Tensor Networks");
assert_eq!(first.year, Some(2021));
assert_eq!(first.cited_by_count, 42);
assert_eq!(first.abstract_.as_deref(), Some("Tropical tensor networks"));
assert_eq!(first.authors, vec!["Ada Lovelace", "Alan Turing"]);
assert_eq!(first.venue.as_deref(), Some("Phys. Rev. B"));
assert_eq!(first.oa_status.as_deref(), Some("green"));
assert_eq!(first.arxiv.as_deref(), Some("2101.12345v2"));
assert_eq!(first.source, DiscoverySource::OpenAlex);
let second = &out.results[1];
assert_eq!(second.openalex_id, "W456");
assert_eq!(second.doi, None);
assert_eq!(second.abstract_, None);
assert_eq!(second.venue, None);
assert!(second.authors.is_empty());
assert_eq!(second.oa_status.as_deref(), Some("closed"));
assert_eq!(second.arxiv, None);
}
// The mock only matches when sort, the combined filter string, and per-page
// all appear on the request exactly as expected.
#[tokio::test]
async fn search_filters_and_sort_land_on_the_url() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/works"))
.and(query_param("sort", "cited_by_count:desc"))
.and(query_param(
"filter",
"from_publication_date:2020-01-01,is_oa:true,cited_by_count:>10",
))
.and(query_param("per-page", "5"))
.respond_with(
ResponseTemplate::new(200)
.set_body_string(r#"{ "meta": { "count": 0 }, "results": [] }"#),
)
.mount(&server)
.await;
let (_td, ctx) = build_test_context(&server.uri());
let base = Url::parse(&server.uri()).expect("wiremock URI parses");
let q = PaperSearchQuery {
query: "spin glass".to_string(),
limit: 5,
from_year: Some(2020),
to_year: None,
oa_only: true,
min_citations: Some(10),
author: None,
venue: None,
publisher: None,
sort: SearchSort::Cited,
};
let out = paper_search(&base, "doiget@localhost", &q, &ctx)
.await
.expect("search ok");
assert_eq!(out.total_results, Some(0));
assert!(out.results.is_empty());
}
// A 200 response carrying an error object instead of `results` is reported
// as SourceSchema.
#[tokio::test]
async fn search_error_payload_is_source_schema() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/works"))
.respond_with(
ResponseTemplate::new(200)
.set_body_string(r#"{"error":"Invalid query parameters"}"#),
)
.mount(&server)
.await;
let (_td, ctx) = build_test_context(&server.uri());
let base = Url::parse(&server.uri()).expect("wiremock URI parses");
let q = PaperSearchQuery::new("anything");
let err = paper_search(&base, "", &q, &ctx)
.await
.expect_err("missing `results` must surface as SourceSchema");
assert!(matches!(err, FetchError::SourceSchema { .. }));
}
// Resolved author/source/publisher ids each contribute their filter clause,
// and an empty contact email leaves `mailto` off the URL.
#[test]
fn name_filters_compose_into_resolved_ids() {
let base = Url::parse("https://api.openalex.org").expect("base parses");
let q = PaperSearchQuery::new("topic");
let ids = ResolvedIds {
author: Some("A1".to_string()),
source: Some("S2".to_string()),
publisher: Some("P3".to_string()),
};
let url = build_search_url(&base, "", &q, &ids).expect("url builds");
let filter = url
.query_pairs()
.find(|(k, _)| k == "filter")
.map(|(_, v)| v.into_owned())
.expect("filter param present");
assert!(filter.contains("authorships.author.id:A1"), "got {filter}");
assert!(
filter.contains("primary_location.source.id:S2"),
"got {filter}"
);
assert!(
filter.contains("primary_location.source.publisher_lineage:P3"),
"got {filter}"
);
assert!(
param(&url, "mailto").is_none(),
"empty contact email must omit mailto"
);
}
// A venue name is first looked up under /sources; the resulting id is then
// used as the source filter of the /works request.
#[tokio::test]
async fn venue_name_resolves_to_source_id_then_filters_works() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/sources"))
.and(query_param("search", "Physical Review B"))
.respond_with(ResponseTemplate::new(200).set_body_string(
r#"{ "results": [ { "id": "https://openalex.org/S99", "display_name": "Physical Review B" } ] }"#,
))
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/works"))
.and(query_param("filter", "primary_location.source.id:S99"))
.respond_with(ResponseTemplate::new(200).set_body_string(
r#"{ "meta": { "count": 1 }, "results": [ { "id": "https://openalex.org/W1", "title": "In PRB" } ] }"#,
))
.mount(&server)
.await;
let (_td, ctx) = build_test_context(&server.uri());
let base = Url::parse(&server.uri()).expect("wiremock URI parses");
let mut q = PaperSearchQuery::new("spin glass");
q.venue = Some("Physical Review B".to_string());
let out = paper_search(&base, "", &q, &ctx)
.await
.expect("venue-filtered search ok");
assert_eq!(out.total_results, Some(1));
assert_eq!(out.results.len(), 1);
assert_eq!(out.results[0].openalex_id, "W1");
}
// An empty /sources result makes the whole search fail with NotFound rather
// than running without the venue filter.
#[tokio::test]
async fn unresolvable_venue_name_is_not_found() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/sources"))
.respond_with(ResponseTemplate::new(200).set_body_string(r#"{ "results": [] }"#))
.mount(&server)
.await;
let (_td, ctx) = build_test_context(&server.uri());
let base = Url::parse(&server.uri()).expect("wiremock URI parses");
let mut q = PaperSearchQuery::new("spin glass");
q.venue = Some("No Such Journal".to_string());
let err = paper_search(&base, "", &q, &ctx)
.await
.expect_err("an unresolvable venue name must error, not silently drop the filter");
assert!(matches!(err, FetchError::NotFound { .. }), "got {err:?}");
}
// An error object returned by an entity lookup is a schema problem, not an
// empty result set.
#[tokio::test]
async fn entity_error_envelope_is_source_schema_not_not_found() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/authors"))
.respond_with(
ResponseTemplate::new(200).set_body_string(r#"{"error":"rate limit exceeded"}"#),
)
.mount(&server)
.await;
let (_td, ctx) = build_test_context(&server.uri());
let base = Url::parse(&server.uri()).expect("wiremock URI parses");
let mut q = PaperSearchQuery::new("x");
q.author = Some("Parisi".to_string());
let err = paper_search(&base, "", &q, &ctx)
.await
.expect_err("an entity error envelope must be SourceSchema, not NotFound");
assert!(
matches!(err, FetchError::SourceSchema { .. }),
"got {err:?}"
);
}
// With several similarly scored candidates, the one whose display name
// equals the requested name is chosen (S1 here).
#[tokio::test]
async fn exact_name_match_resolves_amid_namesakes() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/sources"))
.respond_with(ResponseTemplate::new(200).set_body_string(
r#"{ "results": [
{ "id": "https://openalex.org/S1", "display_name": "Physical Review B", "works_count": 50000, "relevance_score": 80.0 },
{ "id": "https://openalex.org/S2", "display_name": "Physical Review B: Condensed Matter", "works_count": 1000, "relevance_score": 78.0 },
{ "id": "https://openalex.org/S3", "display_name": "Reviews of Physics", "works_count": 200, "relevance_score": 70.0 }
] }"#,
))
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/works"))
.and(query_param("filter", "primary_location.source.id:S1"))
.respond_with(ResponseTemplate::new(200).set_body_string(
r#"{ "meta": { "count": 1 }, "results": [ { "id": "https://openalex.org/W1", "title": "x" } ] }"#,
))
.mount(&server)
.await;
let (_td, ctx) = build_test_context(&server.uri());
let base = Url::parse(&server.uri()).expect("wiremock URI parses");
let mut q = PaperSearchQuery::new("spin glass");
q.venue = Some("Physical Review B".to_string());
let out = paper_search(&base, "", &q, &ctx)
.await
.expect("exact venue name must resolve to S1 amid namesakes");
assert_eq!(out.results[0].openalex_id, "W1");
}
// No display name equals "parisi", but the top score (100) is at least twice
// the runner-up's (20), so the top candidate A1 is used.
#[tokio::test]
async fn dominant_top_hit_resolves_for_vague_name() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/authors"))
.respond_with(ResponseTemplate::new(200).set_body_string(
r#"{ "results": [
{ "id": "https://openalex.org/A1", "display_name": "Giorgio Parisi", "works_count": 400, "relevance_score": 100.0 },
{ "id": "https://openalex.org/A2", "display_name": "M. Parisi", "works_count": 10, "relevance_score": 20.0 }
] }"#,
))
.mount(&server)
.await;
Mock::given(method("GET"))
.and(path("/works"))
.and(query_param("filter", "authorships.author.id:A1"))
.respond_with(ResponseTemplate::new(200).set_body_string(
r#"{ "meta": { "count": 1 }, "results": [ { "id": "https://openalex.org/W9", "title": "y" } ] }"#,
))
.mount(&server)
.await;
let (_td, ctx) = build_test_context(&server.uri());
let base = Url::parse(&server.uri()).expect("wiremock URI parses");
let mut q = PaperSearchQuery::new("replica symmetry breaking");
q.author = Some("parisi".to_string());
let out = paper_search(&base, "", &q, &ctx)
.await
.expect("a dominant top hit must resolve a vague name");
assert_eq!(out.results[0].openalex_id, "W9");
}
// Two close, non-exact candidates produce an Ambiguous error whose hint
// names both of them. No /works mock is mounted, so the search must stop
// before that request.
#[tokio::test]
async fn ambiguous_name_errors_with_candidate_listing() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/authors"))
.respond_with(ResponseTemplate::new(200).set_body_string(
r#"{ "results": [
{ "id": "https://openalex.org/A1", "display_name": "John Smith", "works_count": 300, "relevance_score": 50.0 },
{ "id": "https://openalex.org/A2", "display_name": "Jane Smith", "works_count": 280, "relevance_score": 45.0 }
] }"#,
))
.mount(&server)
.await;
let (_td, ctx) = build_test_context(&server.uri());
let base = Url::parse(&server.uri()).expect("wiremock URI parses");
let mut q = PaperSearchQuery::new("electrons");
q.author = Some("Smith".to_string());
let err = paper_search(&base, "", &q, &ctx)
.await
.expect_err("a close, non-exact multi-match must be reported as ambiguous");
match err {
FetchError::Ambiguous { hint } => {
assert!(hint.contains("John Smith"), "hint lists candidates: {hint}");
assert!(hint.contains("Jane Smith"), "hint lists candidates: {hint}");
}
other => panic!("expected Ambiguous, got {other:?}"),
}
}
// Words come out in position order regardless of key order; null and empty
// indexes give None.
#[test]
fn abstract_reconstruction_orders_by_position() {
let inv = serde_json::json!({
"world": [1],
"hello": [0],
"again": [3],
"hello2": [2]
});
assert_eq!(
reconstruct_abstract(&inv).as_deref(),
Some("hello world hello2 again")
);
assert_eq!(reconstruct_abstract(&serde_json::Value::Null), None);
assert_eq!(reconstruct_abstract(&serde_json::json!({})), None);
}
// The DOI helper drops the URL prefix and lowercases; the id helper keeps
// only the last path segment.
#[test]
fn doi_and_openalex_prefixes_are_stripped() {
assert_eq!(
strip_doi_prefix("https://doi.org/10.1234/ABC"),
"10.1234/abc"
);
assert_eq!(strip_openalex_prefix("https://openalex.org/W999"), "W999");
}
// The arXiv id is found in the second location's `pdf_url` even though the
// first location is a publisher page.
#[tokio::test]
async fn doi_lookup_extracts_arxiv_preprint() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/works"))
.and(query_param("filter", "doi:10.1103/physrevb.1"))
.respond_with(ResponseTemplate::new(200).set_body_string(
r#"{ "meta": { "count": 1 }, "results": [ {
"id": "https://openalex.org/W55",
"doi": "https://doi.org/10.1103/PhysRevB.1",
"title": "Published Version",
"locations": [
{ "landing_page_url": "https://journals.aps.org/prb/abstract/x" },
{ "pdf_url": "https://arxiv.org/abs/2101.54321v2" }
]
} ] }"#,
))
.mount(&server)
.await;
let (_td, ctx) = build_test_context(&server.uri());
let base = Url::parse(&server.uri()).expect("wiremock URI parses");
let links = resolve_links_for_doi(&base, "", "10.1103/physrevb.1", &ctx)
.await
.expect("doi lookup ok");
assert_eq!(links.openalex_id, "W55");
assert_eq!(links.doi.as_deref(), Some("10.1103/physrevb.1")); assert_eq!(links.arxiv.as_deref(), Some("2101.54321v2"));
assert_eq!(links.title, "Published Version");
}
// A work with no arXiv URL in any location still resolves, with `arxiv` unset.
#[tokio::test]
async fn doi_lookup_without_arxiv_location_is_none() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/works"))
.respond_with(ResponseTemplate::new(200).set_body_string(
r#"{ "meta": { "count": 1 }, "results": [ {
"id": "https://openalex.org/W7",
"doi": "https://doi.org/10.1234/closed",
"title": "No Preprint",
"locations": [ { "landing_page_url": "https://example.com/x" } ]
} ] }"#,
))
.mount(&server)
.await;
let (_td, ctx) = build_test_context(&server.uri());
let base = Url::parse(&server.uri()).expect("uri");
let links = resolve_links_for_doi(&base, "", "10.1234/closed", &ctx)
.await
.expect("ok");
assert_eq!(links.arxiv, None);
assert_eq!(links.openalex_id, "W7");
}
// An empty `results` array for a DOI lookup is NotFound.
#[tokio::test]
async fn doi_lookup_unknown_doi_is_not_found() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path("/works"))
.respond_with(
ResponseTemplate::new(200)
.set_body_string(r#"{ "meta": { "count": 0 }, "results": [] }"#),
)
.mount(&server)
.await;
let (_td, ctx) = build_test_context(&server.uri());
let base = Url::parse(&server.uri()).expect("uri");
let err = resolve_links_for_doi(&base, "", "10.0000/nope", &ctx)
.await
.expect_err("an unmatched doi must be NotFound");
assert!(matches!(err, FetchError::NotFound { .. }), "got {err:?}");
}
// The lookup URL does not lowercase or otherwise rewrite the DOI it is given.
#[test]
fn doi_lookup_url_preserves_input_doi_case() {
let base = Url::parse("https://api.openalex.org").expect("base");
let u = build_doi_lookup_url(&base, "", "10.1103/PhysRevB.1").expect("url");
assert_eq!(
param(&u, "filter").as_deref(),
Some("doi:10.1103/PhysRevB.1"),
"the input DOI case must be carried through verbatim"
);
}
// The lookup URL carries the doi filter, a select list including
// `locations`, and per-page=1.
#[test]
fn doi_lookup_url_carries_filter_and_select() {
let base = Url::parse("https://api.openalex.org").expect("base");
let u = build_doi_lookup_url(&base, "", "10.1/x").expect("url");
assert_eq!(
param(&u, "filter").as_deref(),
Some("doi:10.1/x"),
"doi filter must be url-encoded into the query"
);
assert!(param(&u, "select")
.unwrap_or_default()
.contains("locations"));
assert_eq!(param(&u, "per-page").as_deref(), Some("1"));
}
// Returns the decoded value of the first query parameter named `key`, if any.
fn param(u: &Url, key: &str) -> Option<String> {
u.query_pairs()
.find(|(k, _)| k == key)
.map(|(_, v)| v.into_owned())
}
// `build_search_url` clamps out-of-range limits instead of rejecting them.
#[test]
fn per_page_clamps_to_floor_and_ceiling() {
let base = Url::parse("https://api.openalex.org").expect("base");
let mut q = PaperSearchQuery::new("x");
q.limit = 0;
let u = build_search_url(&base, "", &q, &ResolvedIds::default()).expect("url");
assert_eq!(param(&u, "per-page").as_deref(), Some("1"), "limit 0 -> 1");
q.limit = 201;
let u = build_search_url(&base, "", &q, &ResolvedIds::default()).expect("url");
assert_eq!(
param(&u, "per-page").as_deref(),
Some("200"),
"limit 201 -> 200"
);
}
// `to_year` becomes a Dec-31 upper date bound and `Recent` becomes the
// publication-date sort key.
#[test]
fn to_year_and_recent_sort_land_on_url() {
let base = Url::parse("https://api.openalex.org").expect("base");
let mut q = PaperSearchQuery::new("x");
q.to_year = Some(2023);
q.sort = SearchSort::Recent;
let u = build_search_url(&base, "", &q, &ResolvedIds::default()).expect("url");
assert_eq!(param(&u, "sort").as_deref(), Some("publication_date:desc"));
assert!(
param(&u, "filter")
.unwrap_or_default()
.contains("to_publication_date:2023-12-31"),
"to_year must map to to_publication_date:<y>-12-31"
);
}
// Shorthand constructor for a `Candidate` used by the selection tests below.
fn cand(id: &str, name: &str, works: u64, rel: f64) -> Candidate {
Candidate {
id: id.to_string(),
display_name: name.to_string(),
works_count: works,
relevance: rel,
}
}
// The dominance threshold is inclusive: exactly 2x is enough.
#[test]
fn dominance_at_exactly_2x_resolves_top() {
let c = vec![cand("A1", "x", 1, 2.0), cand("A2", "y", 1, 1.0)];
assert_eq!(select_entity("authors", "q", &c).expect("resolves"), "A1");
}
// 1.9x falls short of the threshold, so the lookup stays ambiguous.
#[test]
fn dominance_just_below_2x_is_ambiguous() {
let c = vec![cand("A1", "x", 1, 1.9), cand("A2", "y", 1, 1.0)];
assert!(matches!(
select_entity("authors", "q", &c),
Err(FetchError::Ambiguous { .. })
));
}
// A runner-up with relevance 0.0 disables the dominance shortcut.
#[test]
fn zero_relevance_runner_up_is_ambiguous_not_auto_top() {
let c = vec![cand("A1", "x", 1, 5.0), cand("A2", "y", 1, 0.0)];
assert!(matches!(
select_entity("authors", "q", &c),
Err(FetchError::Ambiguous { .. })
));
}
// Two candidates with the exact requested name are ambiguous even though
// the first one's score dominates.
#[test]
fn multiple_exact_name_matches_are_ambiguous() {
let c = vec![cand("S1", "Dup", 9, 5.0), cand("S2", "Dup", 1, 1.0)];
assert!(matches!(
select_entity("sources", "Dup", &c),
Err(FetchError::Ambiguous { .. })
));
}
// `pdf_url` is consulted when `landing_page_url` is missing.
#[test]
fn arxiv_extracted_from_pdf_url_when_landing_absent() {
let loc = serde_json::json!({ "pdf_url": "https://arxiv.org/abs/2302.00001v3" });
assert_eq!(
extract_arxiv_from_location(&loc).as_deref(),
Some("2302.00001v3")
);
}
// A query string after the id is not part of the id.
#[test]
fn arxiv_id_stops_at_query_string() {
let loc =
serde_json::json!({ "landing_page_url": "https://arxiv.org/abs/2101.12345?utm=x" });
assert_eq!(
extract_arxiv_from_location(&loc).as_deref(),
Some("2101.12345")
);
}
// Truncation counts characters, not bytes: 300 three-byte characters are cut
// to 200 plus the ellipsis.
#[test]
fn truncate_for_hint_is_char_boundary_safe() {
let body = "あ".repeat(300);
let out = truncate_for_hint(body.as_bytes());
assert!(out.ends_with('…'));
assert_eq!(out.chars().filter(|&c| c == 'あ').count(), 200);
}
// `FetchError::Ambiguous` converts to its own error code, with wire form "AMBIGUOUS".
#[test]
fn ambiguous_has_its_own_wire_code() {
let e = FetchError::Ambiguous { hint: "x".into() };
assert_eq!(crate::ErrorCode::from(&e), crate::ErrorCode::Ambiguous);
assert_eq!(crate::ErrorCode::Ambiguous.as_wire(), "AMBIGUOUS");
}
// `validate` rejects a blank query, out-of-range limits, and an inverted
// year range, and accepts the default query and an equal from/to year.
#[test]
fn validate_rejects_bad_shape_and_accepts_good() {
let mut q = PaperSearchQuery::new("topic");
assert!(q.validate().is_ok());
q.query = " ".to_string();
assert!(q.validate().unwrap_err().contains("empty"));
let mut q = PaperSearchQuery::new("topic");
q.limit = 0;
assert!(q.validate().unwrap_err().contains("limit"));
q.limit = MAX_PER_PAGE + 1;
assert!(q.validate().unwrap_err().contains("limit"));
let mut q = PaperSearchQuery::new("topic");
q.from_year = Some(2025);
q.to_year = Some(2010);
assert!(q.validate().unwrap_err().contains("after"));
q.to_year = Some(2025);
assert!(q.validate().is_ok());
}
}