#![warn(missing_docs)]
#![forbid(unsafe_code)]
use serde::{Deserialize, Serialize};
use sha2::Digest;
pub mod canonical;
pub mod dry_run;
pub mod http;
pub mod orchestrator;
pub mod provenance;
pub mod rate_limiter;
pub mod refs;
pub mod source;
pub mod sources;
pub mod store;
pub mod user_extension;
#[cfg(feature = "citation")]
pub mod citation_graph;
pub use crate::canonical::{CanonicalRef, SourceType};
/// Version of this crate, read from `CARGO_PKG_VERSION` at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
/// Schema version string; the unit tests in this file pin it to `"1.0"`.
// NOTE(review): which on-disk or wire schema this versions is not visible in this file — confirm.
pub const SCHEMA_VERSION: &str = "1.0";
/// Concurrency cap; supplies `RateLimits::HARD_CODED.max_concurrent_fetches`.
pub const MAX_CONCURRENT_FETCHES: u32 = 5;
/// Request-rate cap; supplies `RateLimits::HARD_CODED.max_fetches_per_second`.
pub const MAX_FETCHES_PER_SECOND: f32 = 5.0;
/// Batch size cap for the MCP interface.
// NOTE(review): presumably the maximum number of refs per MCP batch; enforcement is outside this file — verify.
pub const MCP_BATCH_MAX_SIZE: usize = 100;
/// Alias of [`MCP_BATCH_MAX_SIZE`]; the two are always equal.
pub const MAX_BATCH_REFS: usize = MCP_BATCH_MAX_SIZE;
/// Queue depth cap for the MCP interface.
// NOTE(review): presumably the maximum number of queued MCP requests; enforcement is outside this file — verify.
pub const MCP_QUEUE_DEPTH_MAX: usize = 100;
/// Shutdown delay, in seconds, associated with stdin EOF on the MCP interface.
// NOTE(review): exact shutdown semantics live outside this file — confirm against the MCP server loop.
pub const MCP_STDIN_EOF_SHUTDOWN_SEC: u64 = 5;
/// Maximum accepted DOI suffix length in bytes; longer suffixes are rejected
/// with [`RefParseError::DoiSuffixTooLong`].
pub const DOI_SUFFIX_MAX_LEN: usize = 256;
/// Size cap, in bytes, for PDFs.
// NOTE(review): presumably applied to downloaded PDF bodies; the check itself is outside this file — verify.
pub const PDF_MAX_BYTES: u64 = 100_000_000;
/// Time-to-live, in days, for the resolver cache.
// NOTE(review): the cache implementation is outside this file — confirm how the TTL is applied.
pub const RESOLVER_CACHE_TTL_DAYS: u32 = 7;
/// Time-to-live, in days, for the citation cache.
// NOTE(review): the cache implementation is outside this file — confirm how the TTL is applied.
pub const CITATION_CACHE_TTL_DAYS: u32 = 30;
/// A reference to a paper: either a DOI or an arXiv identifier.
///
/// Serde represents it as an adjacently tagged value with a lowercase `kind`
/// tag and the identifier under `id`, e.g.
/// `{"kind":"doi","id":"10.1234/example"}`.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase", tag = "kind", content = "id")]
pub enum Ref {
/// A DOI reference.
Doi(Doi),
/// An arXiv reference.
Arxiv(ArxivId),
}
/// A DOI held as a string; [`Doi::parse`] is the validating constructor and
/// stores the text without any `doi:` scheme prefix.
///
/// Serde treats the type as a bare string (`transparent`).
// NOTE(review): the derived `Deserialize` builds the value directly from the
// string and does not go through `Doi::parse`, so deserialized values are not
// validated by this type — confirm that is intended.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Doi(pub(crate) String);
/// An arXiv identifier held as a string; [`ArxivId::parse`] is the validating
/// constructor and stores the text without any `arxiv:` scheme prefix.
///
/// Serde treats the type as a bare string (`transparent`).
// NOTE(review): the derived `Deserialize` builds the value directly from the
// string and does not go through `ArxivId::parse`, so deserialized values are
// not validated by this type — confirm that is intended.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct ArxivId(pub(crate) String);
impl Doi {
pub fn as_str(&self) -> &str {
&self.0
}
pub fn parse(s: &str) -> Result<Self, RefParseError> {
let stripped = parse::strip_doi_scheme(s);
parse::validate_doi(stripped)?;
Ok(Doi(stripped.to_string()))
}
}
impl ArxivId {
pub fn as_str(&self) -> &str {
&self.0
}
pub fn parse(s: &str) -> Result<Self, RefParseError> {
let stripped = parse::strip_arxiv_scheme(s);
parse::validate_arxiv(stripped)?;
Ok(ArxivId(stripped.to_string()))
}
}
impl Ref {
    /// Parses a reference string, deciding between DOI and arXiv from its
    /// shape.
    ///
    /// Input that carries a `doi:` prefix or starts with `10.` is parsed as a
    /// DOI; everything else (including `arxiv:`-prefixed input) is parsed as
    /// an arXiv identifier.
    ///
    /// # Errors
    ///
    /// Returns [`RefParseError::Empty`] for an empty string, otherwise the
    /// error produced by [`Doi::parse`] or [`ArxivId::parse`], whichever was
    /// selected.
    pub fn parse(s: &str) -> Result<Self, RefParseError> {
        if s.is_empty() {
            return Err(RefParseError::Empty);
        }
        // A string with an `arxiv:` prefix can never also start with `10.`,
        // so one DOI test covers both the prefixed and the bare DOI form.
        let is_doi = parse::has_doi_scheme(s) || s.starts_with("10.");
        if is_doi {
            Doi::parse(s).map(Self::Doi)
        } else {
            ArxivId::parse(s).map(Self::Arxiv)
        }
    }
}
/// Scheme-prefix handling and shape validation for DOI and arXiv identifiers.
mod parse {
    use super::{RefParseError, DOI_SUFFIX_MAX_LEN};

    // Byte length of the `doi:` prefix.
    const DOI_SCHEME_LEN: usize = 4;
    // Byte length of the `arxiv:` prefix.
    const ARXIV_SCHEME_LEN: usize = 6;

    /// True when `s` begins with `doi:` in any ASCII letter case.
    pub(crate) fn has_doi_scheme(s: &str) -> bool {
        // `get` yields `None` for short input and for a cut inside a
        // multi-byte character, so no separate boundary check is needed.
        matches!(s.get(..DOI_SCHEME_LEN), Some(head) if head.eq_ignore_ascii_case("doi:"))
    }

    /// True when `s` begins with `arxiv:` in any ASCII letter case.
    pub(crate) fn has_arxiv_scheme(s: &str) -> bool {
        matches!(s.get(..ARXIV_SCHEME_LEN), Some(head) if head.eq_ignore_ascii_case("arxiv:"))
    }

    /// Returns `s` without a leading `doi:` prefix; `s` itself if there is none.
    pub(crate) fn strip_doi_scheme(s: &str) -> &str {
        if has_doi_scheme(s) {
            s.split_at(DOI_SCHEME_LEN).1
        } else {
            s
        }
    }

    /// Returns `s` without a leading `arxiv:` prefix; `s` itself if there is none.
    pub(crate) fn strip_arxiv_scheme(s: &str) -> &str {
        if has_arxiv_scheme(s) {
            s.split_at(ARXIV_SCHEME_LEN).1
        } else {
            s
        }
    }

    /// Allowlist for DOI suffix characters: ASCII letters and digits plus
    /// `. _ / ( ) - :`.
    fn is_doi_suffix_char(c: char) -> bool {
        c.is_ascii_alphanumeric() || matches!(c, '.' | '_' | '/' | '(' | ')' | '-' | ':')
    }

    /// Validates a scheme-less DOI of the form `10.<registrant>/<suffix>`.
    ///
    /// Checks run in this order, and the first failure is returned: empty
    /// input, `10.` prefix, `/` separator, registrant (4 to 9 ASCII digits),
    /// non-empty suffix, suffix length (at most `DOI_SUFFIX_MAX_LEN` bytes),
    /// suffix character allowlist.
    pub(crate) fn validate_doi(s: &str) -> Result<(), RefParseError> {
        if s.is_empty() {
            return Err(RefParseError::Empty);
        }
        let after_prefix = match s.strip_prefix("10.") {
            Some(rest) => rest,
            None => return Err(RefParseError::MissingDoiPrefix),
        };
        // The first `/` separates registrant from suffix; later slashes
        // belong to the suffix.
        let (registrant, suffix) = match after_prefix.split_once('/') {
            Some(parts) => parts,
            None => return Err(RefParseError::MissingDoiSuffixSeparator),
        };

        let registrant_ok =
            (4..=9).contains(&registrant.len()) && registrant.bytes().all(|b| b.is_ascii_digit());
        if !registrant_ok {
            return Err(RefParseError::InvalidDoiRegistrant);
        }

        if suffix.is_empty() {
            return Err(RefParseError::EmptyDoiSuffix);
        }
        let len = suffix.len();
        if len > DOI_SUFFIX_MAX_LEN {
            return Err(RefParseError::DoiSuffixTooLong {
                len,
                max: DOI_SUFFIX_MAX_LEN,
            });
        }
        match suffix.chars().find(|&c| !is_doi_suffix_char(c)) {
            Some(ch) => Err(RefParseError::InvalidDoiSuffixChar { ch }),
            None => Ok(()),
        }
    }

    /// Validates a scheme-less arXiv identifier against the new-style shape
    /// first and the old-style shape second.
    pub(crate) fn validate_arxiv(s: &str) -> Result<(), RefParseError> {
        if s.is_empty() {
            return Err(RefParseError::Empty);
        }
        validate_arxiv_new(s)
            .or_else(|()| validate_arxiv_old(s))
            .map_err(|()| RefParseError::InvalidArxivShape)
    }

    /// Splits `s` into its leading run of ASCII digits and the remainder.
    fn split_leading_digits(s: &str) -> (&str, &str) {
        // Every byte before `cut` is an ASCII digit, so `cut` always falls on
        // a character boundary.
        let cut = s
            .bytes()
            .position(|b| !b.is_ascii_digit())
            .unwrap_or(s.len());
        s.split_at(cut)
    }

    /// True when `rest` is empty or is `v` followed by one or more ASCII
    /// digits and nothing else.
    fn is_optional_version(rest: &str) -> bool {
        match rest.strip_prefix('v') {
            Some(version) => !version.is_empty() && version.bytes().all(|b| b.is_ascii_digit()),
            None => rest.is_empty(),
        }
    }

    /// New-style shape: four digits, `.`, four or five digits, optional `vN`.
    fn validate_arxiv_new(s: &str) -> Result<(), ()> {
        let (head, tail) = s.split_once('.').ok_or(())?;
        let head_ok = head.len() == 4 && head.bytes().all(|b| b.is_ascii_digit());
        let (sequence, rest) = split_leading_digits(tail);

        if head_ok && (4..=5).contains(&sequence.len()) && is_optional_version(rest) {
            Ok(())
        } else {
            Err(())
        }
    }

    /// Old-style shape: `archive[.XX]/NNNNNNN[vN]`, where `archive` is
    /// lowercase ASCII letters and inner hyphens, `XX` is two uppercase ASCII
    /// letters, and the number is exactly seven digits.
    fn validate_arxiv_old(s: &str) -> Result<(), ()> {
        let (class, id) = s.split_once('/').ok_or(())?;
        let (archive, subject) = match class.split_once('.') {
            Some((archive, subject)) => (archive, Some(subject)),
            None => (class, None),
        };

        let archive_ok = !archive.is_empty()
            && archive.bytes().all(|b| b.is_ascii_lowercase() || b == b'-')
            && !archive.starts_with('-')
            && !archive.ends_with('-');
        let subject_ok = match subject {
            Some(code) => code.len() == 2 && code.bytes().all(|b| b.is_ascii_uppercase()),
            None => true,
        };
        let (number, rest) = split_leading_digits(id);

        if archive_ok && subject_ok && number.len() == 7 && is_optional_version(rest) {
            Ok(())
        } else {
            Err(())
        }
    }
}
/// Reasons a string can fail to parse as a [`Doi`], [`ArxivId`] or [`Ref`].
///
/// Every variant converts to [`ErrorCode::InvalidRef`] through the `From`
/// impl in this file.
#[derive(Debug, Clone, PartialEq, Eq, thiserror::Error)]
#[non_exhaustive]
pub enum RefParseError {
/// The input was empty, or nothing was left after removing a scheme prefix.
#[error("empty input")]
Empty,
/// The DOI text does not start with `10.`.
#[error("DOI must begin with '10.'")]
MissingDoiPrefix,
/// No `/` follows the `10.` prefix, so registrant and suffix cannot be split.
#[error("DOI must contain '/' between registrant and suffix")]
MissingDoiSuffixSeparator,
/// The text between `10.` and the first `/` is not 4 to 9 ASCII digits.
#[error("DOI registrant must be 4–9 ASCII digits")]
InvalidDoiRegistrant,
/// Nothing follows the first `/`.
#[error("DOI suffix is empty")]
EmptyDoiSuffix,
/// The suffix is longer than [`DOI_SUFFIX_MAX_LEN`] bytes.
#[error("DOI suffix is {len} bytes; maximum is {max}")]
DoiSuffixTooLong {
/// Byte length of the rejected suffix.
len: usize,
/// The limit that was exceeded ([`DOI_SUFFIX_MAX_LEN`]).
max: usize,
},
/// The suffix contains a character outside ASCII letters, ASCII digits and
/// `. _ / ( ) - :`.
#[error("DOI suffix contains invalid character {ch:?}")]
InvalidDoiSuffixChar {
/// The first offending character in the suffix.
ch: char,
},
/// The text matches neither the new-style (`NNNN.NNNN[N][vN]`) nor the
/// old-style (`archive[.XX]/NNNNNNN[vN]`) arXiv shape.
#[error("input does not match any known arXiv id shape")]
InvalidArxivShape,
}
impl From<RefParseError> for ErrorCode {
fn from(_: RefParseError) -> Self {
ErrorCode::InvalidRef
}
}
/// A key string derived from a [`Ref`] by [`Ref::safekey`].
///
/// As produced by `Ref::safekey` it contains only ASCII letters, ASCII digits
/// and `.`, `-`, `_`. Serde treats the type as a bare string (`transparent`).
// NOTE(review): presumably used as a filesystem/store key; the consumers are
// outside this file — verify.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(transparent)]
pub struct Safekey(pub(crate) String);
impl Safekey {
    /// Borrows the key text.
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }
}
impl Ref {
    /// Borrows the identifier text of whichever variant this is, without any
    /// `doi_` / `arxiv_` kind marker.
    pub fn as_input_str(&self) -> &str {
        match self {
            Self::Doi(doi) => doi.as_str(),
            Self::Arxiv(arxiv) => arxiv.as_str(),
        }
    }

    /// Derives the [`Safekey`] for this reference.
    ///
    /// Steps:
    /// 1. Build the raw form `doi_<id>` or `arxiv_<id>`.
    /// 2. Replace every character other than ASCII letters, ASCII digits,
    ///    `.`, `-` and `_` with `_`, collapsing runs of `_` into one.
    /// 3. Trim leading and trailing `_`.
    /// 4. If the result is longer than 192 bytes, keep the first 192 and
    ///    append `_` plus the first four bytes of the SHA-256 of the raw form
    ///    as eight lowercase hex digits.
    pub fn safekey(&self) -> Safekey {
        // Longest key body kept verbatim before the hash suffix kicks in.
        const MAX_BODY_LEN: usize = 192;

        let (kind, id) = match self {
            Self::Doi(doi) => ("doi", doi.as_str()),
            Self::Arxiv(arxiv) => ("arxiv", arxiv.as_str()),
        };
        let raw = format!("{}_{}", kind, id);

        // Escape and collapse in a single pass: a character that maps to `_`
        // is only appended when the previous output character was not `_`.
        let mut collapsed = String::with_capacity(raw.len());
        for c in raw.chars() {
            if c.is_ascii_alphanumeric() || c == '.' || c == '-' {
                collapsed.push(c);
            } else if !collapsed.ends_with('_') {
                collapsed.push('_');
            }
        }

        // `collapsed` is pure ASCII, so byte slicing below cannot split a
        // character.
        let body = collapsed.trim_matches('_');
        if body.len() <= MAX_BODY_LEN {
            return Safekey(body.to_owned());
        }

        // The hash is taken over the raw form, not the escaped one.
        let digest = sha2::Sha256::digest(raw.as_bytes());
        let hash = hex::encode(&digest[..4]);
        Safekey(format!("{}_{}", &body[..MAX_BODY_LEN], hash))
    }
}
/// Error codes, serialized by serde in `SCREAMING_SNAKE_CASE`
/// (the same strings [`ErrorCode::as_wire`] returns).
///
/// The per-variant descriptions below paraphrase the variant names; apart from
/// `InvalidRef`, the code that produces them is outside this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[non_exhaustive]
pub enum ErrorCode {
/// The reference could not be parsed; every [`RefParseError`] maps here.
InvalidRef,
/// No open-access copy is available.
NoOaAvailable,
/// The request was rate limited.
RateLimited,
/// A network-level failure.
NetworkError,
/// A store failure.
StoreError,
/// A log failure.
LogError,
/// A required capability was denied.
CapabilityDenied,
/// A fetch timed out.
FetchTimeout,
/// The schema is newer than supported.
SchemaTooNew,
/// Waiting for a lock timed out.
LockTimeout,
/// An internal error.
InternalError,
/// The requested operation is not implemented.
NotImplemented,
}
impl ErrorCode {
#[must_use]
pub fn as_wire(&self) -> &'static str {
match self {
ErrorCode::InvalidRef => "INVALID_REF",
ErrorCode::NoOaAvailable => "NO_OA_AVAILABLE",
ErrorCode::RateLimited => "RATE_LIMITED",
ErrorCode::NetworkError => "NETWORK_ERROR",
ErrorCode::StoreError => "STORE_ERROR",
ErrorCode::LogError => "LOG_ERROR",
ErrorCode::CapabilityDenied => "CAPABILITY_DENIED",
ErrorCode::FetchTimeout => "FETCH_TIMEOUT",
ErrorCode::SchemaTooNew => "SCHEMA_TOO_NEW",
ErrorCode::LockTimeout => "LOCK_TIMEOUT",
ErrorCode::InternalError => "INTERNAL_ERROR",
ErrorCode::NotImplemented => "NOT_IMPLEMENTED",
}
}
}
/// Reason carried by a [`DenialContext`], serialized by serde in `snake_case`
/// (e.g. `redirect_not_in_allowlist`).
///
/// The per-variant descriptions below paraphrase the variant names; the code
/// that emits them is outside this file.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DenialReason {
/// A redirect target was not in the allowlist.
RedirectNotInAllowlist,
/// An insecure URL scheme was used.
InsecureScheme,
/// The host is on a block list.
HostInBlockList,
/// A size cap was exceeded.
SizeCapExceeded,
/// Schema drift was detected.
SchemaDrift,
/// The required capability was not granted.
CapabilityNotGranted,
/// Denied because of a rate-limit window.
RateLimitWindow,
/// The target resolved to a private address (SSRF protection).
SsrfPrivateAddress,
/// The content type did not match what was expected.
ContentTypeMismatch,
}
/// Structured detail about a denial.
///
/// Only `reason` is mandatory. Each optional field is omitted from the
/// serialized form when `None` and defaults to `None` when absent on input;
/// unknown fields are rejected on deserialization (`deny_unknown_fields`).
// NOTE(review): the field meanings below are inferred from field names and
// from the values used in this file's tests; the producers are outside this
// file — confirm.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct DenialContext {
/// Why the operation was denied.
pub reason: DenialReason,
/// Name of the source involved (the tests use `"crossref"`).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub source: Option<String>,
/// The value that was attempted (the tests use a host name).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub attempted: Option<String>,
/// The values that would have been accepted (the tests use an allowlist of
/// host patterns). `Some(vec![])` is serialized as `[]`, distinct from `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub expected: Option<Vec<String>>,
/// Index of the hop at which the denial happened.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub hop_index: Option<u8>,
/// The cap that applied (the tests pair it with `size_cap_exceeded`).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub cap: Option<u64>,
/// The actual value measured against `cap`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub actual: Option<u64>,
}
/// Unit marker type used for [`CapabilityProfile::oa`]; it carries no data, so
/// the capability it stands for cannot be switched off through the profile.
#[derive(Debug, Clone, Copy)]
pub struct AlwaysOn;
/// Per-source switches for metadata access; all `false` by default.
///
/// [`CapabilityProfile::from_env`] sets each flag to `true` only when the
/// matching `DOIGET_ENABLE_*` environment variable is set and the `metadata`
/// cargo feature is compiled in.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct MetadataAccess {
/// Controlled by `DOIGET_ENABLE_OPENALEX`.
pub openalex: bool,
/// Controlled by `DOIGET_ENABLE_S2`.
pub semantic_scholar: bool,
/// Controlled by `DOIGET_ENABLE_DOAJ`.
pub doaj: bool,
}
/// Rate-limit settings. Fields are crate-private; outside the crate they can
/// only be read through the accessor methods, and the only value defined in
/// this file is [`RateLimits::HARD_CODED`].
#[derive(Debug, Clone, Copy)]
#[non_exhaustive]
pub struct RateLimits {
// Concurrency cap (see `MAX_CONCURRENT_FETCHES`).
pub(crate) max_concurrent_fetches: u32,
// Request-rate cap in fetches per second (see `MAX_FETCHES_PER_SECOND`).
pub(crate) max_fetches_per_second: f32,
// Per-source backoff in milliseconds.
pub(crate) per_source_backoff_ms: u64,
}
impl RateLimits {
pub const HARD_CODED: Self = Self {
max_concurrent_fetches: MAX_CONCURRENT_FETCHES,
max_fetches_per_second: MAX_FETCHES_PER_SECOND,
per_source_backoff_ms: 200,
};
pub const fn max_concurrent_fetches(&self) -> u32 {
self.max_concurrent_fetches
}
pub const fn max_fetches_per_second(&self) -> f32 {
self.max_fetches_per_second
}
pub const fn per_source_backoff_ms(&self) -> u64 {
self.per_source_backoff_ms
}
}
/// A text-and-data-mining (TDM) grant for one publisher, built by
/// [`CapabilityProfile::from_env`] when the agreement variable is `"1"`, a
/// non-empty key is present and the publisher's cargo feature is compiled in.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct TdmGrant {
/// The API key, wrapped in `secrecy::SecretString`. The field only exists
/// when at least one `tdm-*` cargo feature is compiled in.
#[cfg(any(
feature = "tdm-elsevier",
feature = "tdm-aps",
feature = "tdm-springer"
))]
pub api_key: secrecy::SecretString,
/// Name of the agreement environment variable that produced this grant.
pub agree_env_var: String,
/// UTC time at which this grant value was constructed.
pub agreed_at: chrono::DateTime<chrono::Utc>,
}
impl Default for TdmGrant {
fn default() -> Self {
Self {
#[cfg(any(
feature = "tdm-elsevier",
feature = "tdm-aps",
feature = "tdm-springer"
))]
api_key: secrecy::SecretString::from(String::new()),
agree_env_var: String::new(),
agreed_at: chrono::Utc::now(),
}
}
}
/// The set of capabilities in effect, as resolved by
/// [`CapabilityProfile::from_env`].
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct CapabilityProfile {
/// Marker for the capability that is always on.
// NOTE(review): `oa` presumably stands for open access — confirm.
pub oa: AlwaysOn,
/// Which metadata sources are enabled.
pub metadata: MetadataAccess,
/// Elsevier TDM grant, if one was resolved.
pub tdm_elsevier: Option<TdmGrant>,
/// APS TDM grant, if one was resolved.
pub tdm_aps: Option<TdmGrant>,
/// Springer TDM grant, if one was resolved.
pub tdm_springer: Option<TdmGrant>,
/// Rate limits; `from_env` always uses [`RateLimits::HARD_CODED`].
pub rate_limits: RateLimits,
}
/// Inconsistent TDM environment configuration detected by
/// [`CapabilityProfile::from_env`].
#[derive(Debug, thiserror::Error)]
pub enum CapabilityError {
/// The agreement variable is exactly `"1"` but the key variable is unset,
/// empty, or not valid Unicode.
#[error("env {agree_var} is set but {key_var} is missing")]
AgreedButNoKey {
/// Name of the agreement environment variable.
agree_var: String,
/// Name of the key environment variable that is missing.
key_var: String,
},
/// A non-empty key is present but the agreement variable is not exactly
/// `"1"` (it is unset or holds some other value).
// NOTE(review): the message reads "key for {agree_var}", i.e. it names the
// agreement variable twice and never names the key variable; this variant has
// no `key_var` field to print. Confirm whether the wording is intended.
#[error("key for {agree_var} is present but {agree_var} is not set to '1'")]
KeyButNotAgreed {
/// Name of the agreement environment variable.
agree_var: String,
},
}
impl CapabilityProfile {
pub fn from_env() -> Result<Self, CapabilityError> {
let metadata = MetadataAccess {
openalex: resolve_metadata_flag(
"DOIGET_ENABLE_OPENALEX",
"metadata",
cfg!(feature = "metadata"),
),
semantic_scholar: resolve_metadata_flag(
"DOIGET_ENABLE_S2",
"metadata",
cfg!(feature = "metadata"),
),
doaj: resolve_metadata_flag(
"DOIGET_ENABLE_DOAJ",
"metadata",
cfg!(feature = "metadata"),
),
};
let tdm_elsevier = resolve_tdm_grant(
"DOIGET_AGREE_TDM_ELSEVIER",
"DOIGET_KEY_ELSEVIER",
"tdm-elsevier",
cfg!(feature = "tdm-elsevier"),
)?;
let tdm_aps = resolve_tdm_grant(
"DOIGET_AGREE_TDM_APS",
"DOIGET_KEY_APS",
"tdm-aps",
cfg!(feature = "tdm-aps"),
)?;
let tdm_springer = resolve_tdm_grant(
"DOIGET_AGREE_TDM_SPRINGER",
"DOIGET_KEY_SPRINGER",
"tdm-springer",
cfg!(feature = "tdm-springer"),
)?;
Ok(Self {
oa: AlwaysOn,
metadata,
tdm_elsevier,
tdm_aps,
tdm_springer,
rate_limits: RateLimits::HARD_CODED,
})
}
}
/// Decides whether one metadata source is enabled.
///
/// Returns `true` only when `env_var` is set and `feature_enabled` is `true`.
/// When the variable is set but the feature is missing, a warning naming
/// `env_var` and `feature` is logged and the result is `false`.
// NOTE(review): presence alone enables the flag, so an empty value or "0"
// also counts as set; the TDM agreement variables, by contrast, require
// exactly "1". Confirm this difference is intended.
fn resolve_metadata_flag(env_var: &str, feature: &str, feature_enabled: bool) -> bool {
    if std::env::var_os(env_var).is_none() {
        return false;
    }
    if !feature_enabled {
        tracing::warn!(
            env_var,
            feature,
            "{} is set but feature {} was not compiled in; the source will be unavailable",
            env_var,
            feature
        );
    }
    feature_enabled
}
/// Resolves the TDM grant for one publisher from its agreement/key pair.
///
/// The user has agreed only when `agree_var` is exactly `"1"`. A key counts
/// as present only when `key_var` is set to a non-empty, valid-Unicode value.
///
/// | agreed | key     | result                                              |
/// |--------|---------|-----------------------------------------------------|
/// | yes    | present | grant if `feature_enabled`, else warning and `None` |
/// | yes    | absent  | `CapabilityError::AgreedButNoKey`                   |
/// | no     | present | `CapabilityError::KeyButNotAgreed`                  |
/// | no     | absent  | `None`                                              |
fn resolve_tdm_grant(
    agree_var: &str,
    key_var: &str,
    feature: &str,
    feature_enabled: bool,
) -> Result<Option<TdmGrant>, CapabilityError> {
    let agreed = match std::env::var(agree_var) {
        Ok(value) => value == "1",
        Err(_) => false,
    };
    let key = std::env::var(key_var).ok().filter(|v| !v.is_empty());

    match (agreed, key) {
        (false, None) => Ok(None),
        (false, Some(_)) => Err(CapabilityError::KeyButNotAgreed {
            agree_var: agree_var.to_string(),
        }),
        (true, None) => Err(CapabilityError::AgreedButNoKey {
            agree_var: agree_var.to_string(),
            key_var: key_var.to_string(),
        }),
        (true, Some(key)) => {
            if feature_enabled {
                return Ok(Some(build_tdm_grant(agree_var, key)));
            }
            // Agreement and key are both valid, but this build cannot use
            // them: warn and continue without a grant.
            tracing::warn!(
                env_var = agree_var,
                feature,
                "{} is set but feature {} was not compiled in; the source will be unavailable",
                agree_var,
                feature
            );
            Ok(None)
        }
    }
}
/// Builds a [`TdmGrant`] recording `agree_var` and the current UTC time.
///
/// When at least one `tdm-*` cargo feature is compiled in, `key` is stored as
/// the grant's `api_key`; otherwise the struct has no such field and `key` is
/// discarded.
fn build_tdm_grant(agree_var: &str, key: String) -> TdmGrant {
    // Without any `tdm-*` feature there is nowhere to put the key; consume it
    // here so the parameter is not reported as unused.
    #[cfg(not(any(
        feature = "tdm-elsevier",
        feature = "tdm-aps",
        feature = "tdm-springer"
    )))]
    let _ = key;

    TdmGrant {
        #[cfg(any(
            feature = "tdm-elsevier",
            feature = "tdm-aps",
            feature = "tdm-springer"
        ))]
        api_key: secrecy::SecretString::from(key),
        agree_env_var: agree_var.to_string(),
        agreed_at: chrono::Utc::now(),
    }
}
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
use super::*;
// Pins `RateLimits::HARD_CODED` to 5 concurrent fetches, 5.0 fetches per
// second and a 200 ms per-source backoff.
#[test]
fn rate_limits_hard_coded_match_legal_safeguards() {
assert_eq!(RateLimits::HARD_CODED.max_concurrent_fetches(), 5);
assert!((RateLimits::HARD_CODED.max_fetches_per_second() - 5.0).abs() < f32::EPSILON);
assert_eq!(RateLimits::HARD_CODED.per_source_backoff_ms(), 200);
}
// Pins the batch, queue, DOI-suffix and EOF-shutdown constants to their
// current values, and checks that `MAX_BATCH_REFS` stays an alias.
#[test]
fn batch_size_caps_match_security_doc() {
assert_eq!(MCP_BATCH_MAX_SIZE, 100);
assert_eq!(MCP_QUEUE_DEPTH_MAX, 100);
assert_eq!(DOI_SUFFIX_MAX_LEN, 256);
assert_eq!(MCP_STDIN_EOF_SHUTDOWN_SEC, 5);
assert_eq!(MAX_BATCH_REFS, MCP_BATCH_MAX_SIZE);
}
// Pins `SCHEMA_VERSION` to "1.0".
#[test]
fn schema_version_is_pinned_to_1_0() {
assert_eq!(SCHEMA_VERSION, "1.0");
}
// Guard that changes one environment variable for the duration of a test and
// restores the previous state (value or absence) when dropped.
struct EnvGuard {
// Name of the variable being guarded.
var: &'static str,
// Value the variable had before the guard touched it; `None` if it was unset.
prior: Option<std::ffi::OsString>,
}
impl EnvGuard {
// Removes `var` from the environment, remembering its previous value.
fn unset(var: &'static str) -> Self {
let prior = std::env::var_os(var);
std::env::remove_var(var);
EnvGuard { var, prior }
}
// Sets `var` to `value`, remembering its previous value.
fn set(var: &'static str, value: &str) -> Self {
let prior = std::env::var_os(var);
std::env::set_var(var, value);
EnvGuard { var, prior }
}
}
impl Drop for EnvGuard {
// Puts the variable back the way it was before the guard was created.
fn drop(&mut self) {
match &self.prior {
Some(v) => std::env::set_var(self.var, v),
None => std::env::remove_var(self.var),
}
}
}
// Unsets every environment variable read by `CapabilityProfile::from_env` and
// returns the guards; the variables are restored when the returned Vec drops.
fn unset_all_capability_env_vars() -> Vec<EnvGuard> {
[
"DOIGET_ENABLE_OPENALEX",
"DOIGET_ENABLE_S2",
"DOIGET_ENABLE_DOAJ",
"DOIGET_AGREE_TDM_ELSEVIER",
"DOIGET_KEY_ELSEVIER",
"DOIGET_AGREE_TDM_APS",
"DOIGET_KEY_APS",
"DOIGET_AGREE_TDM_SPRINGER",
"DOIGET_KEY_SPRINGER",
]
.iter()
.map(|v| EnvGuard::unset(v))
.collect()
}
// The `from_env` tests below mutate process-wide environment variables, so
// each is marked `#[serial_test::serial]` to keep them from running
// concurrently with one another.

// Clean environment: no TDM grants, no metadata sources, hard-coded limits.
#[test]
#[serial_test::serial]
fn from_env_no_env_vars_set_returns_tier_1_only() {
let _g = unset_all_capability_env_vars();
let p = CapabilityProfile::from_env().expect("clean env never errors");
assert!(p.tdm_elsevier.is_none());
assert!(p.tdm_aps.is_none());
assert!(p.tdm_springer.is_none());
assert!(!p.metadata.openalex);
assert!(!p.metadata.semantic_scholar);
assert!(!p.metadata.doaj);
assert_eq!(p.rate_limits.max_concurrent_fetches(), 5);
}
// Clean environment again, checking only the TDM grants (a subset of the
// assertions in the test above).
#[test]
#[serial_test::serial]
fn from_env_no_tdm_returns_tier_1_profile() {
let _g = unset_all_capability_env_vars();
let p = CapabilityProfile::from_env().expect("no TDM env -> Ok");
assert!(p.tdm_elsevier.is_none());
assert!(p.tdm_aps.is_none());
assert!(p.tdm_springer.is_none());
}
// Agreement "1" with the key variable unset is an `AgreedButNoKey` error.
#[test]
#[serial_test::serial]
fn from_env_agreed_but_no_key_errs() {
let _g = unset_all_capability_env_vars();
let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
let result = CapabilityProfile::from_env();
match result {
Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
}
other => panic!("expected AgreedButNoKey, got {:?}", other),
}
}
// An empty key is treated like a missing key when the user has agreed.
#[test]
#[serial_test::serial]
fn from_env_agreed_but_empty_key_errs() {
let _g = unset_all_capability_env_vars();
let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");
let result = CapabilityProfile::from_env();
match result {
Err(CapabilityError::AgreedButNoKey { agree_var, key_var }) => {
assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
assert_eq!(key_var, "DOIGET_KEY_ELSEVIER");
}
other => panic!("expected AgreedButNoKey for empty key, got {:?}", other),
}
}
// An empty key with no agreement variable is not an error: it resolves to
// "no grant".
#[test]
#[serial_test::serial]
fn from_env_empty_key_without_agree_is_no_grant() {
let _g = unset_all_capability_env_vars();
let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "");
let p = CapabilityProfile::from_env()
.expect("empty key + agree unset must be Ok(None), not an error");
assert!(
p.tdm_elsevier.is_none(),
"empty DOIGET_KEY_ELSEVIER with no agree var must yield no grant"
);
assert!(p.tdm_aps.is_none());
assert!(p.tdm_springer.is_none());
}
// A non-empty key with the agreement variable unset is `KeyButNotAgreed`.
#[test]
#[serial_test::serial]
fn from_env_key_but_not_agreed_errs() {
let _g = unset_all_capability_env_vars();
let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
let result = CapabilityProfile::from_env();
match result {
Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
}
other => panic!("expected KeyButNotAgreed, got {:?}", other),
}
}
// Only the exact value "1" counts as agreement; "true" does not.
#[test]
#[serial_test::serial]
fn from_env_agree_not_one_errs() {
let _g = unset_all_capability_env_vars();
let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "true");
let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
let result = CapabilityProfile::from_env();
match result {
Err(CapabilityError::KeyButNotAgreed { agree_var }) => {
assert_eq!(agree_var, "DOIGET_AGREE_TDM_ELSEVIER");
}
other => panic!("expected KeyButNotAgreed, got {:?}", other),
}
}
// Agreement "1" plus a key: a grant when `tdm-elsevier` is compiled in (with
// the key threaded into `api_key`), otherwise `None`.
#[test]
#[serial_test::serial]
fn from_env_both_set_correctly_returns_grant() {
let _g = unset_all_capability_env_vars();
let _agree = EnvGuard::set("DOIGET_AGREE_TDM_ELSEVIER", "1");
let _key = EnvGuard::set("DOIGET_KEY_ELSEVIER", "sk-test");
let p = CapabilityProfile::from_env().expect("agree=1 + key -> Ok");
if cfg!(feature = "tdm-elsevier") {
let grant = p
.tdm_elsevier
.as_ref()
.expect("feature tdm-elsevier compiled in -> Some(TdmGrant)");
assert_eq!(grant.agree_env_var, "DOIGET_AGREE_TDM_ELSEVIER");
// `api_key` only exists when some `tdm-*` feature is compiled in, so the
// access has to be cfg-gated rather than guarded by the runtime `cfg!` above.
#[cfg(any(
feature = "tdm-elsevier",
feature = "tdm-aps",
feature = "tdm-springer"
))]
{
use secrecy::ExposeSecret as _;
assert_eq!(
grant.api_key.expose_secret(),
"sk-test",
"the DOIGET_KEY_ELSEVIER value must be threaded into \
TdmGrant::api_key (issue #153)"
);
}
} else {
assert!(
p.tdm_elsevier.is_none(),
"feature tdm-elsevier NOT compiled in -> None (warn-and-skip)"
);
}
}
// A metadata enable variable never produces an error; the flag follows
// whether the `metadata` feature is compiled in.
#[test]
#[serial_test::serial]
fn from_env_metadata_env_warns_without_feature() {
let _g = unset_all_capability_env_vars();
let _enable = EnvGuard::set("DOIGET_ENABLE_OPENALEX", "1");
let p = CapabilityProfile::from_env().expect("metadata env never errors");
if cfg!(feature = "metadata") {
assert!(p.metadata.openalex);
} else {
assert!(!p.metadata.openalex);
}
}
// One entry of the safekey reference-vector fixture: a scheme-prefixed input
// and the safekey it must produce.
#[derive(Deserialize)]
struct SafekeyVector {
input: String,
expected: String,
}
// Top-level shape of the fixture file.
#[derive(Deserialize)]
struct SafekeyVectorFile {
vectors: Vec<SafekeyVector>,
}
// Builds a `Ref` from a fixture input by its `doi:` / `arxiv:` prefix,
// constructing the newtype directly so no parse validation is applied.
// Panics on any other prefix.
fn ref_from_vector_input(input: &str) -> Ref {
if let Some(rest) = input.strip_prefix("doi:") {
Ref::Doi(Doi(rest.to_string()))
} else if let Some(rest) = input.strip_prefix("arxiv:") {
Ref::Arxiv(ArxivId(rest.to_string()))
} else {
panic!(
"vectors.json entry has unknown ref scheme (expected doi: or arxiv: prefix): {}",
input
);
}
}
// Runs every reference vector through `Ref::safekey` and reports all
// mismatches together rather than stopping at the first one.
#[test]
fn safekey_matches_reference_vectors() {
let raw = include_str!("../../../tests/fixtures/safekey/vectors.json");
let parsed: SafekeyVectorFile =
serde_json::from_str(raw).expect("vectors.json is valid JSON matching schema");
assert_eq!(
parsed.vectors.len(),
100,
"vectors.json MUST be exactly 100 entries (NORMATIVE per docs/SAFEKEY.md §5); got {}",
parsed.vectors.len()
);
let mut failures: Vec<String> = Vec::new();
for v in &parsed.vectors {
let r = ref_from_vector_input(&v.input);
let got = r.safekey().as_str().to_string();
if got != v.expected {
failures.push(format!(
"input={:?}\n expected={:?}\n got ={:?}",
v.input, v.expected, got
));
}
}
assert!(
failures.is_empty(),
"{}/{} safekey reference vectors failed:\n{}",
failures.len(),
parsed.vectors.len(),
failures.join("\n")
);
}
// Long inputs: the key is 192 bytes of body, `_`, then 8 lowercase hex digits
// (201 bytes total) taken from the SHA-256 of the raw `doi_<id>` form.
#[test]
fn safekey_truncates_long_inputs_with_sha256_suffix() {
let suffix = "a".repeat(220);
let doi = Doi(format!("10.1234/{}", suffix));
let key = Ref::Doi(doi).safekey();
let s = key.as_str();
assert_eq!(
s.len(),
201,
"expected 201-char truncated key, got {}: {}",
s.len(),
s
);
assert_eq!(&s[192..193], "_", "expected '_' separator at byte 192");
let hash_part = &s[193..];
assert_eq!(hash_part.len(), 8, "hash suffix must be 8 hex chars");
assert!(
hash_part
.chars()
.all(|c| c.is_ascii_hexdigit() && !c.is_ascii_uppercase()),
"hash suffix must be lowercase hex: {}",
hash_part
);
let key2 = Ref::Doi(Doi(format!("10.1234/{}", "a".repeat(220)))).safekey();
assert_eq!(s, key2.as_str(), "safekey must be deterministic");
use sha2::Digest;
// Recompute the expected suffix independently of `hex::encode`.
let raw = format!("doi_10.1234/{}", "a".repeat(220));
let expected_hash = {
let digest = sha2::Sha256::digest(raw.as_bytes());
format!(
"{:02x}{:02x}{:02x}{:02x}",
digest[0], digest[1], digest[2], digest[3]
)
};
assert_eq!(
hash_part, expected_hash,
"hash must match SHA-256 of raw form"
);
}
// ---- Doi::parse: accepted inputs ----

// A bare `10.<registrant>/<suffix>` DOI is stored unchanged.
#[test]
fn doi_parse_accepts_bare_canonical_form() {
let d = Doi::parse("10.1234/example").expect("canonical bare DOI");
assert_eq!(d.as_str(), "10.1234/example");
}
// The `doi:` prefix is accepted and not stored.
#[test]
fn doi_parse_accepts_doi_uri_scheme() {
let d = Doi::parse("doi:10.1234/example").expect("doi: scheme accepted");
assert_eq!(d.as_str(), "10.1234/example");
}
// Mixed-case letters, digits and dots in the suffix.
#[test]
fn doi_parse_accepts_complex_real_world_suffix() {
let d = Doi::parse("10.1103/PhysRevLett.130.200601").expect("real-world PhysRev DOI");
assert_eq!(d.as_str(), "10.1103/PhysRevLett.130.200601");
}
// Parentheses and hyphens in the suffix.
#[test]
fn doi_parse_accepts_parens_in_suffix() {
let d = Doi::parse("10.1016/S0370-1573(98)00122-3").expect("parens in suffix");
assert_eq!(d.as_str(), "10.1016/S0370-1573(98)00122-3");
}
// Only the first `/` separates registrant from suffix; later ones are suffix.
#[test]
fn doi_parse_accepts_nested_slashes_in_suffix() {
let d = Doi::parse("10.1234/foo/bar/baz").expect("nested slashes");
assert_eq!(d.as_str(), "10.1234/foo/bar/baz");
}
// `:` is allowed in the suffix.
#[test]
fn doi_parse_accepts_colon_in_legacy_kluwer_suffix() {
let d = Doi::parse("10.1023/A:1019601218492").expect("legacy Kluwer colon DOI");
assert_eq!(d.as_str(), "10.1023/A:1019601218492");
}
// `:` in the suffix, with and without a `doi:` prefix on the input.
#[test]
fn doi_parse_accepts_colon_in_edp_jphys_suffix() {
let d = Doi::parse("10.1051/jphys:0198900500120136500").expect("EDP jphys colon DOI");
assert_eq!(d.as_str(), "10.1051/jphys:0198900500120136500");
let d2 = Doi::parse("doi:10.1051/jphys:0198500460100164500").expect("scheme + colon");
assert_eq!(d2.as_str(), "10.1051/jphys:0198500460100164500");
}
// `;` is not in the suffix allowlist, and the error names it.
#[test]
fn doi_parse_rejects_semicolon_in_suffix() {
let result = Doi::parse("10.1234/foo;bar");
assert!(
matches!(result, Err(RefParseError::InvalidDoiSuffixChar { ch: ';' })),
"expected InvalidDoiSuffixChar with ch=';', got {:?}",
result
);
}
// A suffix of exactly `DOI_SUFFIX_MAX_LEN` bytes is still accepted.
#[test]
fn doi_parse_accepts_suffix_at_max_len_boundary() {
let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN);
let input = format!("10.1234/{}", suffix);
let d = Doi::parse(&input).expect("suffix at max len");
assert_eq!(d.as_str().len(), "10.1234/".len() + DOI_SUFFIX_MAX_LEN);
}
// The `doi:` prefix is matched case-insensitively.
#[test]
fn doi_parse_uri_scheme_is_case_insensitive() {
let d = Doi::parse("DOI:10.1234/example").expect("uppercase scheme");
assert_eq!(d.as_str(), "10.1234/example");
}
// ---- Doi::parse: rejected inputs, one per error variant ----

#[test]
fn doi_parse_rejects_missing_10_prefix() {
assert_eq!(
Doi::parse("11.1234/example"),
Err(RefParseError::MissingDoiPrefix)
);
}
#[test]
fn doi_parse_rejects_empty_input() {
assert_eq!(Doi::parse(""), Err(RefParseError::Empty));
}
#[test]
fn doi_parse_rejects_missing_suffix_separator() {
assert_eq!(
Doi::parse("10.1234"),
Err(RefParseError::MissingDoiSuffixSeparator)
);
}
#[test]
fn doi_parse_rejects_empty_suffix() {
assert_eq!(Doi::parse("10.1234/"), Err(RefParseError::EmptyDoiSuffix));
}
// Registrant shorter than four digits.
#[test]
fn doi_parse_rejects_invalid_registrant_too_short() {
assert_eq!(
Doi::parse("10.12/example"),
Err(RefParseError::InvalidDoiRegistrant)
);
}
// Registrant of valid length but containing letters.
#[test]
fn doi_parse_rejects_non_digit_registrant() {
assert_eq!(
Doi::parse("10.12ab/example"),
Err(RefParseError::InvalidDoiRegistrant)
);
}
// A newline in the suffix is rejected and reported as the offending character.
#[test]
fn doi_parse_rejects_control_char_in_suffix() {
let result = Doi::parse("10.1234/foo\nbar");
assert!(
matches!(
result,
Err(RefParseError::InvalidDoiSuffixChar { ch: '\n' })
),
"got {:?}",
result
);
}
// One byte over the limit is rejected, and the error carries both lengths.
#[test]
fn doi_parse_rejects_suffix_over_max_len() {
let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN + 1);
let input = format!("10.1234/{}", suffix);
let result = Doi::parse(&input);
match result {
Err(RefParseError::DoiSuffixTooLong { len, max }) => {
assert_eq!(len, DOI_SUFFIX_MAX_LEN + 1);
assert_eq!(max, DOI_SUFFIX_MAX_LEN);
}
other => panic!("expected DoiSuffixTooLong, got {:?}", other),
}
}
// Non-ASCII characters are outside the suffix allowlist.
#[test]
fn doi_parse_rejects_non_ascii_in_suffix() {
let result = Doi::parse("10.1234/物理学");
assert!(
matches!(result, Err(RefParseError::InvalidDoiSuffixChar { .. })),
"got {:?}",
result
);
}
// ---- ArxivId::parse: accepted inputs ----

// New style with a four-digit sequence number.
#[test]
fn arxiv_parse_accepts_new_style_4_digit_seq() {
let a = ArxivId::parse("0704.0001").expect("new-style 4-digit seq");
assert_eq!(a.as_str(), "0704.0001");
}
// New style with a five-digit sequence number.
#[test]
fn arxiv_parse_accepts_new_style_5_digit_seq() {
let a = ArxivId::parse("2401.12345").expect("new-style 5-digit seq");
assert_eq!(a.as_str(), "2401.12345");
}
// The `vN` version suffix is kept in the stored identifier.
#[test]
fn arxiv_parse_accepts_new_style_with_version() {
let a = ArxivId::parse("2401.12345v2").expect("with version");
assert_eq!(a.as_str(), "2401.12345v2");
}
// Old style: hyphenated lowercase archive, `/`, seven digits.
#[test]
fn arxiv_parse_accepts_old_style() {
let a = ArxivId::parse("cond-mat/9501001").expect("old-style cond-mat");
assert_eq!(a.as_str(), "cond-mat/9501001");
}
// Old style with a two-letter uppercase subject class and a version suffix.
#[test]
fn arxiv_parse_accepts_old_style_with_subclass_and_version() {
let a = ArxivId::parse("astro-ph.CO/0703123v2").expect("old-style with subclass + version");
assert_eq!(a.as_str(), "astro-ph.CO/0703123v2");
}
// The `arxiv:` prefix is accepted and not stored.
#[test]
fn arxiv_parse_accepts_arxiv_uri_scheme() {
let a = ArxivId::parse("arxiv:2401.12345").expect("arxiv: scheme");
assert_eq!(a.as_str(), "2401.12345");
}
// The prefix is matched case-insensitively (`arXiv:`).
#[test]
fn arxiv_parse_accepts_arxiv_uri_scheme_mixed_case() {
let a = ArxivId::parse("arXiv:2401.12345v2").expect("arXiv: scheme");
assert_eq!(a.as_str(), "2401.12345v2");
}
// ---- ArxivId::parse: rejected inputs ----

#[test]
fn arxiv_parse_rejects_empty_input() {
assert_eq!(ArxivId::parse(""), Err(RefParseError::Empty));
}
// Neither a `.` (new style) nor a `/` (old style) is present.
#[test]
fn arxiv_parse_rejects_no_dot_or_slash() {
assert_eq!(
ArxivId::parse("notanarxivid"),
Err(RefParseError::InvalidArxivShape)
);
}
// The part before the `.` must be exactly four digits.
#[test]
fn arxiv_parse_rejects_new_style_wrong_head_length() {
assert_eq!(
ArxivId::parse("240.12345"),
Err(RefParseError::InvalidArxivShape)
);
}
// The sequence number must be at least four digits.
#[test]
fn arxiv_parse_rejects_new_style_seq_too_short() {
assert_eq!(
ArxivId::parse("2401.123"),
Err(RefParseError::InvalidArxivShape)
);
}
// The old-style number must be exactly seven digits.
#[test]
fn arxiv_parse_rejects_old_style_wrong_id_length() {
assert_eq!(
ArxivId::parse("cond-mat/95001"),
Err(RefParseError::InvalidArxivShape)
);
}
// A `v` with no digits after it is not a valid version suffix.
#[test]
fn arxiv_parse_rejects_invalid_version_suffix() {
assert_eq!(
ArxivId::parse("2401.12345v"),
Err(RefParseError::InvalidArxivShape)
);
}
// Trailing whitespace is not trimmed; a trailing newline is rejected.
#[test]
fn arxiv_parse_rejects_control_char() {
assert_eq!(
ArxivId::parse("2401.12345\n"),
Err(RefParseError::InvalidArxivShape)
);
}
// Non-ASCII characters are rejected.
#[test]
fn arxiv_parse_rejects_non_ascii() {
assert_eq!(
ArxivId::parse("2401.物理"),
Err(RefParseError::InvalidArxivShape)
);
}
// ---- Ref::parse: dispatch between DOI and arXiv ----

// A `doi:` prefix selects the DOI parser.
#[test]
fn ref_parse_dispatches_doi_scheme_to_doi() {
match Ref::parse("doi:10.1234/example").expect("doi: dispatched to Doi") {
Ref::Doi(d) => assert_eq!(d.as_str(), "10.1234/example"),
other => panic!("expected Ref::Doi, got {:?}", other),
}
}
// An `arxiv:` prefix selects the arXiv parser.
#[test]
fn ref_parse_dispatches_arxiv_scheme_to_arxiv() {
match Ref::parse("arxiv:2401.12345").expect("arxiv: dispatched to Arxiv") {
Ref::Arxiv(a) => assert_eq!(a.as_str(), "2401.12345"),
other => panic!("expected Ref::Arxiv, got {:?}", other),
}
}
// The `arXiv:` spelling dispatches the same way, here with an old-style id.
#[test]
fn ref_parse_dispatches_arxiv_mixed_case_scheme() {
match Ref::parse("arXiv:cond-mat/9501001").expect("arXiv: dispatched") {
Ref::Arxiv(a) => assert_eq!(a.as_str(), "cond-mat/9501001"),
other => panic!("expected Ref::Arxiv, got {:?}", other),
}
}
// Bare input starting with `10.` is treated as a DOI.
#[test]
fn ref_parse_bare_doi_resolves_to_doi() {
match Ref::parse("10.1234/foo").expect("bare DOI") {
Ref::Doi(d) => assert_eq!(d.as_str(), "10.1234/foo"),
other => panic!("expected Ref::Doi, got {:?}", other),
}
}
// Bare new-style arXiv input falls through to the arXiv parser.
#[test]
fn ref_parse_bare_arxiv_new_resolves_to_arxiv() {
match Ref::parse("2401.12345").expect("bare new-style arXiv") {
Ref::Arxiv(a) => assert_eq!(a.as_str(), "2401.12345"),
other => panic!("expected Ref::Arxiv, got {:?}", other),
}
}
// Bare old-style arXiv input falls through to the arXiv parser.
#[test]
fn ref_parse_bare_arxiv_old_resolves_to_arxiv() {
match Ref::parse("cond-mat/9501001").expect("bare old-style arXiv") {
Ref::Arxiv(a) => assert_eq!(a.as_str(), "cond-mat/9501001"),
other => panic!("expected Ref::Arxiv, got {:?}", other),
}
}
// ---- Ref::parse: error propagation ----

#[test]
fn ref_parse_rejects_empty() {
assert_eq!(Ref::parse(""), Err(RefParseError::Empty));
}
// With a `doi:` prefix, the DOI-specific error is surfaced.
#[test]
fn ref_parse_doi_scheme_with_invalid_doi_propagates_doi_error() {
assert_eq!(
Ref::parse("doi:10.1234"),
Err(RefParseError::MissingDoiSuffixSeparator)
);
}
// With an `arxiv:` prefix, the arXiv error is surfaced.
#[test]
fn ref_parse_arxiv_scheme_with_invalid_arxiv_propagates_arxiv_error() {
assert_eq!(
Ref::parse("arxiv:notanid"),
Err(RefParseError::InvalidArxivShape)
);
}
// Bare input starting with `10.` reports DOI errors, not arXiv ones.
#[test]
fn ref_parse_bare_with_10_prefix_uses_doi_errors() {
assert_eq!(
Ref::parse("10.12/x"),
Err(RefParseError::InvalidDoiRegistrant)
);
}
// Any other bare input reports the arXiv error.
#[test]
fn ref_parse_bare_without_10_prefix_uses_arxiv_errors() {
assert_eq!(Ref::parse("1.2.3"), Err(RefParseError::InvalidArxivShape));
}
// The suffix length limit also applies on the `doi:`-prefixed path.
#[test]
fn ref_parse_rejects_doi_scheme_with_oversized_suffix() {
let suffix = "a".repeat(DOI_SUFFIX_MAX_LEN + 5);
let input = format!("doi:10.1234/{}", suffix);
match Ref::parse(&input) {
Err(RefParseError::DoiSuffixTooLong { .. }) => {}
other => panic!("expected DoiSuffixTooLong, got {:?}", other),
}
}
// The serialized form carries the identifier without the `doi:` prefix.
#[test]
fn ref_parse_round_trip_via_serde_preserves_inner_string() {
let r = Ref::parse("doi:10.1234/example").expect("parse ok");
let json = serde_json::to_string(&r).expect("serialize");
assert!(
json.contains("10.1234/example") && !json.contains("doi:"),
"scheme leaked into wire form: {}",
json
);
}
// Every `RefParseError` converts to `ErrorCode::InvalidRef`.
#[test]
fn ref_parse_error_maps_to_invalid_ref_error_code() {
let err: ErrorCode = RefParseError::Empty.into();
assert_eq!(err, ErrorCode::InvalidRef);
let err2: ErrorCode = RefParseError::MissingDoiPrefix.into();
assert_eq!(err2, ErrorCode::InvalidRef);
}
// ---- DenialReason / DenialContext wire format ----

// Spot-checks the snake_case wire strings of three variants.
#[test]
fn denial_reason_serializes_snake_case() {
let s = serde_json::to_string(&DenialReason::RedirectNotInAllowlist).expect("ser");
assert_eq!(s, "\"redirect_not_in_allowlist\"");
let s = serde_json::to_string(&DenialReason::SizeCapExceeded).expect("ser");
assert_eq!(s, "\"size_cap_exceeded\"");
let s = serde_json::to_string(&DenialReason::ContentTypeMismatch).expect("ser");
assert_eq!(s, "\"content_type_mismatch\"");
}
// Every variant survives a serialize/deserialize round trip.
#[test]
fn denial_reason_round_trip_via_serde() {
for r in [
DenialReason::RedirectNotInAllowlist,
DenialReason::InsecureScheme,
DenialReason::HostInBlockList,
DenialReason::SizeCapExceeded,
DenialReason::SchemaDrift,
DenialReason::CapabilityNotGranted,
DenialReason::RateLimitWindow,
DenialReason::SsrfPrivateAddress,
DenialReason::ContentTypeMismatch,
] {
let s = serde_json::to_string(&r).expect("ser");
let back: DenialReason = serde_json::from_str(&s).expect("de");
assert_eq!(back, r, "round-trip mismatch for {:?} -> {}", r, s);
}
}
// A context with a mix of `Some` and `None` fields round-trips unchanged.
#[test]
fn denial_context_round_trips_full_shape() {
let dc = DenialContext {
reason: DenialReason::RedirectNotInAllowlist,
source: Some("crossref".to_string()),
attempted: Some("evil.example.com".to_string()),
expected: Some(vec![
"api.crossref.org".to_string(),
"*.crossref.org".to_string(),
]),
hop_index: Some(1),
cap: None,
actual: None,
};
let s = serde_json::to_string(&dc).expect("ser");
let back: DenialContext = serde_json::from_str(&s).expect("de");
assert_eq!(back, dc);
}
// `None` fields are omitted entirely; only `reason` is written.
#[test]
fn denial_context_serialize_elides_empty_fields() {
let dc = DenialContext {
reason: DenialReason::CapabilityNotGranted,
source: None,
attempted: None,
expected: None,
hop_index: None,
cap: None,
actual: None,
};
let s = serde_json::to_string(&dc).expect("ser");
assert_eq!(s, "{\"reason\":\"capability_not_granted\"}");
}
// `Some(vec![])` is distinct from `None`: it is written as `[]` and read
// back as `Some(vec![])`.
#[test]
fn denial_context_expected_some_empty_vec_preserves_explicit_empty_allowlist() {
let dc = DenialContext {
reason: DenialReason::RedirectNotInAllowlist,
source: Some("crossref".to_string()),
attempted: Some("evil.example.com".to_string()),
expected: Some(Vec::new()),
hop_index: None,
cap: None,
actual: None,
};
let s = serde_json::to_string(&dc).expect("ser");
assert!(
s.contains("\"expected\":[]"),
"expected:[] must survive on the wire (got: {s})"
);
let back: DenialContext = serde_json::from_str(&s).expect("de");
assert_eq!(back.expected, Some(Vec::new()));
}
// Optional fields missing from the input deserialize to `None`.
#[test]
fn denial_context_deserialize_tolerates_missing_optional_fields() {
let wire = r#"{"reason":"size_cap_exceeded","cap":104857600,"actual":209715200}"#;
let dc: DenialContext = serde_json::from_str(wire).expect("de");
assert_eq!(dc.reason, DenialReason::SizeCapExceeded);
assert_eq!(dc.cap, Some(104857600));
assert_eq!(dc.actual, Some(209715200));
assert!(dc.source.is_none());
assert!(dc.attempted.is_none());
assert!(dc.expected.is_none());
assert!(dc.hop_index.is_none());
}
// Pins the exact JSON of an error envelope built with `serde_json::json!`.
// The expected string lists object keys in alphabetical order, which is how
// the `json!` value serializes here.
#[test]
fn full_error_envelope_with_denial_context_serializes_to_pinned_json() {
let denial = DenialContext {
reason: DenialReason::RedirectNotInAllowlist,
source: Some("crossref".into()),
attempted: Some("evil.example.com".into()),
expected: Some(vec!["api.crossref.org".into(), "*.crossref.org".into()]),
hop_index: Some(1),
cap: None,
actual: None,
};
let envelope = serde_json::json!({
"ok": false,
"error": {
"code": ErrorCode::NetworkError,
"message": "redirect target evil.example.com not in allowlist for source crossref",
"denial_context": denial,
}
});
let actual = serde_json::to_string(&envelope).expect("serialize envelope");
let expected = r#"{"error":{"code":"NETWORK_ERROR","denial_context":{"attempted":"evil.example.com","expected":["api.crossref.org","*.crossref.org"],"hop_index":1,"reason":"redirect_not_in_allowlist","source":"crossref"},"message":"redirect target evil.example.com not in allowlist for source crossref"},"ok":false}"#;
assert_eq!(actual, expected);
}
// `deny_unknown_fields` makes an unexpected key a deserialization error.
#[test]
fn denial_context_rejects_unknown_fields() {
let wire = r#"{"reason":"capability_not_granted","banana":1}"#;
let result: Result<DenialContext, _> = serde_json::from_str(wire);
assert!(
result.is_err(),
"deny_unknown_fields must reject 'banana': {:?}",
result.map(|d| d.reason),
);
}
}