use std::sync::Arc;
use anyhow::{anyhow, Context, Result};
use camino::Utf8PathBuf;
#[cfg(feature = "citation")]
use doiget_core::http::tier_2_allowlist;
use doiget_core::http::{oa_publisher_allowlist, tier_1_allowlist, HttpClient};
use doiget_core::orchestrator::{fetch_paper as core_fetch_paper, FetchPaperOutcome, PdfLegStatus};
use doiget_core::provenance::{Capability, LogEvent, LogResult, ProvenanceLog, RowInput};
use doiget_core::rate_limiter::RateLimiter;
use doiget_core::source::{FetchContext, FetchError};
use doiget_core::store::FsStore;
use doiget_core::{CapabilityProfile, DenialContext, DenialReason, ErrorCode, RateLimits, Ref};
/// Returns a fresh session identifier: the string form of a newly generated ULID.
fn new_session_id() -> String {
    let id = ulid::Ulid::new();
    id.to_string()
}
pub use doiget_core::dry_run::{
build_dry_run_envelope, build_fetch_plan, FetchPlan, PdfSourcePlan, RateLimitBudget,
};
/// Builds the dry-run envelope for `ref_` and `plan`, serializes it to JSON and
/// prints it to stdout followed by a newline.
///
/// # Errors
///
/// Returns an error if the envelope cannot be serialized to JSON.
#[allow(clippy::print_stdout)]
pub fn emit_dry_run_plan_to_stdout(ref_: &Ref, plan: &FetchPlan) -> Result<()> {
    let json = serde_json::to_string(&build_dry_run_envelope(ref_, plan))
        .context("serializing dry-run envelope to JSON")?;
    println!("{json}");
    Ok(())
}
/// Resolves the path of the provenance log.
///
/// `DOIGET_LOG_PATH` wins when set; otherwise the path is
/// `<config dir>/doiget/access.jsonl`.
///
/// # Errors
///
/// Fails if `DOIGET_LOG_PATH` is not valid UTF-8 or the config directory
/// cannot be resolved.
fn resolve_log_path() -> Result<Utf8PathBuf> {
    match read_env_utf8("DOIGET_LOG_PATH")? {
        Some(explicit) => Ok(Utf8PathBuf::from(explicit)),
        None => {
            let base = config_dir_utf8()?;
            Ok(base.join("doiget").join("access.jsonl"))
        }
    }
}
/// Reads environment variable `key` as UTF-8 text.
///
/// Returns `Ok(None)` when the variable is not present.
///
/// # Errors
///
/// Fails when the variable is set but its value is not valid Unicode.
fn read_env_utf8(key: &str) -> Result<Option<String>> {
    use std::env::VarError;

    let lookup = std::env::var(key);
    match lookup {
        Err(VarError::NotPresent) => Ok(None),
        Err(VarError::NotUnicode(_)) => Err(anyhow!("{key} is not valid UTF-8")),
        Ok(value) => Ok(Some(value)),
    }
}
/// Resolves the user's home directory from the environment.
///
/// `HOME` is consulted first, then `USERPROFILE`; the first one that is set wins.
///
/// # Errors
///
/// Fails if a consulted variable is not valid UTF-8, or if neither is set.
fn home_dir_utf8() -> Result<Utf8PathBuf> {
    for key in ["HOME", "USERPROFILE"] {
        if let Some(value) = read_env_utf8(key)? {
            return Ok(Utf8PathBuf::from(value));
        }
    }
    Err(anyhow!("neither HOME nor USERPROFILE is set"))
}
/// Resolves the base configuration directory.
///
/// Lookup order: `XDG_CONFIG_HOME`, then `APPDATA`, then `<home>/.config`.
///
/// A variable that is set to the empty string is treated as unset. The XDG
/// Base Directory Specification requires this for `XDG_CONFIG_HOME`; accepting
/// an empty value would yield an empty path, and everything joined onto it
/// (e.g. `doiget/access.jsonl`) would silently resolve relative to the
/// current working directory.
///
/// # Errors
///
/// Fails if a consulted variable is not valid UTF-8, or if the fallback home
/// directory cannot be resolved.
pub(crate) fn config_dir_utf8() -> Result<Utf8PathBuf> {
    for key in ["XDG_CONFIG_HOME", "APPDATA"] {
        match read_env_utf8(key)? {
            Some(dir) if !dir.is_empty() => return Ok(Utf8PathBuf::from(dir)),
            // Unset or empty: fall through to the next candidate.
            _ => {}
        }
    }
    let home = home_dir_utf8()?;
    Ok(home.join(".config"))
}
/// Builds the [`HttpClient`] used for fetching.
///
/// The mode is selected by the `DOIGET_*_BASE` environment variables:
///
/// * **None set** — start from the tier-1 and OA-publisher allowlists (plus
///   tier-2 when the `citation` feature is enabled), merge any hosts loaded
///   from `<config dir>/doiget/config.toml`, and build via `HttpClient::new`.
///   Failure to resolve the config dir or to load the file is logged and the
///   curated allowlists are used as-is.
/// * **Any set** — the host of each override URL is registered for its source
///   and the client is built by `HttpClient::new_for_tests_allow_http_multi`.
///
/// # Errors
///
/// Fails if an override is not a parseable URL or has no host, or if
/// `HttpClient::new` fails.
fn build_http_client() -> Result<HttpClient> {
    // Source name paired with its optional base-URL override. `.ok()` maps
    // both "unset" and "not valid Unicode" to `None`.
    let overrides = [
        ("arxiv", std::env::var("DOIGET_ARXIV_BASE").ok()),
        ("crossref", std::env::var("DOIGET_CROSSREF_BASE").ok()),
        ("unpaywall", std::env::var("DOIGET_UNPAYWALL_BASE").ok()),
        ("oa-publisher", std::env::var("DOIGET_OA_PUBLISHER_BASE").ok()),
        ("openalex", std::env::var("DOIGET_OPENALEX_BASE").ok()),
    ];

    if overrides.iter().any(|(_, base)| base.is_some()) {
        // Override mode: register only the hosts named by the overrides.
        let mut hosts: Vec<(&str, String)> = Vec::new();
        for (source, base) in &overrides {
            if let Some(b) = base.as_deref() {
                let parsed = url::Url::parse(b)
                    .with_context(|| format!("DOIGET_*_BASE for {source} is not a URL: {b}"))?;
                let host = parsed
                    .host_str()
                    .ok_or_else(|| anyhow!("base URL has no host: {b}"))?;
                hosts.push((*source, host.to_string()));
            }
        }
        let entries: Vec<(&str, &str)> = hosts
            .iter()
            .map(|(source, host)| (*source, host.as_str()))
            .collect();
        return Ok(HttpClient::new_for_tests_allow_http_multi(&entries));
    }

    // Default mode: curated allowlists, optionally widened by the user's config.
    let mut allowlists = tier_1_allowlist();
    allowlists.extend(oa_publisher_allowlist());
    #[cfg(feature = "citation")]
    allowlists.extend(tier_2_allowlist());

    match config_dir_utf8().map(|dir| dir.join("doiget").join("config.toml")) {
        Err(e) => {
            tracing::debug!(
                error = %e,
                "config dir unresolvable; \
                 user-extension allowlist disabled (curated set only)"
            );
        }
        Ok(path) => match doiget_core::user_extension::load(&path) {
            Err(e) => {
                tracing::warn!(
                    error = %e,
                    path = %path,
                    "failed to load user-extension allowlist; \
                     falling back to curated set only"
                );
            }
            // Nothing configured: keep the curated set untouched.
            Ok(user_hosts) if user_hosts.is_empty() => {}
            Ok(user_hosts) => {
                tracing::info!(
                    count = user_hosts.len(),
                    path = %path,
                    "merging user-extension allowlist hosts (ADR-0028 D2)"
                );
                doiget_core::user_extension::merge_into_allowlists(&mut allowlists, &user_hosts);
            }
        },
    }

    HttpClient::new(allowlists).context("building HTTP client")
}
/// Environment-derived settings for a fetch session, populated by
/// `OrchestratorConfig::from_env`.
#[allow(dead_code)]
pub(crate) struct OrchestratorConfig {
/// Root directory of the store, as returned by `super::resolve_store_root()`.
pub(crate) store_root: Utf8PathBuf,
/// Path of the provenance log, as returned by `resolve_log_path()`.
pub(crate) log_path: Utf8PathBuf,
/// Value of `DOIGET_CONTACT_EMAIL`, or `doiget@localhost` when that cannot be read.
pub(crate) contact_email: String,
/// Value of `DOIGET_UNPAYWALL_EMAIL`, or a copy of `contact_email` when that cannot be read.
pub(crate) unpaywall_email: String,
}
impl OrchestratorConfig {
    /// Assembles the configuration from the process environment.
    ///
    /// The store root and log path are resolved first, so their errors take
    /// precedence. The e-mail settings never fail: a missing or unreadable
    /// `DOIGET_CONTACT_EMAIL` becomes `doiget@localhost`, and a missing or
    /// unreadable `DOIGET_UNPAYWALL_EMAIL` becomes the contact e-mail.
    ///
    /// # Errors
    ///
    /// Fails if the store root or the log path cannot be resolved.
    fn from_env() -> Result<Self> {
        let store_root = super::resolve_store_root()?;
        let log_path = resolve_log_path()?;

        let contact_email = match std::env::var("DOIGET_CONTACT_EMAIL") {
            Ok(addr) => addr,
            Err(_) => String::from("doiget@localhost"),
        };
        let unpaywall_email = match std::env::var("DOIGET_UNPAYWALL_EMAIL") {
            Ok(addr) => addr,
            Err(_) => contact_email.clone(),
        };

        Ok(Self {
            store_root,
            log_path,
            contact_email,
            unpaywall_email,
        })
    }
}
/// Bundle of everything one fetch session needs: HTTP client, rate limiter,
/// provenance log, store, capability profile, and the session identifier.
pub(crate) struct FetchHarness {
/// Shared HTTP client (built by `build_http_client` in `from_env`).
pub(crate) http: Arc<HttpClient>,
/// Shared rate limiter (constructed from `RateLimits::HARD_CODED` in `from_env`).
pub(crate) rate_limiter: Arc<RateLimiter>,
/// Shared provenance log, opened at `cfg.log_path` for this session.
pub(crate) log: Arc<ProvenanceLog>,
/// Filesystem store rooted at `cfg.store_root`.
pub(crate) store: FsStore,
/// Capability profile resolved via `CapabilityProfile::from_env`.
pub(crate) profile: CapabilityProfile,
/// Identifier for this session; also handed to the provenance log when it is opened.
pub(crate) session_id: String,
/// The configuration this harness was built from.
#[allow(dead_code)]
pub(crate) cfg: OrchestratorConfig,
}
impl FetchHarness {
/// Builds a harness from the process environment.
///
/// Steps, in order: load the configuration, create the log's parent
/// directory if it has a non-empty one, generate a session id, open the
/// provenance log, build the HTTP client, create the rate limiter, open the
/// store, and resolve the capability profile.
///
/// # Errors
///
/// Propagates the first failure among those steps.
pub(crate) fn from_env() -> Result<Self> {
    let cfg = OrchestratorConfig::from_env()?;

    // A bare file name has an empty parent; there is nothing to create then.
    let log_dir = cfg.log_path.parent().filter(|dir| !dir.as_str().is_empty());
    if let Some(dir) = log_dir {
        std::fs::create_dir_all(dir.as_std_path())
            .with_context(|| format!("creating log dir {dir}"))?;
    }

    let session_id = new_session_id();
    let log = ProvenanceLog::open(cfg.log_path.clone(), session_id.clone())
        .context("opening provenance log")?;
    let http = build_http_client()?;
    let rate_limiter = RateLimiter::new(RateLimits::HARD_CODED);
    let store = FsStore::new(cfg.store_root.clone()).context("opening store")?;
    let profile = CapabilityProfile::from_env().context("resolving capability profile")?;

    Ok(Self {
        http: Arc::new(http),
        rate_limiter: Arc::new(rate_limiter),
        log: Arc::new(log),
        store,
        profile,
        session_id,
        cfg,
    })
}
pub(crate) fn fetch_context(&self) -> FetchContext {
FetchContext {
http: self.http.clone(),
rate_limiter: self.rate_limiter.clone(),
log: self.log.clone(),
session_id: self.session_id.clone(),
}
}
/// Appends a `SessionStart` row (result `Ok`, capability `Oa`) to the
/// provenance log, tagged with `ref_input` when given.
///
/// # Errors
///
/// Fails if the row cannot be appended.
pub(crate) fn log_session_start(&self, ref_input: Option<&str>) -> Result<()> {
    let row = RowInput {
        event: LogEvent::SessionStart,
        result: LogResult::Ok,
        capability: Capability::Oa,
        ref_: ref_input,
        source: None,
        error_code: None,
        size_bytes: None,
        license: None,
        store_path: None,
        canonical_digest: None,
    };
    self.log.append(row).context("appending SessionStart row")?;
    Ok(())
}
pub(crate) fn log_session_end(&self, ok: bool, ref_input: Option<&str>) {
let result = if ok { LogResult::Ok } else { LogResult::Err };
let _ = self.log.append(RowInput {
event: LogEvent::SessionEnd,
result,
capability: Capability::Oa,
ref_: ref_input,
source: None,
error_code: None,
size_bytes: None,
license: None,
store_path: None,
canonical_digest: None,
});
}
/// Fetches a single paper by delegating to the core orchestrator with this
/// harness's profile, a fresh fetch context, and its store and store root.
pub(crate) async fn fetch_one(&self, ref_: &Ref) -> Result<FetchPaperOutcome, FetchError> {
    let store = &self.store;
    let ctx = self.fetch_context();
    core_fetch_paper(ref_, &self.profile, &ctx, store, store.root()).await
}
}
/// Returns `true` unless the outcome's PDF leg is `Blocked`.
pub(crate) fn outcome_is_clean_success(outcome: &FetchPaperOutcome) -> bool {
    let blocked = matches!(outcome.pdf_leg, PdfLegStatus::Blocked { .. });
    !blocked
}
/// Prints the one-line human-readable summary for a completed fetch.
///
/// * `Fetched` — label, byte count and path.
/// * `NoOaUrl` — metadata-only note explaining no OA PDF was available.
/// * `Blocked` — delegates to `render_blocked_error` with the effective code.
/// * any other status — metadata-only wording when `size_bytes` is zero,
///   otherwise the same line as `Fetched`.
fn emit_success_line(ref_: &Ref, outcome: &FetchPaperOutcome) {
    let label = match ref_ {
        Ref::Arxiv(id) => format!("arxiv:{}", id.as_str()),
        Ref::Doi(doi) => format!("doi:{}", doi.as_str()),
    };
    // Shared by the `Fetched` arm and the non-empty fallback arm.
    let with_size = || {
        format!(
            "fetched {} ({} bytes) -> {}",
            label, outcome.size_bytes, outcome.path
        )
    };

    let line = match &outcome.pdf_leg {
        PdfLegStatus::Blocked {
            code,
            message,
            denial,
        } => {
            let effective = effective_blocked_code(*code, denial.as_ref());
            render_blocked_error(ref_, outcome, effective, message, denial.as_ref());
            return;
        }
        PdfLegStatus::Fetched => with_size(),
        PdfLegStatus::NoOaUrl => format!(
            "fetched {} (metadata-only: no OA PDF available) -> {}",
            label, outcome.path
        ),
        _ if outcome.size_bytes == 0 => {
            format!("fetched {} (metadata-only) -> {}", label, outcome.path)
        }
        _ => with_size(),
    };
    print_success(format_args!("{line}"));
}
pub async fn run_with_options(
input: String,
dry_run: bool,
_mode: super::output::OutputMode,
) -> Result<()> {
let ref_ = match Ref::parse(&input) {
Ok(r) => r,
Err(e) => {
print_err(format_args!(
"error[{}]: invalid ref: {e}",
ErrorCode::InvalidRef.as_wire()
));
return Err(anyhow::Error::new(CliExit(cli_exit_code(
ErrorCode::InvalidRef,
))));
}
};
if dry_run {
let store_root = super::resolve_store_root()?;
let plan = build_fetch_plan(&ref_, &store_root);
emit_dry_run_plan_to_stdout(&ref_, &plan)?;
return Ok(());
}
let harness = FetchHarness::from_env()?;
harness.log_session_start(Some(ref_.as_input_str()))?;
let result = harness.fetch_one(&ref_).await;
let session_ok = match &result {
Ok(o) => outcome_is_clean_success(o),
Err(_) => false,
};
harness.log_session_end(session_ok, Some(ref_.as_input_str()));
match result {
Ok(outcome) => {
if let PdfLegStatus::Blocked {
code,
message,
denial,
} = &outcome.pdf_leg
{
let effective = effective_blocked_code(*code, denial.as_ref());
render_blocked_error(&ref_, &outcome, effective, message, denial.as_ref());
return Err(anyhow::Error::new(CliExit(cli_exit_code(effective))));
}
emit_success_line(&ref_, &outcome);
Ok(())
}
Err(e) => {
render_fetch_error(&e);
let code: ErrorCode = (&e).into();
Err(anyhow::Error::new(CliExit(cli_exit_code(code))))
}
}
}
/// Writes a success message, followed by a newline, to stderr.
#[allow(clippy::print_stderr)]
fn print_success(args: std::fmt::Arguments<'_>) {
    eprintln!("{}", args);
}
/// Writes an error or note line, followed by a newline, to stderr.
#[allow(clippy::print_stderr)]
fn print_err(args: std::fmt::Arguments<'_>) {
    eprintln!("{}", args);
}
/// Error value carrying the process exit status the CLI should terminate with.
#[derive(Debug)]
pub struct CliExit(pub i32);

impl std::fmt::Display for CliExit {
    /// Renders as `exiting with status <code>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let CliExit(status) = self;
        write!(f, "exiting with status {status}")
    }
}

impl std::error::Error for CliExit {}
/// Picks the error code to report for a blocked PDF leg.
///
/// When the denial reason is `RedirectNotInAllowlist`, `InsecureScheme` or
/// `HostInBlockList`, the result is `ErrorCode::CapabilityDenied` regardless
/// of `code`. For any other reason, or when no denial context is present,
/// `code` is returned unchanged.
pub(crate) fn effective_blocked_code(code: ErrorCode, denial: Option<&DenialContext>) -> ErrorCode {
    let is_policy_denial = matches!(
        denial.map(|d| d.reason),
        Some(
            DenialReason::RedirectNotInAllowlist
                | DenialReason::InsecureScheme
                | DenialReason::HostInBlockList
        )
    );
    if is_policy_denial {
        ErrorCode::CapabilityDenied
    } else {
        code
    }
}
/// Maps a denial reason to the snake_case token shown in CLI error messages.
///
/// Reasons without a dedicated token map to `policy_denied`.
fn denial_reason_wire(reason: DenialReason) -> &'static str {
    let token = match reason {
        DenialReason::HostInBlockList => "host_in_block_list",
        DenialReason::InsecureScheme => "insecure_scheme",
        DenialReason::RedirectNotInAllowlist => "redirect_not_in_allowlist",
        _ => "policy_denied",
    };
    token
}
/// Maps an error code to the process exit status.
///
/// `CapabilityDenied` → 3, `StoreError` and `LogError` → 4,
/// `FetchTimeout` → 124, everything else → 1.
pub(crate) fn cli_exit_code(code: ErrorCode) -> i32 {
    match code {
        ErrorCode::FetchTimeout => 124,
        ErrorCode::StoreError => 4,
        ErrorCode::LogError => 4,
        ErrorCode::CapabilityDenied => 3,
        _ => 1,
    }
}
/// Prints a fetch error to stderr as `error[<wire code>]: <error>`.
///
/// When the error converts into a denial context, a `= note:` line follows,
/// naming the attempted target (`(unknown)` if absent) and, when the expected
/// list is present and non-empty, the allowed entries joined by `, `.
fn render_fetch_error(e: &FetchError) {
    let code: ErrorCode = e.into();
    print_err(format_args!("error[{}]: {}", code.as_wire(), e));

    let denial = Option::<DenialContext>::from(e);
    if let Some(dc) = denial {
        let attempted = dc.attempted.as_deref().unwrap_or("(unknown)");
        let allowed = dc.expected.as_ref().filter(|hosts| !hosts.is_empty());
        if let Some(hosts) = allowed {
            print_err(format_args!(
                "  = note: attempted {attempted}; allowed: {}",
                hosts.join(", ")
            ));
        } else {
            print_err(format_args!("  = note: attempted {attempted}"));
        }
    }
}
/// Prints the stderr diagnostics for a fetch whose PDF leg was blocked.
///
/// Output, in order:
/// 1. An `error[<wire code>]` headline. For the reasons
///    `RedirectNotInAllowlist`, `InsecureScheme` and `HostInBlockList` it says
///    the host is blocked by supply-chain policy and names the reason token;
///    otherwise it says the PDF could not be retrieved.
/// 2. If a denial context is present, a note with the attempted target
///    (`(unknown)` if absent) and, when non-empty, the allowed entries.
/// 3. A note giving the path of the metadata-only record.
fn render_blocked_error(
    ref_: &Ref,
    outcome: &FetchPaperOutcome,
    code: ErrorCode,
    message: &str,
    denial: Option<&DenialContext>,
) {
    let label = match ref_ {
        Ref::Arxiv(id) => format!("arxiv:{}", id.as_str()),
        Ref::Doi(doi) => format!("doi:{}", doi.as_str()),
    };
    let wire = code.as_wire();

    // `Some` only for the reasons that get the supply-chain-policy wording.
    let policy_reason = denial.map(|d| d.reason).filter(|r| {
        matches!(
            r,
            DenialReason::RedirectNotInAllowlist
                | DenialReason::InsecureScheme
                | DenialReason::HostInBlockList
        )
    });
    match policy_reason {
        Some(reason) => print_err(format_args!(
            "error[{}]: {label}: an OA PDF was found but its host is blocked by \
             supply-chain policy ({}): {message}",
            wire,
            denial_reason_wire(reason)
        )),
        None => print_err(format_args!(
            "error[{}]: {label}: an OA PDF was found but could not be retrieved: {message}",
            wire
        )),
    }

    if let Some(dc) = denial {
        let attempted = dc.attempted.as_deref().unwrap_or("(unknown)");
        let allowed = dc.expected.as_ref().filter(|hosts| !hosts.is_empty());
        if let Some(hosts) = allowed {
            print_err(format_args!(
                "  = note: attempted {attempted}; allowed: {}",
                hosts.join(", ")
            ));
        } else {
            print_err(format_args!("  = note: attempted {attempted}"));
        }
    }

    print_err(format_args!(
        "  = note: metadata-only record written to {}",
        outcome.path
    ));
}
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
use super::*;
use serial_test::serial;
/// A session id is 26 characters long and purely ASCII alphanumeric.
#[test]
fn new_session_id_is_26_chars() {
    let id = new_session_id();
    assert_eq!(id.len(), 26, "session id must be 26 chars: {:?}", id);
    let offending = id.chars().find(|c| !c.is_ascii_alphanumeric());
    assert!(
        offending.is_none(),
        "ulid must be ASCII alphanumeric: {:?}",
        id
    );
}
/// With no `DOIGET_*_BASE` override set, hosts listed in
/// `<config dir>/doiget/config.toml` are merged into the `oa-publisher`
/// allowlist without displacing the curated entries.
#[test]
#[serial]
fn build_http_client_merges_user_extension_into_oa_publisher_allowlist() {
    use std::io::Write;
    let td = tempfile::TempDir::new().expect("tempdir");
    let cfg_dir = td.path().join("doiget");
    std::fs::create_dir_all(&cfg_dir).expect("mkdir doiget/");
    let cfg_path = cfg_dir.join("config.toml");
    let mut f = std::fs::File::create(&cfg_path).expect("create config.toml");
    f.write_all(
        br#"
[[network.additional_hosts]]
host = "ruj.uj.edu.pl"
note = "Jagiellonian"
[[network.additional_hosts]]
host = "*.uj.edu.pl"
"#,
    )
    .expect("write config.toml");
    drop(f);

    /// Snapshot of one environment variable, restored when the guard drops.
    ///
    /// The value is captured with `var_os` so that a pre-existing value that
    /// is not valid Unicode is restored verbatim instead of being mistaken
    /// for "unset" and removed.
    struct EnvGuard {
        key: &'static str,
        prev: Option<std::ffi::OsString>,
    }
    impl EnvGuard {
        fn save(key: &'static str) -> Self {
            Self {
                key,
                prev: std::env::var_os(key),
            }
        }
    }
    impl Drop for EnvGuard {
        fn drop(&mut self) {
            match &self.prev {
                Some(v) => std::env::set_var(self.key, v),
                None => std::env::remove_var(self.key),
            }
        }
    }

    // Snapshot every variable the test touches BEFORE mutating any of them.
    // The guards are declared after `td`, so they drop (and restore the
    // environment) before the temp dir is removed.
    let _g0 = EnvGuard::save("XDG_CONFIG_HOME");
    let _g1 = EnvGuard::save("APPDATA");
    let _g2 = EnvGuard::save("HOME");
    let _g3 = EnvGuard::save("USERPROFILE");
    let _g4 = EnvGuard::save("DOIGET_ARXIV_BASE");
    let _g5 = EnvGuard::save("DOIGET_CROSSREF_BASE");
    let _g6 = EnvGuard::save("DOIGET_UNPAYWALL_BASE");
    let _g7 = EnvGuard::save("DOIGET_OA_PUBLISHER_BASE");
    let _g8 = EnvGuard::save("DOIGET_OPENALEX_BASE");

    // Point every config-dir candidate at the temp dir.
    std::env::set_var("XDG_CONFIG_HOME", td.path());
    std::env::set_var("APPDATA", td.path());
    std::env::set_var("HOME", td.path());
    std::env::set_var("USERPROFILE", td.path());
    // Any override would switch `build_http_client` to its override mode.
    std::env::remove_var("DOIGET_ARXIV_BASE");
    std::env::remove_var("DOIGET_CROSSREF_BASE");
    std::env::remove_var("DOIGET_UNPAYWALL_BASE");
    std::env::remove_var("DOIGET_OA_PUBLISHER_BASE");
    std::env::remove_var("DOIGET_OPENALEX_BASE");

    let client = build_http_client().expect("HttpClient builds");
    let oa = client
        .source_allowlist("oa-publisher")
        .expect("oa-publisher source registered");
    assert!(
        oa.redirect_hosts.iter().any(|p| p == "*.aps.org"),
        "curated *.aps.org MUST still be present after merge; got {:?}",
        oa.redirect_hosts
    );
    assert!(
        oa.matches("ruj.uj.edu.pl"),
        "literal `ruj.uj.edu.pl` from user config MUST match"
    );
    assert!(
        oa.matches("alpha.uj.edu.pl"),
        "wildcard `*.uj.edu.pl` from user config MUST match alpha.uj.edu.pl"
    );
    assert!(
        !oa.matches("ruj.uj.edu.ru"),
        "host outside the suffix MUST NOT match"
    );
}
/// Compile-time check that `FetchPaperOutcome` is nameable by its full path.
#[test]
fn fetch_paper_outcome_is_reachable_from_cli() {
    type Outcome = doiget_core::orchestrator::FetchPaperOutcome;
    let _ = std::any::type_name::<Outcome>();
}
/// Test helper: a `DenialContext` carrying only `reason`; every optional
/// field is `None`.
fn denial(reason: DenialReason) -> DenialContext {
    let bare = DenialContext {
        source: None,
        attempted: None,
        expected: None,
        hop_index: None,
        cap: None,
        actual: None,
        reason,
    };
    bare
}
/// Each policy denial reason promotes `NetworkError` to `CapabilityDenied`,
/// which in turn maps to exit status 3.
#[test]
fn policy_denials_reclassify_network_error_to_capability_denied() {
    let policy_reasons = [
        DenialReason::RedirectNotInAllowlist,
        DenialReason::InsecureScheme,
        DenialReason::HostInBlockList,
    ];
    for r in policy_reasons {
        let d = denial(r);
        let effective = effective_blocked_code(ErrorCode::NetworkError, Some(&d));
        assert_eq!(
            effective,
            ErrorCode::CapabilityDenied,
            "policy reason {r:?} must promote NetworkError -> CapabilityDenied"
        );
        assert_eq!(
            cli_exit_code(effective),
            3,
            "policy reason {r:?} must map to exit 3 (docs/ERRORS.md §4/§6.1)"
        );
    }
}
/// Without a denial context the core code is kept, and `NetworkError` maps
/// to exit status 1.
#[test]
fn absent_denial_context_keeps_network_error() {
    let effective = effective_blocked_code(ErrorCode::NetworkError, None);
    assert_eq!(effective, ErrorCode::NetworkError);
    assert_eq!(cli_exit_code(effective), 1);
}
/// Denial reasons outside the policy set leave the core error code untouched.
#[test]
fn non_policy_denials_keep_core_code() {
    let non_policy_reasons = [
        DenialReason::SizeCapExceeded,
        DenialReason::ContentTypeMismatch,
    ];
    for r in non_policy_reasons {
        let d = denial(r);
        let effective = effective_blocked_code(ErrorCode::NetworkError, Some(&d));
        assert_eq!(
            effective,
            ErrorCode::NetworkError,
            "non-policy reason {r:?} must NOT be reclassified"
        );
    }
}
/// The CLI token for each policy denial reason equals the reason's serde
/// JSON form with the surrounding quotes removed.
#[test]
fn denial_reason_wire_matches_serde_snake_case() {
    let policy_reasons = [
        DenialReason::RedirectNotInAllowlist,
        DenialReason::InsecureScheme,
        DenialReason::HostInBlockList,
    ];
    for r in policy_reasons {
        let quoted = serde_json::to_string(&r).expect("serialize DenialReason");
        let expected_token = quoted.trim_matches('"');
        assert_eq!(
            denial_reason_wire(r),
            expected_token,
            "CLI wire token for {r:?} must equal the serde snake_case form"
        );
    }
}
}