use camino::{Utf8Path, Utf8PathBuf};
use chrono::{DateTime, Duration, Utc};
use quick_xml::events::Event;
use quick_xml::Reader;
use serde::{Deserialize, Serialize};
use url::Url;
use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
use crate::source::{FetchContext, FetchError};
use crate::{ArxivId, Ref};
/// Key identifying this source; passed to the rate limiter, the HTTP client
/// and the provenance log.
const SOURCE_KEY: &str = "ar5iv";
/// Default ar5iv base URL.
pub const AR5IV_DEFAULT_BASE: &str = "https://ar5iv.labs.arxiv.org";
/// Lifetime of a text cache entry in days; converted to seconds when an entry
/// is written.
const TEXT_CACHE_TTL_DAYS: i64 = 30;
/// Schema version string stamped into every text cache entry written by this
/// module.
const TEXT_CACHE_SCHEMA_VERSION: &str = "1.0";
/// Origin of a paper's extracted text.
///
/// Serialized as a lowercase string (e.g. `"ar5iv"`). The enum is
/// `#[non_exhaustive]`, so further variants can be added later.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
#[non_exhaustive]
pub enum TextSource {
/// Text extracted from an ar5iv HTML page.
Ar5iv,
}
/// One section of extracted paper text.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct TextSection {
/// Heading text with whitespace collapsed, or `None` for text that precedes
/// the first heading (or when the heading had no text).
pub heading: Option<String>,
/// Body text of the section; whitespace is collapsed by the parser.
pub text: String,
}
/// Extracted text of a paper together with its provenance.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct PaperText {
/// The arXiv identifier, as a string.
pub arxiv_id: String,
/// Where the text was extracted from.
pub source: TextSource,
/// Document title taken from the first non-empty `<title>` element, if any.
pub title: Option<String>,
/// Sections in document order.
pub sections: Vec<TextSection>,
/// Number of characters (`char`s) summed over the `text` of all sections;
/// headings and the title are not counted.
pub char_count: usize,
/// `true` when a `max_chars` limit caused text or sections to be dropped.
pub truncated: bool,
/// URL returned by the HTTP client alongside the response body.
pub retrieved_from: String,
}
/// Returns the extracted text of an arXiv paper, optionally capped at
/// `max_chars` characters of section text.
///
/// When `ctx.cache_root` is configured and holds a readable, unexpired entry
/// for `id`, that entry is returned without any network access. Otherwise the
/// paper is fetched from `base` and parsed, and the full text is written to
/// the cache on a best-effort basis. The cap is applied after caching, so the
/// cache always holds the untruncated text.
///
/// # Errors
///
/// Propagates any [`FetchError`] from fetching or parsing. A failed cache
/// write is not an error.
pub async fn paper_text(
    base: &Url,
    id: &ArxivId,
    max_chars: Option<usize>,
    ctx: &FetchContext,
) -> Result<PaperText, FetchError> {
    let cache_root = ctx.cache_root.as_ref();

    // Fast path: serve a still-valid cached copy.
    if let Some(cached) = cache_root.and_then(|root| cache_read(root, id)) {
        return Ok(apply_max_chars(cached, max_chars));
    }

    let fetched = fetch_and_parse(base, id, ctx).await?;
    if let Some(root) = cache_root {
        // Best effort: the write reports success as a bool that is ignored here.
        cache_write(root, id, &fetched);
    }
    Ok(apply_max_chars(fetched, max_chars))
}
/// Downloads the ar5iv page for `id`, extracts its text and records a
/// provenance row for the fetch.
///
/// The rate-limiter permit for this source is acquired first and held until
/// the function returns.
///
/// # Errors
///
/// * Any error from URL construction, the HTTP fetch, parsing, or appending
///   to the provenance log is propagated.
/// * [`FetchError::NotFound`] when the page yields neither a title nor any
///   section.
async fn fetch_and_parse(
    base: &Url,
    id: &ArxivId,
    ctx: &FetchContext,
) -> Result<PaperText, FetchError> {
    let _permit = ctx.rate_limiter.acquire(SOURCE_KEY).await;

    let request_url = ar5iv_url(base, id)?;
    let (body, final_url) = ctx.http.fetch_bytes(SOURCE_KEY, request_url).await?;
    let (title, sections) = parse_ar5iv(&body)?;

    // A page with neither a title nor any section carries nothing usable.
    if title.is_none() && sections.is_empty() {
        let hint = format!(
            "ar5iv returned no extractable text for {} (paper may not be converted to HTML)",
            id.as_str()
        );
        return Err(FetchError::NotFound { hint });
    }

    let canonical = Ref::Arxiv(id.clone())
        .promote(SOURCE_KEY, None)
        .digest_hex();
    let row = RowInput {
        event: LogEvent::Fetch,
        result: LogResult::Ok,
        capability: Capability::Oa,
        ref_: Some(id.as_str()),
        source: Some(SOURCE_KEY),
        error_code: None,
        size_bytes: Some(body.len() as u64),
        license: Some("arxiv-default"),
        store_path: None,
        canonical_digest: Some(&canonical),
    };
    ctx.log.append(row)?;

    // Only section bodies count towards the character total.
    let char_count: usize = sections
        .iter()
        .map(|section| section.text.chars().count())
        .sum();

    Ok(PaperText {
        arxiv_id: id.as_str().to_string(),
        source: TextSource::Ar5iv,
        title,
        sections,
        char_count,
        truncated: false,
        retrieved_from: final_url.to_string(),
    })
}
/// Builds the ar5iv page URL for `id` by joining `/html/<id>` onto `base`.
///
/// # Errors
///
/// Returns [`FetchError::SourceSchema`] when the join fails.
fn ar5iv_url(base: &Url, id: &ArxivId) -> Result<Url, FetchError> {
    let page_path = format!("/html/{}", id.as_str());
    match base.join(&page_path) {
        Ok(url) => Ok(url),
        Err(e) => Err(FetchError::SourceSchema {
            hint: format!("ar5iv URL construction failed: {e}"),
        }),
    }
}
/// Caps the section text of `full` at `max_chars` characters.
///
/// With `None` the input is returned unchanged. Otherwise sections are kept
/// in order until the character budget is used up: the section that crosses
/// the limit is cut at a `char` boundary, every later section is dropped, and
/// `truncated` is set whenever anything was cut or dropped. `char_count` is
/// recomputed as the number of characters kept. Headings and the title do not
/// count towards the budget.
fn apply_max_chars(full: PaperText, max_chars: Option<usize>) -> PaperText {
    let max = match max_chars {
        Some(limit) => limit,
        None => return full,
    };

    let PaperText {
        arxiv_id,
        source,
        title,
        sections,
        retrieved_from,
        ..
    } = full;

    let mut kept: Vec<TextSection> = Vec::new();
    // Characters still available; counts down from `max`.
    let mut budget = max;
    let mut truncated = false;

    for mut section in sections {
        if budget == 0 {
            // At least one more section exists but nothing more fits.
            truncated = true;
            break;
        }
        // `nth(budget)` yields the first character that does NOT fit, if any.
        match section.text.char_indices().nth(budget) {
            None => {
                budget -= section.text.chars().count();
                kept.push(section);
            }
            Some((cut_at, _)) => {
                // `cut_at` is a char boundary, so truncating here is safe.
                section.text.truncate(cut_at);
                budget = 0;
                kept.push(section);
                truncated = true;
                break;
            }
        }
    }

    PaperText {
        arxiv_id,
        source,
        title,
        sections: kept,
        char_count: max - budget,
        truncated,
        retrieved_from,
    }
}
/// Reports whether `local` (an element's local name) is one whose contents
/// are excluded from the extracted text: `script`, `style` or `math`.
fn is_skip_element(local: &[u8]) -> bool {
    const SKIPPED: [&[u8]; 3] = [b"script", b"style", b"math"];
    SKIPPED.iter().any(|name| *name == local)
}
/// Returns the heading level (1 to 6) when `local` is exactly `h1` through
/// `h6`, and `None` for any other name.
fn heading_level(local: &[u8]) -> Option<u8> {
    match local {
        [b'h', digit @ b'1'..=b'6'] => Some(*digit - b'0'),
        _ => None,
    }
}
/// Returns the value of the first `alttext` attribute on `e` that decodes
/// successfully and is not blank.
///
/// Malformed attributes and `alttext` values that fail to normalize are
/// skipped rather than reported.
fn extract_alttext(e: &quick_xml::events::BytesStart<'_>) -> Option<String> {
    e.attributes()
        .flatten()
        .filter(|attr| attr.key.as_ref() == b"alttext")
        .find_map(|attr| {
            let value = attr
                .normalized_value(quick_xml::XmlVersion::Explicit1_0)
                .ok()?;
            let text = value.into_owned();
            (!text.trim().is_empty()).then_some(text)
        })
}
/// Collapses every run of whitespace in `s` to a single space and strips
/// leading and trailing whitespace.
fn normalize(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // Words are never empty, so an empty buffer means this is the first one.
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
/// Extracts the title and heading-delimited text sections from an ar5iv page.
///
/// The document is read as a stream of XML events:
///
/// * Text inside `script`, `style` and `math` elements is skipped; a `math`
///   element's `alttext` attribute, when present, is emitted as `\(alttext\)`.
/// * Each `h1`..`h6` element closes the section collected so far and starts a
///   new one whose heading is the element's text.
/// * The first non-empty `title` element becomes the document title.
/// * Character references and predefined entity references (for example
///   `&amp;` or `&#8217;`) are resolved and emitted like text. The reader
///   reports them as separate `GeneralRef` events, so they are handled in
///   their own arm instead of being silently dropped.
///
/// Parsing is best effort: end-tag names are not checked, and a reader error
/// stops parsing and returns whatever was collected up to that point.
///
/// # Errors
///
/// The current implementation never returns `Err`; the `Result` is kept for
/// the callers' signature.
fn parse_ar5iv(html: &[u8]) -> Result<(Option<String>, Vec<TextSection>), FetchError> {
    let mut reader = Reader::from_reader(html);
    let config = reader.config_mut();
    config.trim_text(true);
    // Real-world pages contain mismatched tags; do not abort on them.
    config.check_end_names = false;

    let mut sections: Vec<TextSection> = Vec::new();
    let mut cur_heading: Option<String> = None;
    let mut cur_text = String::new();
    let mut title: Option<String> = None;
    let mut title_buf = String::new();
    let mut in_title = false;
    // Nesting depth of skipped elements; text is collected only at depth 0.
    let mut skip: u32 = 0;
    // Level of the heading currently being read, or 0 outside a heading.
    let mut in_heading: u8 = 0;
    let mut heading_buf = String::new();
    let mut buf: Vec<u8> = Vec::new();

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(e)) => {
                let name = e.name();
                let local = local_name(name.as_ref());
                if is_skip_element(local) {
                    // Only an outermost `math` contributes its alt text.
                    if skip == 0 && local == b"math" {
                        if let Some(alt) = extract_alttext(&e) {
                            let frag = format!("\\({alt}\\) ");
                            push_target(
                                in_title,
                                in_heading,
                                &mut title_buf,
                                &mut heading_buf,
                                &mut cur_text,
                                &frag,
                            );
                        }
                    }
                    skip += 1;
                } else if let Some(level) = heading_level(local) {
                    // A new heading ends the section collected so far.
                    flush_section(&mut sections, &mut cur_heading, &mut cur_text);
                    in_heading = level;
                    heading_buf.clear();
                } else if local == b"title" && title.is_none() {
                    in_title = true;
                    title_buf.clear();
                }
                buf.clear();
            }
            Ok(Event::Empty(e)) => {
                // A self-closing `math` has no body to skip, only alt text.
                let name = e.name();
                let local = local_name(name.as_ref());
                if skip == 0 && local == b"math" {
                    if let Some(alt) = extract_alttext(&e) {
                        let frag = format!("\\({alt}\\) ");
                        push_target(
                            in_title,
                            in_heading,
                            &mut title_buf,
                            &mut heading_buf,
                            &mut cur_text,
                            &frag,
                        );
                    }
                }
                buf.clear();
            }
            Ok(Event::Text(t)) => {
                match t.decode().ok().and_then(|raw| {
                    quick_xml::escape::unescape(&raw)
                        .ok()
                        .map(|c| c.into_owned())
                }) {
                    Some(s) => {
                        if !s.is_empty() && skip == 0 {
                            // Text is trimmed by the reader; the added space
                            // keeps adjacent fragments apart until `normalize`.
                            let mut frag = s;
                            frag.push(' ');
                            push_target(
                                in_title,
                                in_heading,
                                &mut title_buf,
                                &mut heading_buf,
                                &mut cur_text,
                                &frag,
                            );
                        }
                    }
                    None => {
                        tracing::debug!(
                            "ar5iv: skipped a text fragment that failed to decode/unescape"
                        )
                    }
                }
                buf.clear();
            }
            Ok(Event::GeneralRef(r)) => {
                if skip == 0 {
                    // Character references (`&#38;`, `&#x26;`) resolve to a
                    // char; named references go through the predefined table.
                    let resolved: Option<String> = match r.resolve_char_ref() {
                        Ok(Some(ch)) => Some(ch.to_string()),
                        Ok(None) => r.decode().ok().and_then(|name| {
                            quick_xml::escape::resolve_predefined_entity(&name)
                                .map(str::to_owned)
                        }),
                        Err(_) => None,
                    };
                    match resolved {
                        Some(mut frag) => {
                            frag.push(' ');
                            push_target(
                                in_title,
                                in_heading,
                                &mut title_buf,
                                &mut heading_buf,
                                &mut cur_text,
                                &frag,
                            );
                        }
                        None => {
                            tracing::debug!(
                                "ar5iv: skipped an entity reference that could not be resolved"
                            )
                        }
                    }
                }
                buf.clear();
            }
            Ok(Event::End(e)) => {
                let name = e.name();
                let local = local_name(name.as_ref());
                if is_skip_element(local) {
                    // Saturating: a stray end tag must not underflow the depth.
                    skip = skip.saturating_sub(1);
                } else if heading_level(local).is_some() && in_heading > 0 {
                    cur_heading = {
                        let h = normalize(&heading_buf);
                        if h.is_empty() {
                            None
                        } else {
                            Some(h)
                        }
                    };
                    in_heading = 0;
                    cur_text.clear();
                } else if local == b"title" && in_title {
                    in_title = false;
                    let t = normalize(&title_buf);
                    if !t.is_empty() {
                        title = Some(t);
                    }
                }
                buf.clear();
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                tracing::debug!(error = %e, "ar5iv HTML parse error; returning best-effort partial text");
                break;
            }
            _ => {
                buf.clear();
            }
        }
    }
    // Emit whatever was collected after the last heading.
    flush_section(&mut sections, &mut cur_heading, &mut cur_text);
    Ok((title, sections))
}
/// Appends `frag` to whichever buffer is currently being filled.
///
/// The title buffer takes precedence while `in_title` is set, then the
/// heading buffer while `in_heading` is non-zero; otherwise the fragment goes
/// to the body text of the current section.
fn push_target(
    in_title: bool,
    in_heading: u8,
    title_buf: &mut String,
    heading_buf: &mut String,
    cur_text: &mut String,
    frag: &str,
) {
    let sink: &mut String = match (in_title, in_heading) {
        (true, _) => title_buf,
        (false, 0) => cur_text,
        (false, _) => heading_buf,
    };
    sink.push_str(frag);
}
/// Moves the section collected so far into `sections` and resets the
/// collection state.
///
/// A section is emitted when it has non-blank text or a heading. The heading
/// is consumed (`cur_heading` becomes `None`) and `cur_text` is cleared, so a
/// later flush that happens before a new heading has been completed — for
/// example when the document ends inside a heading element — cannot emit the
/// same heading a second time.
fn flush_section(
    sections: &mut Vec<TextSection>,
    cur_heading: &mut Option<String>,
    cur_text: &mut String,
) {
    let text = normalize(cur_text);
    // Taking the heading avoids a clone and marks it as already emitted.
    let heading = cur_heading.take();
    if !text.is_empty() || heading.is_some() {
        sections.push(TextSection { heading, text });
    }
    cur_text.clear();
}
/// Strips any namespace prefix from a qualified element name, returning the
/// bytes after the last `:` (or the whole name when there is no `:`).
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one piece; the first is the suffix
    // following the last colon.
    qname.rsplit(|&byte| byte == b':').next().unwrap_or(qname)
}
/// On-disk representation of one cached paper text, stored as JSON.
#[derive(Debug, Serialize, Deserialize)]
struct TextCacheEntry {
/// Schema version string recorded when the entry was written.
schema_version: String,
/// Time the entry was written, as an RFC 3339 timestamp.
fetched_at: String,
/// Lifetime of the entry in seconds, counted from `fetched_at`.
ttl_seconds: i64,
/// The cached text itself.
paper_text: PaperText,
}
/// Returns the cache file path for `id`: `<cache_root>/text/<safekey>.json`,
/// where the safekey is derived from the arXiv reference.
fn cache_file(cache_root: &Utf8Path, id: &ArxivId) -> Utf8PathBuf {
    let key = Ref::Arxiv(id.clone()).safekey();
    let file_name = format!("{}.json", key.as_str());
    let mut path = cache_root.join("text");
    path.push(file_name);
    path
}
/// Reads the cached text for `id`, judging expiry against the current time.
///
/// Returns `None` on any miss; see [`cache_read_at`].
fn cache_read(cache_root: &Utf8Path, id: &ArxivId) -> Option<PaperText> {
    let now = Utc::now();
    cache_read_at(cache_root, id, now)
}
fn cache_read_at(cache_root: &Utf8Path, id: &ArxivId, now: DateTime<Utc>) -> Option<PaperText> {
let path = cache_file(cache_root, id);
let text = std::fs::read_to_string(&path).ok()?;
let entry: TextCacheEntry = serde_json::from_str(&text).ok()?;
let fetched: DateTime<Utc> = DateTime::parse_from_rfc3339(&entry.fetched_at)
.ok()?
.with_timezone(&Utc);
if now > fetched + Duration::seconds(entry.ttl_seconds) {
return None;
}
Some(entry.paper_text)
}
/// Writes `full` to the cache for `id`, stamped with the current time.
///
/// Returns `true` when the entry was written; see [`cache_write_at`].
fn cache_write(cache_root: &Utf8Path, id: &ArxivId, full: &PaperText) -> bool {
    let now = Utc::now();
    cache_write_at(cache_root, id, full, now)
}
/// Writes `full` to the cache for `id`, recording `now` as the fetch time.
///
/// The entry carries the current schema version and a lifetime of
/// `TEXT_CACHE_TTL_DAYS` days expressed in seconds. The parent directory is
/// created when missing.
///
/// Returns `true` on success. Serialization, directory-creation and write
/// failures are logged at debug level and reported as `false`.
fn cache_write_at(
    cache_root: &Utf8Path,
    id: &ArxivId,
    full: &PaperText,
    now: DateTime<Utc>,
) -> bool {
    let entry = TextCacheEntry {
        schema_version: TEXT_CACHE_SCHEMA_VERSION.to_string(),
        fetched_at: now.to_rfc3339(),
        ttl_seconds: TEXT_CACHE_TTL_DAYS * 86_400,
        paper_text: full.clone(),
    };

    let json = match serde_json::to_string(&entry) {
        Err(e) => {
            tracing::debug!(error = %e, "text cache: serialize failed; skipping write");
            return false;
        }
        Ok(serialized) => serialized,
    };

    let path = cache_file(cache_root, id);

    // Make sure the target directory exists before writing.
    let dir_ready = match path.parent() {
        None => true,
        Some(parent) => match std::fs::create_dir_all(parent) {
            Ok(()) => true,
            Err(e) => {
                tracing::debug!(error = %e, dir = %parent, "text cache: mkdir failed; skipping write");
                false
            }
        },
    };
    if !dir_ready {
        return false;
    }

    match std::fs::write(&path, json) {
        Ok(()) => true,
        Err(e) => {
            tracing::debug!(error = %e, path = %path, "text cache: write failed");
            false
        }
    }
}
// Unit and integration tests for ar5iv text extraction, truncation and caching.
// Panicking helpers (`expect`, `unwrap`, `panic`) are allowed in test code.
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
use super::*;
use std::sync::Arc;
use camino::Utf8PathBuf;
use tempfile::TempDir;
use wiremock::matchers::{method, path as path_matcher};
use wiremock::{Mock, MockServer, ResponseTemplate};
use crate::http::HttpClient;
use crate::provenance::{LogRow, ProvenanceLog};
use crate::rate_limiter::RateLimiter;
use crate::RateLimits;
// Sample page: a title, a skipped <style>, a lead paragraph, and two headed
// sections, the first of which contains inline math and a skipped <script>.
const SAMPLE_AR5IV: &str = r#"<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Tropical Tensor Networks</title>
<style>.ltx_page { color: black; }</style>
</head>
<body>
<div class="ltx_page_content">
<p>We study tropical tensor networks for spin glasses.</p>
<section class="ltx_section">
<h2 class="ltx_title">1 Introduction</h2>
<p>The free energy is <math alttext="F = -kT \log Z"><mrow><mi>F</mi></mrow></math> in the limit.</p>
<script>trackingPixel();</script>
</section>
<section class="ltx_section">
<h2 class="ltx_title">2 Methods</h2>
<p>We use a contraction scheme.</p>
</section>
</div>
</body>
</html>"#;
// Builds a FetchContext whose HTTP client may talk plain HTTP to the given
// wiremock host and whose provenance log lives in a fresh temp directory.
// The TempDir is returned so the caller keeps the directory alive.
fn build_test_context(
wiremock_host: &str,
cache: Option<Utf8PathBuf>,
) -> (TempDir, FetchContext) {
let td = TempDir::new().expect("tempdir");
let log_dir =
Utf8PathBuf::try_from(td.path().to_path_buf()).expect("temp dir path must be UTF-8");
let log_path = log_dir.join("test.jsonl");
let http = Arc::new(HttpClient::new_for_tests_allow_http("ar5iv", wiremock_host));
let rate_limiter = Arc::new(RateLimiter::new(RateLimits::HARD_CODED));
let session_id = "01J0000000000000000000TEST".to_string();
let log = Arc::new(
ProvenanceLog::open(log_path, session_id.clone()).expect("provenance log opens"),
);
let ctx = FetchContext {
http,
rate_limiter,
log,
session_id,
cache_root: cache,
};
(td, ctx)
}
// Reads the provenance log at `path`, parsing each non-empty line as a LogRow.
fn read_rows(path: &camino::Utf8Path) -> Vec<LogRow> {
let raw = std::fs::read_to_string(path).expect("read log");
raw.lines()
.filter(|l| !l.is_empty())
.map(|l| serde_json::from_str::<LogRow>(l).expect("valid LogRow"))
.collect()
}
// The sample page yields its title, a lead section and two headed sections;
// math is rendered from alttext and script content is skipped.
#[test]
fn parse_extracts_title_sections_and_inline_math() {
let (title, sections) = parse_ar5iv(SAMPLE_AR5IV.as_bytes()).expect("parses");
assert_eq!(title.as_deref(), Some("Tropical Tensor Networks"));
assert_eq!(
sections.len(),
3,
"lead + two headed sections: {sections:?}"
);
assert_eq!(sections[0].heading, None);
assert_eq!(
sections[0].text,
"We study tropical tensor networks for spin glasses."
);
assert_eq!(sections[1].heading.as_deref(), Some("1 Introduction"));
assert_eq!(
sections[1].text,
"The free energy is \\(F = -kT \\log Z\\) in the limit."
);
assert!(
!sections[1].text.contains("trackingPixel"),
"script content must be skipped: {}",
sections[1].text
);
assert_eq!(sections[2].heading.as_deref(), Some("2 Methods"));
assert_eq!(sections[2].text, "We use a contraction scheme.");
}
// Without headings, all paragraphs land in one heading-less section.
#[test]
fn parse_no_headings_yields_single_lead_section() {
let xml = r#"<html><body><p>One paragraph only.</p><p>Second one.</p></body></html>"#;
let (title, sections) = parse_ar5iv(xml.as_bytes()).expect("parses");
assert!(title.is_none());
assert_eq!(sections.len(), 1);
assert_eq!(sections[0].heading, None);
assert_eq!(sections[0].text, "One paragraph only. Second one.");
}
// An empty document produces neither a title nor sections.
#[test]
fn parse_empty_body_yields_nothing() {
let xml = r#"<html><head></head><body></body></html>"#;
let (title, sections) = parse_ar5iv(xml.as_bytes()).expect("parses");
assert!(title.is_none());
assert!(sections.is_empty());
}
// Mismatched end tags must not stop extraction of the text that follows.
#[test]
fn parse_mismatched_tags_recovers_full_document() {
let xml = r#"<html><body><p>Alpha beta <b>bold</p><h2>Sec</h2><p>Body text</body>"#;
let res = parse_ar5iv(xml.as_bytes());
assert!(res.is_ok(), "best-effort parse must not error: {res:?}");
let (_title, sections) = res.expect("ok");
let joined: String = sections
.iter()
.map(|s| s.text.as_str())
.collect::<Vec<_>>()
.join(" ");
assert!(
joined.contains("Body text") && joined.contains("bold"),
"recovered text past the mismatched tags: {joined:?}"
);
}
// A hard reader error still returns Ok with the text seen before the error.
#[test]
fn parse_hard_syntax_error_degrades_to_partial_not_error() {
let xml = r#"<html><body><p>Prefix kept here</p><p>Bad & entity halts</body>"#;
let res = parse_ar5iv(xml.as_bytes());
assert!(res.is_ok(), "hard syntax error must NOT error: {res:?}");
let (_title, sections) = res.expect("ok");
let joined: String = sections
.iter()
.map(|s| s.text.as_str())
.collect::<Vec<_>>()
.join(" ");
assert!(
joined.contains("Prefix kept here"),
"the prefix before the hard error is retained: {joined:?}"
);
}
// Fixture: two sections of five characters each (ten characters in total).
fn full_fixture() -> PaperText {
PaperText {
arxiv_id: "2401.12345".into(),
source: TextSource::Ar5iv,
title: Some("T".into()),
sections: vec![
TextSection {
heading: None,
text: "abcde".into(),
}, TextSection {
heading: Some("H".into()),
text: "fghij".into(),
}, ],
char_count: 10,
truncated: false,
retrieved_from: "https://ar5iv.labs.arxiv.org/html/2401.12345".into(),
}
}
// No limit: the text is returned unchanged.
#[test]
fn max_chars_none_returns_full() {
let out = apply_max_chars(full_fixture(), None);
assert!(!out.truncated);
assert_eq!(out.char_count, 10);
assert_eq!(out.sections.len(), 2);
}
// A limit above the total length leaves the text untruncated.
#[test]
fn max_chars_above_total_is_untruncated() {
let out = apply_max_chars(full_fixture(), Some(100));
assert!(!out.truncated);
assert_eq!(out.char_count, 10);
}
// A limit falling inside the second section cuts that section's text.
#[test]
fn max_chars_cuts_within_a_section() {
let out = apply_max_chars(full_fixture(), Some(7));
assert!(out.truncated);
assert_eq!(out.char_count, 7);
assert_eq!(out.sections.len(), 2);
assert_eq!(out.sections[1].text, "fg");
assert_eq!(out.sections[1].heading.as_deref(), Some("H"));
}
// A limit equal to the first section's length drops the second section.
#[test]
fn max_chars_drops_trailing_sections_on_exact_boundary() {
let out = apply_max_chars(full_fixture(), Some(5));
assert!(out.truncated);
assert_eq!(out.char_count, 5);
assert_eq!(out.sections.len(), 1);
}
// A zero limit keeps no sections and reports truncation.
#[test]
fn max_chars_zero_yields_no_text_but_flags_truncated() {
let out = apply_max_chars(full_fixture(), Some(0));
assert!(out.truncated);
assert_eq!(out.char_count, 0);
assert!(out.sections.is_empty());
}
// The limit counts characters, not bytes, so multibyte text is cut safely.
#[test]
fn max_chars_truncation_is_char_boundary_safe_for_multibyte() {
let full = PaperText {
arxiv_id: "2401.12345".into(),
source: TextSource::Ar5iv,
title: None,
sections: vec![TextSection {
heading: None,
text: "あいうえお漢字".into(),
}],
char_count: 7,
truncated: false,
retrieved_from: "u".into(),
};
let out = apply_max_chars(full, Some(3));
assert!(out.truncated);
assert_eq!(out.char_count, 3);
assert_eq!(out.sections[0].text, "あいう");
}
// Both new-style and old-style (with a slash) identifiers map to /html/<id>.
#[test]
fn ar5iv_url_new_and_old_style() {
let base = Url::parse(AR5IV_DEFAULT_BASE).expect("base");
let new = ar5iv_url(&base, &ArxivId::parse("2401.12345").unwrap()).expect("url");
assert_eq!(new.path(), "/html/2401.12345");
assert_eq!(new.host_str(), Some("ar5iv.labs.arxiv.org"));
let old = ar5iv_url(&base, &ArxivId::parse("cond-mat/9501001").unwrap()).expect("url");
assert_eq!(old.path(), "/html/cond-mat/9501001");
}
// An entry written at `now` is readable at `now`.
#[test]
fn cache_write_then_read_round_trips() {
let dir = TempDir::new().unwrap();
let root = Utf8Path::from_path(dir.path()).unwrap();
let id = ArxivId::parse("2401.12345").unwrap();
let now = Utc::now();
assert!(cache_write_at(root, &id, &full_fixture(), now));
let got = cache_read_at(root, &id, now).expect("cache hit");
assert_eq!(got.arxiv_id, "2401.12345");
assert_eq!(got.sections.len(), 2);
assert!(!got.truncated, "cache stores the full, untruncated text");
}
// Reading one day past the TTL is a miss.
#[test]
fn cache_miss_when_expired() {
let dir = TempDir::new().unwrap();
let root = Utf8Path::from_path(dir.path()).unwrap();
let id = ArxivId::parse("2401.12345").unwrap();
let written = Utc::now();
assert!(cache_write_at(root, &id, &full_fixture(), written));
let later = written + Duration::days(TEXT_CACHE_TTL_DAYS + 1);
assert!(cache_read_at(root, &id, later).is_none());
}
// Cache files live under a `text` directory and end in `.json`.
#[test]
fn cache_file_path_uses_text_dir_and_safekey() {
let root = Utf8Path::new("/tmp/cache");
let id = ArxivId::parse("2401.12345").unwrap();
let p = cache_file(root, &id);
assert!(p.components().any(|c| c.as_str() == "text"));
assert!(p.as_str().ends_with(".json"));
}
// End to end without a cache: fetch, parse, and exactly one provenance row.
#[tokio::test]
async fn paper_text_fetches_parses_and_logs() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path_matcher("/html/2401.12345"))
.respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_AR5IV))
.mount(&server)
.await;
let host = server
.uri()
.parse::<Url>()
.unwrap()
.host_str()
.unwrap()
.to_string();
let (_td, ctx) = build_test_context(&host, None);
let log_path = ctx.log.path().to_path_buf();
let base = Url::parse(&server.uri()).expect("wiremock URI parses");
let id = ArxivId::parse("2401.12345").unwrap();
let out = paper_text(&base, &id, None, &ctx).await.expect("ok");
assert_eq!(out.arxiv_id, "2401.12345");
assert_eq!(out.source, TextSource::Ar5iv);
assert_eq!(out.title.as_deref(), Some("Tropical Tensor Networks"));
assert_eq!(out.sections.len(), 3);
assert!(!out.truncated);
let rows = read_rows(&log_path);
assert_eq!(rows.len(), 1, "one fetch row expected");
assert_eq!(rows[0].source.as_deref(), Some("ar5iv"));
assert_eq!(rows[0].ref_.as_deref(), Some("2401.12345"));
assert!(rows[0].error_code.is_none());
}
// A max_chars limit is applied to fetched text.
#[tokio::test]
async fn paper_text_truncates_when_max_chars_set() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path_matcher("/html/2401.12345"))
.respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_AR5IV))
.mount(&server)
.await;
let host = server
.uri()
.parse::<Url>()
.unwrap()
.host_str()
.unwrap()
.to_string();
let (_td, ctx) = build_test_context(&host, None);
let base = Url::parse(&server.uri()).expect("uri");
let id = ArxivId::parse("2401.12345").unwrap();
let out = paper_text(&base, &id, Some(10), &ctx).await.expect("ok");
assert!(out.truncated);
assert_eq!(out.char_count, 10);
}
// The mock answers only once, so the second call must come from the cache.
#[tokio::test]
async fn paper_text_second_call_is_served_from_cache() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path_matcher("/html/2401.12345"))
.respond_with(ResponseTemplate::new(200).set_body_string(SAMPLE_AR5IV))
.up_to_n_times(1)
.mount(&server)
.await;
let host = server
.uri()
.parse::<Url>()
.unwrap()
.host_str()
.unwrap()
.to_string();
let cache_dir = TempDir::new().unwrap();
let cache_root =
Utf8PathBuf::try_from(cache_dir.path().to_path_buf()).expect("utf8 cache root");
let (_td, ctx) = build_test_context(&host, Some(cache_root));
let base = Url::parse(&server.uri()).expect("uri");
let id = ArxivId::parse("2401.12345").unwrap();
let first = paper_text(&base, &id, None, &ctx).await.expect("first ok");
assert_eq!(first.sections.len(), 3);
let second = paper_text(&base, &id, None, &ctx)
.await
.expect("second call served from cache");
assert_eq!(second.sections.len(), 3);
assert_eq!(second.title, first.title);
}
// A 200 response with no extractable text maps to FetchError::NotFound.
#[tokio::test]
async fn paper_text_empty_body_is_not_found() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path_matcher("/html/2401.99999"))
.respond_with(
ResponseTemplate::new(200)
.set_body_string("<html><head></head><body></body></html>"),
)
.mount(&server)
.await;
let host = server
.uri()
.parse::<Url>()
.unwrap()
.host_str()
.unwrap()
.to_string();
let (_td, ctx) = build_test_context(&host, None);
let base = Url::parse(&server.uri()).expect("uri");
let id = ArxivId::parse("2401.99999").unwrap();
let err = paper_text(&base, &id, None, &ctx)
.await
.expect_err("empty body must be NotFound");
assert!(matches!(err, FetchError::NotFound { .. }), "got {err:?}");
}
// An HTTP 404 surfaces as an error whose code is NotFound.
#[tokio::test]
async fn paper_text_404_surfaces_http_error() {
let server = MockServer::start().await;
Mock::given(method("GET"))
.and(path_matcher("/html/2401.00000"))
.respond_with(ResponseTemplate::new(404))
.mount(&server)
.await;
let host = server
.uri()
.parse::<Url>()
.unwrap()
.host_str()
.unwrap()
.to_string();
let (_td, ctx) = build_test_context(&host, None);
let base = Url::parse(&server.uri()).expect("uri");
let id = ArxivId::parse("2401.00000").unwrap();
let err = paper_text(&base, &id, None, &ctx)
.await
.expect_err("404 must surface");
assert_eq!(crate::ErrorCode::from(&err), crate::ErrorCode::NotFound);
}
// The serde representation of TextSource is the lowercase variant name.
#[test]
fn text_source_serializes_lowercase() {
let s = serde_json::to_string(&TextSource::Ar5iv).expect("serialize");
assert_eq!(s, "\"ar5iv\"");
}
}