use camino::{Utf8Path, Utf8PathBuf};
use chrono::{DateTime, Duration, Utc};
use flate2::read::GzDecoder;
use serde::{Deserialize, Serialize};
use std::io::Read;
use tar::Archive;
use url::Url;
use crate::provenance::{Capability, LogEvent, LogResult, RowInput};
use crate::source::{FetchContext, FetchError};
use crate::{ArxivId, Ref};
/// Key passed to the rate limiter and the HTTP client for every request this module makes.
const HTTP_SOURCE_KEY: &str = "arxiv";
/// Provenance `source` label (and promotion label) recorded for single-paper TeX fetches.
const PROV_SOURCE_LABEL: &str = "arxiv-src";
/// Provenance `source` label (and promotion label) recorded for whole-bundle fetches.
const PROV_SOURCE_BUNDLE_LABEL: &str = "arxiv-src-bundle";
/// Base URL used by [`resolve_arxiv_src_base`] when `DOIGET_ARXIV_SRC_BASE` is not set.
pub const ARXIV_SRC_DEFAULT_BASE: &str = "https://export.arxiv.org";
/// Lifetime, in days, stamped on every cache entry written by this module.
const TEX_SRC_CACHE_TTL_DAYS: i64 = 7;
/// Schema tag written into cache entries; entries carrying any other value are ignored on read.
const TEX_SRC_CACHE_SCHEMA_VERSION: &str = "1.0";
/// JSON envelope stored on disk for one cached [`PaperTexSource`].
#[derive(Debug, Serialize, Deserialize)]
struct CacheEntry {
/// Must equal `TEX_SRC_CACHE_SCHEMA_VERSION` for the entry to be accepted on read.
schema_version: String,
/// RFC 3339 timestamp of when the entry was written.
fetched_at: String,
/// How long after `fetched_at` the entry stays valid, in seconds.
ttl_seconds: i64,
/// The cached, untruncated result.
inner: PaperTexSource,
}
/// LaTeX text pulled out of an arXiv source payload.
#[derive(Debug)]
pub(crate) struct ExtractedTex {
/// Archive-relative path of the chosen `.tex` file; `None` when the payload was a single file.
pub main_file: Option<String>,
/// Text of the chosen file.
pub content: String,
}
/// LaTeX source of one arXiv paper as returned to callers (and stored in the cache).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct PaperTexSource {
/// The arXiv identifier the source belongs to, as a string.
pub arxiv_id: String,
/// Archive-relative path of the selected `.tex` file, when the source came from a tarball.
pub main_file: Option<String>,
/// The LaTeX text, possibly truncated (see `truncated`).
pub tex_source: String,
/// Number of `char`s in `tex_source`; set to the cap when the text was truncated.
pub char_count: usize,
/// `true` when `tex_source` was cut down to a caller-supplied character limit.
pub truncated: bool,
/// Final URL reported by the HTTP client for the download.
pub retrieved_from: String,
}
/// Returns the LaTeX source for `id`, preferring the on-disk cache over the network.
///
/// When `ctx.cache_root` is set and holds a usable entry, that entry is returned
/// without any request being made. Otherwise the source is downloaded from `base`
/// and written back to the cache; a failed cache write is only logged. In both
/// cases the result is cut to `max_chars` characters when a limit is given, while
/// the cache always stores the untruncated text.
///
/// # Errors
///
/// Propagates any [`FetchError`] raised while downloading or extracting the source.
pub async fn paper_tex_source(
    base: &Url,
    id: &ArxivId,
    max_chars: Option<usize>,
    ctx: &FetchContext,
) -> Result<PaperTexSource, FetchError> {
    let cached = ctx
        .cache_root
        .as_ref()
        .and_then(|root| cache_read(root, id));
    if let Some(hit) = cached {
        return Ok(apply_max_chars(hit, max_chars));
    }

    let fetched = fetch_and_extract(base, id, ctx).await?;

    match ctx.cache_root.as_ref() {
        Some(root) if !cache_write(root, id, &fetched) => {
            tracing::warn!(
                cache_root = %root,
                arxiv_id = %id.as_str(),
                "tex-source cache write failed; next request will re-fetch"
            );
        }
        _ => {}
    }

    Ok(apply_max_chars(fetched, max_chars))
}
async fn fetch_and_extract(
base: &Url,
id: &ArxivId,
ctx: &FetchContext,
) -> Result<PaperTexSource, FetchError> {
let _permit = ctx.rate_limiter.acquire(HTTP_SOURCE_KEY).await;
let url = src_url(base, id)?;
let (body, final_url) = ctx.http.fetch_bytes(HTTP_SOURCE_KEY, url).await?;
let extracted = extract_tex(id, &body)?;
let char_count = extracted.content.chars().count();
let canonical = Ref::Arxiv(id.clone())
.promote(PROV_SOURCE_LABEL, None)
.digest_hex();
ctx.log.append(RowInput {
event: LogEvent::Fetch,
result: LogResult::Ok,
capability: Capability::Oa,
ref_: Some(id.as_str()),
source: Some(PROV_SOURCE_LABEL),
error_code: None,
size_bytes: Some(body.len() as u64),
license: Some("arxiv-default"),
store_path: None,
canonical_digest: Some(&canonical),
})?;
Ok(PaperTexSource {
arxiv_id: id.as_str().to_string(),
main_file: extracted.main_file,
tex_source: extracted.content,
char_count,
truncated: false,
retrieved_from: final_url.to_string(),
})
}
/// Builds the `/src/<id>` download URL for `id` relative to `base`.
///
/// # Errors
///
/// Returns [`FetchError::SourceSchema`] when the joined URL is not valid.
fn src_url(base: &Url, id: &ArxivId) -> Result<Url, FetchError> {
    let relative = format!("/src/{}", id.as_str());
    match base.join(&relative) {
        Ok(url) => Ok(url),
        Err(e) => Err(FetchError::SourceSchema {
            hint: format!("arXiv src URL construction failed: {e}"),
        }),
    }
}
/// Shape of a downloaded arXiv source payload, as determined by [`classify_src`].
#[derive(Debug)]
enum SrcPayload {
/// The payload starts with the `%PDF-` magic: no source is available.
PdfOnly,
/// A single file: either the raw payload (not gzip) or the gunzipped bytes of a non-tar stream.
SingleFile(Vec<u8>),
/// Gunzipped bytes that carry the `ustar` tar magic.
Tar(Vec<u8>),
}
/// Sniffs a downloaded arXiv source payload and, when it is gzip-compressed,
/// decompresses it.
///
/// Classification is by magic bytes:
/// * `%PDF-` prefix → [`SrcPayload::PdfOnly`];
/// * no gzip magic (`1f 8b`) → [`SrcPayload::SingleFile`] holding the payload as-is;
/// * gzip whose decompressed bytes carry `ustar` at offset 257 → [`SrcPayload::Tar`];
/// * any other gzip → [`SrcPayload::SingleFile`] holding the decompressed bytes.
///
/// `max_decompressed` caps the decompressed size in bytes; `None` means no cap.
///
/// # Errors
///
/// Returns [`FetchError::SourceSchema`] when gzip decoding fails or when the
/// decompressed size exceeds the cap.
fn classify_src(bytes: &[u8], max_decompressed: Option<u64>) -> Result<SrcPayload, FetchError> {
    if bytes.starts_with(b"%PDF-") {
        return Ok(SrcPayload::PdfOnly);
    }
    if !bytes.starts_with(&[0x1f, 0x8b]) {
        return Ok(SrcPayload::SingleFile(bytes.to_vec()));
    }

    // Read at most one byte past the cap so an over-sized stream can be detected
    // without inflating all of it. `saturating_add` keeps a cap of `u64::MAX`
    // from overflowing (which would otherwise wrap to a zero-byte read limit).
    let read_limit = max_decompressed.map_or(u64::MAX, |cap| cap.saturating_add(1));
    let mut decompressed = Vec::new();
    GzDecoder::new(std::io::Cursor::new(bytes))
        .take(read_limit)
        .read_to_end(&mut decompressed)
        .map_err(|e| FetchError::SourceSchema {
            hint: format!("gzip decompress of arXiv src failed: {e}"),
        })?;

    if let Some(cap) = max_decompressed {
        if decompressed.len() as u64 > cap {
            return Err(FetchError::SourceSchema {
                hint: format!(
                    "arXiv src decompressed size exceeds {cap} bytes \
                     (possible gzip bomb); refusing"
                ),
            });
        }
    }

    // The tar magic lives at byte offset 257 of the first header block.
    let is_tar = decompressed.len() > 262 && &decompressed[257..262] == b"ustar";
    if is_tar {
        Ok(SrcPayload::Tar(decompressed))
    } else {
        Ok(SrcPayload::SingleFile(decompressed))
    }
}
/// Extracts LaTeX text from a raw arXiv source payload.
///
/// Tarballs are delegated to `extract_from_tar`. A single file is decoded as
/// UTF-8 with invalid sequences replaced, and returned with `main_file: None`.
///
/// # Errors
///
/// Returns [`FetchError::TextUnavailable`] for PDF-only payloads and for single
/// files that are empty or whitespace-only; propagates classification and tar
/// extraction errors.
pub(crate) fn extract_tex(id: &ArxivId, bytes: &[u8]) -> Result<ExtractedTex, FetchError> {
    let unavailable = || FetchError::TextUnavailable {
        arxiv_id: id.clone(),
    };

    let data = match classify_src(bytes, Some(SRC_MAX_DECOMPRESSED_BYTES))? {
        SrcPayload::Tar(archive) => return extract_from_tar(id, &archive),
        SrcPayload::PdfOnly => return Err(unavailable()),
        SrcPayload::SingleFile(data) => data,
    };

    let content = String::from_utf8_lossy(&data).into_owned();
    if content.trim().is_empty() {
        return Err(unavailable());
    }
    Ok(ExtractedTex {
        main_file: None,
        content,
    })
}
fn extract_from_tar(id: &ArxivId, bytes: &[u8]) -> Result<ExtractedTex, FetchError> {
let mut archive = Archive::new(std::io::Cursor::new(bytes));
let entries = archive.entries().map_err(|e| FetchError::SourceSchema {
hint: format!("tar read failed: {e}"),
})?;
let mut tex_files: Vec<(String, String)> = Vec::new();
let mut tex_attempted: usize = 0;
let mut unreadable: usize = 0;
for entry in entries {
let Ok(mut entry) = entry else {
unreadable += 1;
continue;
};
let raw = match entry.path() {
Ok(p) => p.to_string_lossy().to_string(),
Err(_) => {
unreadable += 1;
continue;
}
};
let Some(path) = sanitize_entry_path(&raw).map(|p| p.to_string()) else {
tracing::warn!(arxiv_id = %id.as_str(), entry = %raw, "skipping unsafe arXiv src entry path");
continue;
};
if !path.ends_with(".tex") {
continue;
}
tex_attempted += 1;
let mut content = String::new();
match entry.read_to_string(&mut content) {
Ok(_) if !content.trim().is_empty() => tex_files.push((path, content)),
Ok(_) => {} Err(_) => unreadable += 1,
}
}
if unreadable > 0 {
tracing::warn!(
arxiv_id = %id.as_str(),
unreadable,
"some arXiv src tar entries were unreadable/unsafe and were skipped"
);
}
if tex_files.is_empty() {
return Err(if tex_attempted > 0 {
FetchError::SourceSchema {
hint: format!("tar contained {tex_attempted} .tex entries but all failed to read"),
}
} else {
FetchError::TextUnavailable {
arxiv_id: id.clone(),
}
});
}
let best = tex_files.into_iter().max_by_key(|(name, content)| {
let docclass = i64::from(content.contains(r"\documentclass")) * 1_000_000;
let is_main = i64::from(name.ends_with("main.tex") || name == "main.tex") * 100_000;
let size = i64::try_from(content.len()).unwrap_or(i64::MAX);
docclass.saturating_add(is_main).saturating_add(size)
});
match best {
Some((name, content)) => Ok(ExtractedTex {
main_file: Some(name),
content,
}),
None => Err(FetchError::TextUnavailable {
arxiv_id: id.clone(),
}),
}
}
/// Cuts `full.tex_source` down to at most `max_chars` characters.
///
/// The value is returned unchanged when no limit is given or when the recorded
/// `char_count` already fits. Otherwise the text is shortened on a `char`
/// boundary, `char_count` is set to the limit and `truncated` is set to `true`.
fn apply_max_chars(mut full: PaperTexSource, max_chars: Option<usize>) -> PaperTexSource {
    let limit = match max_chars {
        Some(limit) if full.char_count > limit => limit,
        _ => return full,
    };
    // Byte offset where the character at index `limit` starts, if there is one;
    // everything from there on is dropped.
    if let Some((cut, _)) = full.tex_source.char_indices().nth(limit) {
        full.tex_source.truncate(cut);
    }
    full.char_count = limit;
    full.truncated = true;
    full
}
/// Path of the cache file for `id`: `<cache_root>/tex-src/<safekey>.json`.
fn cache_file(cache_root: &Utf8Path, id: &ArxivId) -> Utf8PathBuf {
    let key = Ref::Arxiv(id.clone()).safekey();
    let file_name = format!("{}.json", key.as_str());
    let mut path = cache_root.join("tex-src");
    path.push(file_name);
    path
}
/// Looks up `id` in the cache, judging freshness against the current time.
fn cache_read(cache_root: &Utf8Path, id: &ArxivId) -> Option<PaperTexSource> {
    let now = Utc::now();
    cache_read_at(cache_root, id, now)
}
/// Looks up `id` in the cache, judging freshness against `now`.
///
/// Returns `None` (a cache miss) when the file is missing or unreadable, is not
/// valid JSON for a cache entry, carries a different schema version, has an
/// unparsable `fetched_at`, has a TTL that is negative or too large to
/// represent, or is older than its TTL. A corrupt entry never panics.
fn cache_read_at(
    cache_root: &Utf8Path,
    id: &ArxivId,
    now: DateTime<Utc>,
) -> Option<PaperTexSource> {
    // `Duration::seconds` panics for magnitudes above `i64::MAX / 1_000`
    // (the duration is bounded by `i64::MAX` milliseconds).
    const MAX_TTL_SECONDS: i64 = i64::MAX / 1_000;

    let path = cache_file(cache_root, id);
    let bytes = std::fs::read(&path).ok()?;
    let entry: CacheEntry = serde_json::from_slice(&bytes).ok()?;
    if entry.schema_version != TEX_SRC_CACHE_SCHEMA_VERSION {
        return None;
    }
    // The TTL comes straight from the file on disk: treat a nonsensical value
    // like any other corruption instead of panicking while building a Duration.
    if !(0..=MAX_TTL_SECONDS).contains(&entry.ttl_seconds) {
        return None;
    }
    let fetched = DateTime::parse_from_rfc3339(&entry.fetched_at)
        .ok()?
        .with_timezone(&Utc);
    if now.signed_duration_since(fetched) > Duration::seconds(entry.ttl_seconds) {
        return None;
    }
    Some(entry.inner)
}
/// Stores `full` in the cache stamped with the current time; `true` on success.
fn cache_write(cache_root: &Utf8Path, id: &ArxivId, full: &PaperTexSource) -> bool {
    let now = Utc::now();
    cache_write_at(cache_root, id, full, now)
}
/// Stores `full` in the cache with `now` as its fetch timestamp.
///
/// The parent directory is created when needed. Returns `false` when the
/// directory cannot be created, the entry cannot be serialized, or the file
/// cannot be written; the cause is not reported.
fn cache_write_at(
    cache_root: &Utf8Path,
    id: &ArxivId,
    full: &PaperTexSource,
    now: DateTime<Utc>,
) -> bool {
    let path = cache_file(cache_root, id);

    let dir_ready = path
        .parent()
        .map_or(true, |dir| std::fs::create_dir_all(dir).is_ok());
    if !dir_ready {
        return false;
    }

    let entry = CacheEntry {
        schema_version: TEX_SRC_CACHE_SCHEMA_VERSION.to_string(),
        fetched_at: now.to_rfc3339(),
        ttl_seconds: TEX_SRC_CACHE_TTL_DAYS * 86_400,
        inner: full.clone(),
    };

    serde_json::to_vec(&entry).map_or(false, |json| std::fs::write(&path, json).is_ok())
}
/// Resolves the base URL for arXiv source downloads.
///
/// Uses the `DOIGET_ARXIV_SRC_BASE` environment variable when it can be read,
/// and [`ARXIV_SRC_DEFAULT_BASE`] otherwise.
///
/// # Errors
///
/// Returns a human-readable message when the chosen value is not a valid URL.
pub fn resolve_arxiv_src_base() -> Result<Url, String> {
    let raw = match std::env::var("DOIGET_ARXIV_SRC_BASE") {
        Ok(value) => value,
        Err(_) => ARXIV_SRC_DEFAULT_BASE.to_string(),
    };
    match Url::parse(&raw) {
        Ok(url) => Ok(url),
        Err(e) => Err(format!("DOIGET_ARXIV_SRC_BASE is not a valid URL: {e}")),
    }
}
/// Lower-case file extensions that `is_figure` treats as figures.
const FIGURE_EXTS: &[&str] = &["pdf", "eps", "ps", "png", "jpg", "jpeg", "gif", "svg"];
/// Upper bound, in bytes, on the decompressed size of a source payload (gzip-bomb guard).
const SRC_MAX_DECOMPRESSED_BYTES: u64 = 500_000_000;
/// Selects which files of a source bundle are returned.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BundleFilter {
/// Keep every regular file with a safe path.
All,
/// Keep only files whose extension is in the figure allow-list.
FiguresOnly,
}
/// One file extracted from an arXiv source bundle.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct SourceFile {
/// Sanitized, archive-relative path; read it through [`SourceFile::path`].
pub(crate) path: Utf8PathBuf,
/// Raw file contents.
pub bytes: Vec<u8>,
}
impl SourceFile {
    /// Sanitized, archive-relative path of this file.
    #[must_use]
    pub fn path(&self) -> &Utf8Path {
        self.path.as_path()
    }
}
/// Normalizes a tar entry path into a safe relative path, or rejects it.
///
/// Both `/` and `\` are treated as separators. Empty and `.` segments are
/// dropped. The path is rejected (`None`) when it is empty, contains a NUL
/// byte, starts with a separator, starts with a drive prefix such as `C:`,
/// contains a `..` segment, contains a `:` inside any segment, or has no
/// segments left after normalization.
fn sanitize_entry_path(raw: &str) -> Option<Utf8PathBuf> {
    if raw.is_empty() || raw.contains('\0') {
        return None;
    }
    let rooted = matches!(raw.as_bytes(), [b'/' | b'\\', ..]);
    let drive_prefixed =
        matches!(raw.as_bytes(), [letter, b':', ..] if letter.is_ascii_alphabetic());
    if rooted || drive_prefixed {
        return None;
    }

    let mut cleaned = Utf8PathBuf::new();
    for segment in raw.split(|c: char| c == '/' || c == '\\') {
        if segment.is_empty() || segment == "." {
            continue;
        }
        if segment == ".." || segment.contains(':') || segment.contains('\0') {
            return None;
        }
        cleaned.push(segment);
    }

    // Only non-empty segments are ever pushed, so an empty result means that
    // nothing survived normalization.
    if cleaned.as_str().is_empty() {
        None
    } else {
        Some(cleaned)
    }
}
/// Whether `path` has an extension from the figure allow-list, ignoring ASCII case.
fn is_figure(path: &Utf8Path) -> bool {
    path.extension().map_or(false, |ext| {
        FIGURE_EXTS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known))
    })
}
/// Extracts the files of an arXiv source bundle from a raw download.
///
/// Only gzip-compressed tar payloads count as bundles. Within the archive,
/// non-regular entries are ignored, entries with an unsafe path are rejected
/// with a warning, and `filter` decides which of the remaining files are kept.
/// Malformed or unreadable entries are skipped and counted; when at least one
/// file was still extracted the partial result is returned and a warning is
/// logged.
///
/// # Errors
///
/// * [`FetchError::SourceUnavailable`] when the payload is a PDF or a single
///   file, or when the archive holds no file matching `filter`.
/// * [`FetchError::SourceSchema`] when decompression or opening the archive
///   fails, or when nothing was extracted and some entries were unreadable.
pub(crate) fn extract_bundle(
    id: &ArxivId,
    bytes: &[u8],
    filter: BundleFilter,
) -> Result<Vec<SourceFile>, FetchError> {
    let decompressed = match classify_src(bytes, Some(SRC_MAX_DECOMPRESSED_BYTES))? {
        SrcPayload::Tar(d) => d,
        SrcPayload::PdfOnly | SrcPayload::SingleFile(_) => return Err(no_files(id, filter)),
    };
    let mut archive = Archive::new(std::io::Cursor::new(decompressed));
    let entries = archive.entries().map_err(|e| FetchError::SourceSchema {
        hint: format!("tar read failed: {e}"),
    })?;

    let mut files: Vec<SourceFile> = Vec::new();
    let mut unreadable: usize = 0;

    for entry in entries {
        let mut entry = match entry {
            Ok(e) => e,
            Err(e) => {
                unreadable += 1;
                tracing::warn!(arxiv_id = %id.as_str(), error = %e, "arXiv src: skipping malformed tar entry");
                continue;
            }
        };
        // Directories, links and other special entries carry no file payload.
        if !entry.header().entry_type().is_file() {
            continue;
        }
        let raw_path = match entry.path() {
            Ok(p) => p.to_string_lossy().into_owned(),
            Err(e) => {
                unreadable += 1;
                tracing::warn!(arxiv_id = %id.as_str(), error = %e, "arXiv src: tar entry has a non-decodable path; skipping");
                continue;
            }
        };
        let Some(safe) = sanitize_entry_path(&raw_path) else {
            // Carry the paper id like every other warning here, so a rejected
            // entry can be traced back to the bundle it came from.
            tracing::warn!(
                arxiv_id = %id.as_str(),
                entry = %raw_path,
                "arXiv src: rejected unsafe tar entry path (zip-slip guard)"
            );
            continue;
        };
        if filter == BundleFilter::FiguresOnly && !is_figure(&safe) {
            continue;
        }
        let mut buf = Vec::new();
        match entry.read_to_end(&mut buf) {
            Ok(_) => files.push(SourceFile {
                path: safe,
                bytes: buf,
            }),
            Err(e) => {
                unreadable += 1;
                tracing::warn!(arxiv_id = %id.as_str(), entry = %safe, error = %e, "arXiv src: failed to read tar entry; skipping");
            }
        }
    }

    if files.is_empty() {
        // Distinguish "nothing matched" from "the archive was damaged".
        return Err(if unreadable > 0 {
            FetchError::SourceSchema {
                hint: format!(
                    "arXiv src tar had {unreadable} unreadable entr(y/ies) and no usable files"
                ),
            }
        } else {
            no_files(id, filter)
        });
    }
    if unreadable > 0 {
        tracing::warn!(
            arxiv_id = %id.as_str(),
            unreadable,
            extracted = files.len(),
            "arXiv src: bundle is partial — some entries were unreadable and skipped"
        );
    }
    Ok(files)
}
/// Builds the [`FetchError::SourceUnavailable`] error matching `filter`.
fn no_files(id: &ArxivId, filter: BundleFilter) -> FetchError {
    let kind = match filter {
        BundleFilter::FiguresOnly => "figures",
        BundleFilter::All => "source bundle",
    };
    FetchError::SourceUnavailable {
        arxiv_id: id.clone(),
        kind,
    }
}
/// Downloads the source bundle for `id` and returns the files selected by `filter`.
///
/// The download is rate-limited under the same key as the TeX fetch, is not
/// cached, and is recorded in the provenance log once extraction succeeds.
///
/// # Errors
///
/// Fails when the URL cannot be built, the download fails, the payload holds no
/// matching files, or the provenance log rejects the row.
pub async fn paper_source_bundle(
    base: &Url,
    id: &ArxivId,
    filter: BundleFilter,
    ctx: &FetchContext,
) -> Result<Vec<SourceFile>, FetchError> {
    let _permit = ctx.rate_limiter.acquire(HTTP_SOURCE_KEY).await;

    let request_url = src_url(base, id)?;
    let (body, _final_url) = ctx.http.fetch_bytes(HTTP_SOURCE_KEY, request_url).await?;
    let extracted = extract_bundle(id, &body, filter)?;

    let canonical = Ref::Arxiv(id.clone())
        .promote(PROV_SOURCE_BUNDLE_LABEL, None)
        .digest_hex();
    let row = RowInput {
        event: LogEvent::Fetch,
        result: LogResult::Ok,
        capability: Capability::Oa,
        ref_: Some(id.as_str()),
        source: Some(PROV_SOURCE_BUNDLE_LABEL),
        error_code: None,
        // Size of the downloaded payload, not of the extracted files.
        size_bytes: Some(body.len() as u64),
        license: Some("arxiv-default"),
        store_path: None,
        canonical_digest: Some(&canonical),
    };
    ctx.log.append(row)?;

    Ok(extracted)
}
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic, missing_docs)]
mod tests {
    use super::*;
    use flate2::write::GzEncoder;
    use flate2::Compression;
    use std::io::Write as _;

    // Parses `s` as a reference and unwraps its arXiv variant.
    fn make_id(s: &str) -> ArxivId {
        match Ref::parse(s).expect("parse") {
            Ref::Arxiv(a) => a,
            _ => panic!("expected arxiv id"),
        }
    }

    // Gzip-compresses `data` in memory.
    fn gzip_bytes(data: &[u8]) -> Vec<u8> {
        let mut enc = GzEncoder::new(Vec::new(), Compression::default());
        enc.write_all(data).expect("gzip write");
        enc.finish().expect("gzip finish")
    }

    // Builds a gzip-compressed tar archive holding the given (name, contents) pairs.
    fn tar_gzip(files: &[(&str, &[u8])]) -> Vec<u8> {
        let mut builder = tar::Builder::new(Vec::new());
        for (name, data) in files {
            let mut header = tar::Header::new_gnu();
            header.set_size(data.len() as u64);
            header.set_mode(0o644);
            header.set_cksum();
            builder
                .append_data(&mut header, name, std::io::Cursor::new(data))
                .expect("tar append");
        }
        gzip_bytes(&builder.into_inner().expect("tar finish"))
    }

    // A small, untruncated `PaperTexSource` fixture for `id`.
    fn make_src(id: &ArxivId) -> PaperTexSource {
        PaperTexSource {
            arxiv_id: id.as_str().to_string(),
            main_file: Some("main.tex".into()),
            tex_source: "\\documentclass{article}".into(),
            char_count: 23,
            truncated: false,
            retrieved_from: "https://export.arxiv.org/src/2401.12345".into(),
        }
    }

    #[test]
    fn apply_max_chars_no_cap_is_identity() {
        let id = make_id("2401.12345");
        let src = make_src(&id);
        let out = apply_max_chars(src.clone(), None);
        assert_eq!(out, src);
    }

    #[test]
    fn apply_max_chars_truncates() {
        let id = make_id("2401.12345");
        let src = PaperTexSource {
            arxiv_id: id.as_str().to_string(),
            main_file: None,
            tex_source: "abcdefghij".into(),
            char_count: 10,
            truncated: false,
            retrieved_from: "https://export.arxiv.org/src/2401.12345".into(),
        };
        let out = apply_max_chars(src, Some(4));
        assert_eq!(out.tex_source, "abcd");
        assert_eq!(out.char_count, 4);
        assert!(out.truncated);
    }

    #[test]
    fn pdf_only_yields_text_unavailable() {
        let id = make_id("2401.12345");
        let result = extract_tex(&id, b"%PDF-1.4 fake");
        assert!(matches!(result, Err(FetchError::TextUnavailable { .. })));
    }

    #[test]
    fn raw_tex_passthrough() {
        let id = make_id("2401.12345");
        let tex = b"\\documentclass{article}\n\\begin{document}\nHello.\\end{document}";
        let ext = extract_tex(&id, tex).expect("extract");
        assert!(ext.main_file.is_none());
        assert!(ext.content.contains("\\documentclass"));
    }

    #[test]
    fn gzip_single_file_extracted() {
        let id = make_id("2401.12345");
        let tex = b"\\documentclass{article}\n\\begin{document}Hello\\end{document}";
        let gz = gzip_bytes(tex);
        let ext = extract_tex(&id, &gz).expect("extract");
        assert!(ext.main_file.is_none(), "single gzip has no tar filename");
        assert!(ext.content.contains("\\documentclass"));
    }

    #[test]
    fn classify_src_rejects_decompression_over_cap() {
        let big = vec![b'x'; 10_000];
        let gz = gzip_bytes(&big);
        let err = classify_src(&gz, Some(1_000)).expect_err("over-cap must be rejected");
        assert!(
            matches!(err, FetchError::SourceSchema { .. }),
            "got {err:?}"
        );
    }

    #[test]
    fn classify_src_accepts_decompression_within_cap() {
        let small = vec![b'x'; 500];
        let gz = gzip_bytes(&small);
        let payload = classify_src(&gz, Some(1_000)).expect("within cap");
        assert!(matches!(payload, SrcPayload::SingleFile(_)));
    }

    #[test]
    fn tar_selects_documentclass_file_over_plain() {
        let id = make_id("2401.12345");
        let payload = tar_gzip(&[
            ("paper.tex", b"\\documentclass{article} main content"),
            ("macros.tex", b"\\newcommand{\\foo}{bar}"),
        ]);
        let ext = extract_tex(&id, &payload).expect("extract");
        assert_eq!(ext.main_file.as_deref(), Some("paper.tex"));
        assert!(ext.content.contains("\\documentclass"));
    }

    #[test]
    fn tar_prefers_main_tex_among_documentclass_files() {
        let id = make_id("2401.12345");
        // other.tex is the longer file, so only the main.tex bonus can make
        // main.tex win.
        let payload = tar_gzip(&[
            ("other.tex", b"\\documentclass{article} other content here"),
            ("main.tex", b"\\documentclass{article} main"),
        ]);
        let ext = extract_tex(&id, &payload).expect("extract");
        assert_eq!(
            ext.main_file.as_deref(),
            Some("main.tex"),
            "main.tex bonus must override larger-but-also-documentclass other.tex"
        );
    }

    #[test]
    fn tar_falls_back_to_largest_file_when_no_documentclass() {
        let id = make_id("2401.12345");
        let short = b"\\section{Short}".as_slice();
        let mut long_content = b"\\section{Long} ".to_vec();
        long_content.extend(vec![b'x'; 500]);
        let payload = tar_gzip(&[("short.tex", short), ("long.tex", &long_content)]);
        let ext = extract_tex(&id, &payload).expect("extract");
        assert_eq!(ext.main_file.as_deref(), Some("long.tex"));
    }

    #[test]
    fn tar_with_no_tex_files_is_text_unavailable() {
        let id = make_id("2401.12345");
        let payload = tar_gzip(&[("README.md", b"# Paper"), ("figure.eps", b"%!PS")]);
        let err = extract_tex(&id, &payload).expect_err("should fail");
        assert!(matches!(err, FetchError::TextUnavailable { .. }));
    }

    #[test]
    fn resolve_base_defaults_to_production() {
        // Only meaningful when the override variable is absent from the environment.
        if std::env::var("DOIGET_ARXIV_SRC_BASE").is_err() {
            let u = resolve_arxiv_src_base().expect("resolve");
            assert_eq!(u.as_str(), "https://export.arxiv.org/");
        }
    }

    #[test]
    fn cache_round_trip() {
        let dir = tempfile::tempdir().expect("tempdir");
        let root = camino::Utf8PathBuf::from_path_buf(dir.path().to_path_buf()).expect("utf8");
        let id = make_id("2401.12345");
        let src = make_src(&id);
        assert!(cache_write(&root, &id, &src));
        let read = cache_read(&root, &id).expect("cache hit");
        assert_eq!(read, src);
    }

    #[test]
    fn cache_expired_returns_none() {
        let dir = tempfile::tempdir().expect("tempdir");
        let root = camino::Utf8PathBuf::from_path_buf(dir.path().to_path_buf()).expect("utf8");
        let id = make_id("2401.12345");
        let src = PaperTexSource {
            arxiv_id: id.as_str().to_string(),
            main_file: None,
            tex_source: "test".into(),
            char_count: 4,
            truncated: false,
            retrieved_from: "https://export.arxiv.org/src/2401.12345".into(),
        };
        // Write the entry as if it had been fetched one day past its TTL.
        let past = Utc::now() - Duration::days(TEX_SRC_CACHE_TTL_DAYS + 1);
        assert!(cache_write_at(&root, &id, &src, past));
        assert!(cache_read_at(&root, &id, Utc::now()).is_none());
    }

    #[test]
    fn cache_schema_version_mismatch_returns_none() {
        let dir = tempfile::tempdir().expect("tempdir");
        let root = camino::Utf8PathBuf::from_path_buf(dir.path().to_path_buf()).expect("utf8");
        let id = make_id("2401.12345");
        let src = make_src(&id);
        let bad = serde_json::json!({
            "schema_version": "0.9",
            "fetched_at": Utc::now().to_rfc3339(),
            "ttl_seconds": 86_400 * 7i64,
            "inner": src,
        });
        let path = cache_file(&root, &id);
        std::fs::create_dir_all(path.parent().expect("parent")).expect("mkdir");
        std::fs::write(&path, serde_json::to_vec(&bad).expect("json")).expect("write");
        assert!(
            cache_read_at(&root, &id, Utc::now()).is_none(),
            "stale schema version must be rejected"
        );
    }

    #[test]
    fn sanitize_accepts_normal_relative_paths() {
        assert_eq!(
            sanitize_entry_path("main.tex").map(|p| p.as_str().replace('\\', "/")),
            Some("main.tex".to_string())
        );
        assert_eq!(
            sanitize_entry_path("figs/diagram.png").map(|p| p.as_str().replace('\\', "/")),
            Some("figs/diagram.png".to_string())
        );
        assert_eq!(
            sanitize_entry_path("./a//b.tex").map(|p| p.as_str().replace('\\', "/")),
            Some("a/b.tex".to_string())
        );
    }

    #[test]
    fn sanitize_rejects_parent_traversal() {
        assert_eq!(sanitize_entry_path("../evil.tex"), None);
        assert_eq!(sanitize_entry_path("a/../../etc/passwd"), None);
        assert_eq!(sanitize_entry_path("sub/../x"), None);
    }

    #[test]
    fn sanitize_rejects_absolute_and_anchored() {
        assert_eq!(sanitize_entry_path("/etc/passwd"), None);
        assert_eq!(sanitize_entry_path("\\windows\\system32"), None);
        assert_eq!(sanitize_entry_path("C:\\Windows\\evil"), None);
        assert_eq!(sanitize_entry_path("C:/Windows/evil"), None);
    }

    #[test]
    fn sanitize_rejects_backslash_traversal_cross_platform() {
        assert_eq!(sanitize_entry_path("..\\..\\evil"), None);
        assert_eq!(sanitize_entry_path("a\\..\\..\\b"), None);
    }

    #[test]
    fn sanitize_rejects_empty_nul_dot_and_colon() {
        assert_eq!(sanitize_entry_path(""), None);
        assert_eq!(sanitize_entry_path("a/\0/b"), None);
        assert_eq!(sanitize_entry_path("."), None);
        assert_eq!(sanitize_entry_path("a:b/c"), None);
        assert_eq!(sanitize_entry_path("foo/../../bar"), None);
        assert_eq!(sanitize_entry_path("./.."), None);
        assert_eq!(sanitize_entry_path("///"), None);
        assert_eq!(sanitize_entry_path("\\\\"), None);
        assert_eq!(sanitize_entry_path("C:evil"), None);
    }

    #[test]
    fn is_figure_matches_allowlist_case_insensitively() {
        for f in ["fig.png", "a/b.EPS", "plot.Pdf", "x.svg", "y.JPEG"] {
            assert!(is_figure(Utf8Path::new(f)), "{f} should be a figure");
        }
        for nf in ["main.tex", "refs.bib", "macros.sty", "README"] {
            assert!(!is_figure(Utf8Path::new(nf)), "{nf} should NOT be a figure");
        }
    }

    #[test]
    fn extract_bundle_all_returns_every_regular_file() {
        let id = make_id("2401.12345");
        let payload = tar_gzip(&[
            ("paper.tex", b"\\documentclass{article}"),
            ("refs.bib", b"@article{x,title={t}}"),
            ("figs/plot.png", b"\x89PNG\r\n"),
        ]);
        let files = extract_bundle(&id, &payload, BundleFilter::All).expect("bundle");
        let mut names: Vec<String> = files
            .iter()
            .map(|f| f.path.as_str().replace('\\', "/"))
            .collect();
        names.sort();
        assert_eq!(names, vec!["figs/plot.png", "paper.tex", "refs.bib"]);
        assert!(files
            .iter()
            .all(|f| !f.path.as_str().starts_with('/') && !f.path.as_str().contains("..")));
    }

    #[test]
    fn extract_bundle_figures_only_keeps_images() {
        let id = make_id("2401.12345");
        let payload = tar_gzip(&[
            ("paper.tex", b"\\documentclass{article}"),
            ("refs.bib", b"@article{x}"),
            ("figs/plot.png", b"\x89PNG"),
            ("diagram.eps", b"%!PS"),
        ]);
        let files = extract_bundle(&id, &payload, BundleFilter::FiguresOnly).expect("figs");
        let mut names: Vec<String> = files
            .iter()
            .map(|f| f.path.as_str().replace('\\', "/"))
            .collect();
        names.sort();
        assert_eq!(names, vec!["diagram.eps", "figs/plot.png"]);
    }

    #[test]
    fn extract_bundle_pdf_only_is_source_unavailable() {
        let id = make_id("2401.12345");
        let err = extract_bundle(&id, b"%PDF-1.5 x", BundleFilter::All).expect_err("pdf-only");
        assert!(matches!(err, FetchError::SourceUnavailable { .. }));
    }

    #[test]
    fn extract_bundle_bare_file_is_source_unavailable() {
        let id = make_id("2401.12345");
        let err = extract_bundle(&id, b"\\documentclass{article}\nhi", BundleFilter::All)
            .expect_err("bare file is not a bundle");
        assert!(matches!(err, FetchError::SourceUnavailable { .. }));
    }

    #[test]
    fn extract_bundle_figures_only_none_present_is_source_unavailable() {
        let id = make_id("2401.12345");
        let payload = tar_gzip(&[("paper.tex", b"\\documentclass{article}")]);
        let err = extract_bundle(&id, &payload, BundleFilter::FiguresOnly).expect_err("no figures");
        assert!(matches!(err, FetchError::SourceUnavailable { .. }));
    }

    #[test]
    fn extract_bundle_drops_traversal_entry_via_sanitizer() {
        // Hand-rolls one ustar header block plus padded data, so that a `..`
        // path can be placed in the archive exactly as written.
        fn ustar_block(name: &str, data: &[u8]) -> Vec<u8> {
            let mut h = vec![0u8; 512];
            h[..name.len()].copy_from_slice(name.as_bytes());
            h[100..108].copy_from_slice(b"0000644\0");
            h[108..116].copy_from_slice(b"0000000\0");
            h[116..124].copy_from_slice(b"0000000\0");
            h[124..136].copy_from_slice(format!("{:011o}\0", data.len()).as_bytes());
            h[136..148].copy_from_slice(b"00000000000\0");
            // The checksum is computed with its own 8-byte field set to spaces.
            // `fill` cannot mismatch the field width the way a literal can.
            h[148..156].fill(b' ');
            h[156] = b'0';
            h[257..263].copy_from_slice(b"ustar\0");
            h[263..265].copy_from_slice(b"00");
            let sum: u32 = h.iter().map(|&b| u32::from(b)).sum();
            h[148..156].copy_from_slice(format!("{sum:06o}\0 ").as_bytes());
            h.extend_from_slice(data);
            let pad = (512 - data.len() % 512) % 512;
            h.resize(h.len() + pad, 0u8);
            h
        }

        let id = make_id("2401.12345");
        let mut tar = ustar_block("../evil.tex", b"evil");
        tar.extend(ustar_block("safe.tex", b"\\documentclass{article}"));
        // Two zero blocks mark the end of the archive.
        tar.resize(tar.len() + 1024, 0u8);
        let gz = gzip_bytes(&tar);
        let files = extract_bundle(&id, &gz, BundleFilter::All).expect("bundle");
        let names: Vec<String> = files
            .iter()
            .map(|f| f.path.as_str().replace('\\', "/"))
            .collect();
        assert!(
            names.iter().all(|n| !n.contains("..")),
            "traversal entry must be rejected; got {names:?}"
        );
        assert!(
            names.iter().any(|n| n == "safe.tex"),
            "benign sibling must survive; got {names:?}"
        );
    }
}