use std::sync::LazyLock;
use anyhow::{Result, anyhow};
use serde_json::Value as Json;
use crate::config::ScholarlyConfig;
use crate::sources::BibEntry;
/// Bibliographic record for a single work returned by one of the scholarly back ends.
///
/// Textual fields use the empty string (and `authors` the empty vector) to mean
/// "not available"; consumers in this module test with `is_empty()` rather than `Option`.
#[derive(Debug, Clone)]
pub(super) struct Paper {
    /// Tag of the back end that produced the record; the parsers in this module set
    /// `"openalex"` or `"arxiv"`.
    pub source: &'static str,
    /// Identifier within `source`: the OpenAlex work id with its `https://openalex.org/`
    /// prefix removed, or the part of the arXiv id URL after `/abs/`.
    pub id: String,
    /// DOI without a leading `https://doi.org/`.
    pub doi: String,
    /// Title of the work.
    pub title: String,
    /// Author display names, in the order the back end listed them.
    pub authors: Vec<String>,
    /// Publication year as text.
    pub year: String,
    /// Full abstract; shortened only when rendered or exported.
    pub abstract_: String,
    /// Landing page for the work.
    pub url: String,
}
impl Paper {
    /// Short identifier used when citing this paper: `doi:<doi>` when a DOI is known,
    /// otherwise `<source>:<id>`.
    pub(super) fn cite_detail(&self) -> String {
        if self.doi.is_empty() {
            return format!("{}:{}", self.source, self.id);
        }
        format!("doi:{}", self.doi)
    }

    /// Derives a citation key of the form `<surname><year>`, lower-cased and reduced to
    /// ASCII letters and digits.
    ///
    /// The "surname" is the last whitespace-separated word of the first author, or the
    /// source tag when there is no usable author. If nothing survives the ASCII filter the
    /// key falls back to `<source>-<id>`.
    fn cite_key(&self) -> String {
        let lead = match self.authors.first() {
            Some(name) => name.split_whitespace().last().unwrap_or(self.source),
            None => self.source,
        };
        let mut key = String::new();
        for ch in lead.chars().chain(self.year.chars()) {
            // Only ASCII survives the filter, so ASCII lower-casing is sufficient here.
            if ch.is_ascii_alphanumeric() {
                key.push(ch.to_ascii_lowercase());
            }
        }
        if key.is_empty() {
            key = format!("{}-{}", self.source, self.id);
        }
        key
    }

    /// Converts the record into a [`BibEntry`] of type `article`.
    ///
    /// `doi`, `url` and `abstract_` are set only when non-empty; the abstract is cut to
    /// 2000 characters. `note` always records `<source> <id>`, and every field not listed
    /// here keeps its `Default` value.
    pub(super) fn to_bibentry(&self) -> BibEntry {
        let non_empty = |text: &str| {
            if text.is_empty() {
                None
            } else {
                Some(text.to_string())
            }
        };
        let abstract_ = if self.abstract_.is_empty() {
            None
        } else {
            Some(truncate(&self.abstract_, 2000))
        };
        BibEntry {
            key: self.cite_key(),
            entry_type: String::from("article"),
            author: self.authors.join(" and "),
            title: self.title.clone(),
            year: self.year.clone(),
            doi: non_empty(&self.doi),
            url: non_empty(&self.url),
            note: Some(format!("{} {}", self.source, self.id)),
            abstract_,
            ..Default::default()
        }
    }
}
/// Reports whether scholarly lookups are switched on.
///
/// Returns the `enabled` flag of `cfg` as-is; no other setting is consulted.
pub(super) fn available(cfg: &ScholarlyConfig) -> bool {
cfg.enabled
}
/// Looks up the top OpenAlex match for `query`.
///
/// Issues a single-result `search` request against `https://api.openalex.org/works`.
/// When `cfg.mailto` is not blank, its trimmed value is sent as the `mailto` parameter.
///
/// # Errors
///
/// Fails when the HTTP client cannot be built, the request cannot be sent, the server
/// answers with a non-success status, the body is not valid JSON, or the response holds
/// no usable work.
pub(super) async fn openalex(cfg: ScholarlyConfig, query: String) -> Result<Paper> {
    let mailto = cfg.mailto.trim();
    let mut params: Vec<(&str, &str)> = vec![("search", query.as_str()), ("per_page", "1")];
    if !mailto.is_empty() {
        params.push(("mailto", mailto));
    }
    let json: Json = client()?
        .get("https://api.openalex.org/works")
        .query(&params)
        .send()
        .await
        .map_err(|e| anyhow!("openalex request: {e}"))?
        // Without this check a 4xx/5xx reply (rate limiting, outage) would be reported as
        // a decode failure or as "no result", hiding the real cause.
        .error_for_status()
        .map_err(|e| anyhow!("openalex status: {e}"))?
        .json()
        .await
        .map_err(|e| anyhow!("openalex decode: {e}"))?;
    parse_openalex(&json).ok_or_else(|| anyhow!("no OpenAlex result for `{query}`"))
}
fn parse_openalex(json: &Json) -> Option<Paper> {
let w = json.get("results")?.as_array()?.first()?;
let id = strip_prefix_url(w.get("id")?.as_str()?, "https://openalex.org/");
let doi = w
.get("doi")
.and_then(|d| d.as_str())
.map(|d| strip_prefix_url(d, "https://doi.org/"))
.unwrap_or_default();
let title = w
.get("display_name")
.or_else(|| w.get("title"))
.and_then(|t| t.as_str())
.unwrap_or("")
.to_string();
let year = w.get("publication_year").and_then(|y| y.as_i64()).map(|y| y.to_string()).unwrap_or_default();
let authors: Vec<String> = w
.get("authorships")
.and_then(|a| a.as_array())
.map(|arr| {
arr.iter()
.filter_map(|a| a.get("author")?.get("display_name")?.as_str().map(str::to_string))
.collect()
})
.unwrap_or_default();
let abstract_ = w
.get("abstract_inverted_index")
.filter(|v| !v.is_null())
.map(reconstruct_abstract)
.unwrap_or_default();
let url = w
.get("primary_location")
.and_then(|l| l.get("landing_page_url"))
.and_then(|u| u.as_str())
.map(str::to_string)
.unwrap_or_else(|| {
if !doi.is_empty() { format!("https://doi.org/{doi}") } else { format!("https://openalex.org/{id}") }
});
Some(Paper { source: "openalex", id, doi, title, authors, year, abstract_, url })
}
fn reconstruct_abstract(inv: &Json) -> String {
let Some(obj) = inv.as_object() else { return String::new() };
let mut positioned: Vec<(u64, &str)> = Vec::new();
for (word, positions) in obj {
if let Some(arr) = positions.as_array() {
for p in arr {
if let Some(i) = p.as_u64() {
positioned.push((i, word.as_str()));
}
}
}
}
positioned.sort_by_key(|(i, _)| *i);
positioned.iter().map(|(_, w)| *w).collect::<Vec<_>>().join(" ")
}
/// Looks up the top arXiv match for `query`.
///
/// Sends `search_query=all:<query>` with `max_results=1` to
/// `https://export.arxiv.org/api/query` and parses the returned Atom feed.
///
/// # Errors
///
/// Fails when the HTTP client cannot be built, the request cannot be sent, the server
/// answers with a non-success status, the body cannot be read as text, or the feed
/// contains no entry with a title.
pub(super) async fn arxiv(query: String) -> Result<Paper> {
    let search = format!("all:{query}");
    let xml = client()?
        .get("https://export.arxiv.org/api/query")
        .query(&[("search_query", search.as_str()), ("max_results", "1")])
        .send()
        .await
        .map_err(|e| anyhow!("arxiv request: {e}"))?
        // Without this check an error page from a 4xx/5xx reply would be scanned as if it
        // were a feed and reported as "no result", hiding the real cause.
        .error_for_status()
        .map_err(|e| anyhow!("arxiv status: {e}"))?
        .text()
        .await
        .map_err(|e| anyhow!("arxiv decode: {e}"))?;
    parse_arxiv_atom(&xml).ok_or_else(|| anyhow!("no arXiv result for `{query}`"))
}
// Lazily compiled patterns used to pull fields out of an Atom feed without a full XML
// parser. `(?s)` lets `.` cross line breaks, so element text may span several lines, and
// the lazy `.*?` stops at the first matching closing tag. Each pattern is a fixed
// literal, which is why construction is unwrapped.

/// Captures the content between `<entry>` and `</entry>`.
static ENTRY: LazyLock<regex::Regex> =
LazyLock::new(|| regex::Regex::new(r"(?s)<entry>(.*?)</entry>").unwrap());
/// Captures the content of a `<title>` element.
static TAG_TITLE: LazyLock<regex::Regex> =
LazyLock::new(|| regex::Regex::new(r"(?s)<title>(.*?)</title>").unwrap());
/// Captures the content of a `<summary>` element.
static TAG_SUMMARY: LazyLock<regex::Regex> =
LazyLock::new(|| regex::Regex::new(r"(?s)<summary>(.*?)</summary>").unwrap());
/// Captures the content of a `<published>` element.
static TAG_PUBLISHED: LazyLock<regex::Regex> =
LazyLock::new(|| regex::Regex::new(r"(?s)<published>(.*?)</published>").unwrap());
/// Captures the content of an `<id>` element.
static TAG_ID: LazyLock<regex::Regex> =
LazyLock::new(|| regex::Regex::new(r"(?s)<id>(.*?)</id>").unwrap());
/// Captures the content of a `<name>` element.
static TAG_NAME: LazyLock<regex::Regex> =
LazyLock::new(|| regex::Regex::new(r"(?s)<name>(.*?)</name>").unwrap());
/// Captures the content of an `<arxiv:doi>` element; `[^>]*` tolerates attributes on the
/// opening tag.
static TAG_DOI: LazyLock<regex::Regex> =
LazyLock::new(|| regex::Regex::new(r"(?s)<arxiv:doi[^>]*>(.*?)</arxiv:doi>").unwrap());
/// Matches a run of one or more whitespace characters.
static WS: LazyLock<regex::Regex> = LazyLock::new(|| regex::Regex::new(r"\s+").unwrap());
/// Builds a [`Paper`] from the first `<entry>` of an arXiv Atom feed.
///
/// Returns `None` when the feed has no `<entry>` or the entry's title is missing or
/// empty after cleaning. All other fields default to empty when their element is absent.
/// The year is the first four characters of `<published>`, and the id is whatever
/// follows the last `/abs/` in the entry's `<id>` URL (the whole value if there is none).
fn parse_arxiv_atom(xml: &str) -> Option<Paper> {
    let body = ENTRY.captures(xml)?.get(1)?.as_str();
    // The title decides whether the entry is usable, so settle that first.
    let title = cap(&TAG_TITLE, body).filter(|t| !t.is_empty())?;

    let url = cap(&TAG_ID, body).unwrap_or_default();
    let id = match url.rsplit_once("/abs/") {
        Some((_, tail)) => tail.to_string(),
        None => url.clone(),
    };
    let year: String = cap(&TAG_PUBLISHED, body)
        .unwrap_or_default()
        .chars()
        .take(4)
        .collect();

    let mut authors = Vec::new();
    for caps in TAG_NAME.captures_iter(body) {
        if let Some(name) = caps.get(1) {
            authors.push(clean(name.as_str()));
        }
    }

    Some(Paper {
        source: "arxiv",
        id,
        doi: cap(&TAG_DOI, body).unwrap_or_default(),
        title,
        authors,
        year,
        abstract_: cap(&TAG_SUMMARY, body).unwrap_or_default(),
        url,
    })
}
fn cap(re: ®ex::Regex, s: &str) -> Option<String> {
re.captures(s).and_then(|c| c.get(1)).map(|m| clean(m.as_str()))
}
fn clean(s: &str) -> String {
let d = s
.replace("&", "&")
.replace("<", "<")
.replace(">", ">")
.replace(""", "\"")
.replace("'", "'")
.replace("'", "'");
WS.replace_all(&d, " ").trim().to_string()
}
/// Builds the HTTP client used for outgoing requests, identifying itself with the
/// project's user-agent string. A new client is constructed on every call.
///
/// # Errors
///
/// Returns an error prefixed with `http client:` when the builder fails.
fn client() -> Result<reqwest::Client> {
    let builder = reqwest::Client::builder()
        .user_agent("inkhaven-research/1.0 (https://crates.io/crates/inkhaven)");
    match builder.build() {
        Ok(http) => Ok(http),
        Err(e) => Err(anyhow!("http client: {e}")),
    }
}
/// Returns `s` without a leading `prefix`, or `s` unchanged when it does not start with
/// `prefix`. Only one leading occurrence is removed.
fn strip_prefix_url(s: &str, prefix: &str) -> String {
    match s.strip_prefix(prefix) {
        Some(rest) => rest.to_owned(),
        None => s.to_owned(),
    }
}
/// Limits `s` to at most `max` characters (Unicode scalar values, not bytes).
///
/// Text that already fits is returned unchanged; longer text is cut after `max`
/// characters and an ellipsis (`…`) is appended.
fn truncate(s: &str, max: usize) -> String {
    // The character at index `max`, if any, is the first one that does not fit; its byte
    // offset is exactly where the kept prefix ends.
    match s.char_indices().nth(max) {
        None => s.to_owned(),
        Some((cut, _)) => {
            let mut clipped = String::with_capacity(cut + '…'.len_utf8());
            clipped.push_str(&s[..cut]);
            clipped.push('…');
            clipped
        }
    }
}
/// Formats a paper as a short plain-text summary.
///
/// Layout, one item per line: title; authors (or `unknown authors`) followed by
/// ` · <year>` when the year is known; the citation identifier from
/// [`Paper::cite_detail`]; then, if an abstract exists, a blank line and the abstract cut
/// to 1200 characters; finally a blank line and `Source: <source> · <url>`. The result
/// has no trailing newline.
pub(super) fn render(p: &Paper) -> String {
    let who = if p.authors.is_empty() {
        "unknown authors".to_string()
    } else {
        p.authors.join(", ")
    };
    let when = if p.year.is_empty() {
        String::new()
    } else {
        format!(" · {}", p.year)
    };
    // Use the shared helper so the identifier shown here cannot drift from the one used
    // for citations.
    let mut out = format!("{}\n{who}{when}\n{}\n", p.title, p.cite_detail());
    if !p.abstract_.is_empty() {
        out.push('\n');
        out.push_str(&truncate(&p.abstract_, 1200));
        out.push('\n');
    }
    out.push_str("\nSource: ");
    out.push_str(p.source);
    out.push_str(" · ");
    out.push_str(&p.url);
    out
}
// Unit tests for the pure parsing and rendering helpers; none of them perform network I/O.
#[cfg(test)]
mod tests {
use super::*;
// Parses a minimal OpenAlex search response and checks prefix stripping on the id and
// DOI, abstract reconstruction from the inverted index, and the derived bibliography entry.
#[test]
fn openalex_parse_and_abstract() {
let j = serde_json::json!({"results":[{
"id":"https://openalex.org/W123","doi":"https://doi.org/10.1/x",
"display_name":"Roman Aqueducts","publication_year":2012,
"authorships":[{"author":{"display_name":"Jane Roe"}},{"author":{"display_name":"John Doe"}}],
"abstract_inverted_index":{"Water":[0],"flows":[1],"downhill":[2]},
"primary_location":{"landing_page_url":"https://example.org/a"}
}]});
let p = parse_openalex(&j).unwrap();
assert_eq!(p.id, "W123");
assert_eq!(p.doi, "10.1/x");
assert_eq!(p.year, "2012");
assert_eq!(p.authors, vec!["Jane Roe", "John Doe"]);
assert_eq!(p.abstract_, "Water flows downhill");
assert_eq!(p.cite_detail(), "doi:10.1/x");
let b = p.to_bibentry();
// Key is the first author's last word plus the year, lower-cased.
assert_eq!(b.key, "roe2012");
assert_eq!(b.author, "Jane Roe and John Doe");
assert_eq!(b.doi.as_deref(), Some("10.1/x"));
assert!(b.is_valid());
}
// Parses a one-entry Atom feed. The feed-level <title> sits outside <entry> and must not
// be mistaken for the paper title; with no DOI element the citation falls back to the
// arXiv id.
#[test]
fn arxiv_atom_parse() {
let xml = r#"<feed><title>ArXiv Query</title>
<entry>
<id>http://arxiv.org/abs/1706.03762v5</id>
<published>2017-06-12T17:57:34Z</published>
<title>Attention Is All You Need</title>
<summary>The dominant sequence models use recurrence.</summary>
<author><name>Ashish Vaswani</name></author>
<author><name>Noam Shazeer</name></author>
</entry></feed>"#;
let p = parse_arxiv_atom(xml).unwrap();
assert_eq!(p.source, "arxiv");
assert_eq!(p.id, "1706.03762v5");
assert_eq!(p.year, "2017");
assert_eq!(p.title, "Attention Is All You Need");
assert_eq!(p.authors, vec!["Ashish Vaswani", "Noam Shazeer"]);
assert!(p.abstract_.contains("recurrence"));
assert_eq!(p.cite_detail(), "arxiv:1706.03762v5");
assert_eq!(p.to_bibentry().key, "vaswani2017");
}
// Rendering a paper without a DOI must show the `<source>:<id>` identifier and the
// trailing source line.
#[test]
fn render_has_identifier_and_source() {
let p = Paper {
source: "arxiv", id: "1.2".into(), doi: String::new(), title: "T".into(),
authors: vec!["A B".into()], year: "2020".into(), abstract_: "x".into(),
url: "http://a".into(),
};
let r = render(&p);
assert!(r.contains("arxiv:1.2"));
assert!(r.contains("Source: arxiv"));
}
}