use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use super::http::{HttpClient, HttpError};
/// Cache root directory name. Not referenced elsewhere in this file
/// (presumably the default root handed to `CachedHttpClient::new` — TODO confirm).
pub const DEFAULT_CACHE_DIR: &str = "data";
/// Relative path of the seed API-cache directory. Unused in this file
/// (presumably where the bundled seed files live on disk — TODO confirm).
pub const SEED_CACHE_DIR: &str = "data/seed/api-cache";
/// Upper bound on the line count of a single seed file; checked by the
/// `seed_files_stay_under_per_file_line_cap` test in this module.
pub const MAX_SEED_LINES_PER_FILE: usize = 1500;
/// NOTE(review): unused in this file; judging by the name, a per-bucket cap on
/// seed records that is enforced elsewhere — confirm against the seed tooling.
pub const MAX_SEED_RECORDS_PER_BUCKET: usize = 128;
/// HTTP client wrapper that answers requests from bundled seed data or an
/// on-disk cache before falling back to the wrapped transport `T`.
pub struct CachedHttpClient<T: HttpClient> {
/// Root directory under which the cached `.body` / `.url` files are stored.
cache_dir: PathBuf,
/// Underlying client used to fetch a response on a cache miss.
transport: T,
/// When `false`, a cache miss is reported as an error instead of calling `transport`.
online: bool,
}
impl<T: HttpClient> CachedHttpClient<T> {
pub fn new(cache_dir: impl Into<PathBuf>, transport: T) -> Self {
Self {
cache_dir: cache_dir.into(),
transport,
online: live_api_enabled(),
}
}
#[must_use]
#[allow(dead_code)]
pub const fn with_online(mut self, online: bool) -> Self {
self.online = online;
self
}
#[must_use]
#[allow(dead_code)]
pub fn cache_dir(&self) -> &Path {
&self.cache_dir
}
#[must_use]
#[allow(dead_code)]
pub const fn is_online(&self) -> bool {
self.online
}
fn cache_paths(&self, url: &str) -> (PathBuf, PathBuf) {
let location = cache_location(url);
let mut body = self.cache_dir.clone();
body.push(&location.directory);
body.push(format!("{}.body", location.stem));
let mut meta = self.cache_dir.clone();
meta.push(&location.directory);
meta.push(format!("{}.url", location.stem));
(body, meta)
}
}
/// Relative on-disk location of a cached response, as computed by `cache_location`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CacheLocation {
/// Bucket directory relative to the cache root (for example `http-cache/misc`).
pub directory: PathBuf,
/// File stem; `CachedHttpClient::cache_paths` appends `.body` / `.url` to it.
pub stem: String,
}
/// Maps `url` to its cache bucket and file stem.
///
/// Classifiers are tried in order: Wiktionary first, then Wikidata. A URL that
/// neither recognises lands in `http-cache/misc`, keyed by `cache_key(url)`.
#[must_use]
pub fn cache_location(url: &str) -> CacheLocation {
    classify_wiktionary(url)
        .or_else(|| classify_wikidata(url))
        .unwrap_or_else(|| CacheLocation {
            directory: PathBuf::from("http-cache").join("misc"),
            stem: cache_key(url),
        })
}
/// Recognises `<lang>.wiktionary.org` URLs and maps them to
/// `wiktionary-cache/<lang>/<page>`.
///
/// Returns `None` when the URL has no `://`, when its authority (the text
/// between `://` and the first `/`) contains no `.`, or when the text after
/// the first label does not start with `wiktionary.org`. The stem is the
/// sanitized `page=` query value when present, otherwise `cache_key(url)`.
fn classify_wiktionary(url: &str) -> Option<CacheLocation> {
    let host_start = url.find("://")? + 3;
    let after_scheme = &url[host_start..];
    let host_end = after_scheme.find('/').unwrap_or(after_scheme.len());
    // Look for the dot inside the authority only. Searching the whole remainder
    // could find a dot in the path (e.g. `http://localhost/a.json`) and produce
    // an inverted slice range, which panics.
    let authority = &after_scheme[..host_end];
    let (lang_label, domain) = authority.split_once('.')?;
    if !domain.starts_with("wiktionary.org") {
        return None;
    }
    let lang = sanitize_segment(lang_label);
    let page = wiktionary_page_from_url(url).unwrap_or_else(|| cache_key(url));
    Some(CacheLocation {
        directory: PathBuf::from("wiktionary-cache").join(lang),
        stem: page,
    })
}
fn wiktionary_page_from_url(url: &str) -> Option<String> {
let query_start = url.find('?')?;
let query = &url[query_start + 1..];
for pair in query.split('&') {
if let Some(value) = pair.strip_prefix("page=") {
let decoded = percent_decode(value);
if !decoded.is_empty() {
return Some(sanitize_segment(&decoded));
}
}
}
None
}
fn classify_wikidata(url: &str) -> Option<CacheLocation> {
if !url.contains("wikidata.org") {
return None;
}
let query_start = url.find('?')?;
let query = &url[query_start + 1..];
let mut action: Option<String> = None;
let mut srsearch: Option<String> = None;
let mut ids: Option<String> = None;
let mut sparql: Option<String> = None;
let mut titles: Option<String> = None;
let mut search_term: Option<String> = None;
for pair in query.split('&') {
if let Some(value) = pair.strip_prefix("action=") {
action = Some(percent_decode(value));
} else if let Some(value) = pair.strip_prefix("srsearch=") {
srsearch = Some(percent_decode(value));
} else if let Some(value) = pair.strip_prefix("search=") {
search_term = Some(percent_decode(value));
} else if let Some(value) = pair.strip_prefix("ids=") {
ids = Some(percent_decode(value));
} else if let Some(value) = pair.strip_prefix("query=") {
sparql = Some(percent_decode(value));
} else if let Some(value) = pair.strip_prefix("titles=") {
titles = Some(percent_decode(value));
}
}
if sparql.is_some() || url.contains("/sparql") || url.contains("query.wikidata.org") {
return Some(CacheLocation {
directory: PathBuf::from("wikidata-cache").join("sparql"),
stem: cache_key(url),
});
}
let stem = match action.as_deref() {
Some("wbsearchentities") => srsearch
.as_deref()
.or(search_term.as_deref())
.map_or_else(|| cache_key(url), sanitize_segment),
Some("wbgetentities" | "query") => ids
.as_deref()
.or(titles.as_deref())
.map_or_else(|| cache_key(url), sanitize_segment),
_ => cache_key(url),
};
let sub = match action.as_deref() {
Some("wbsearchentities") => "search",
Some("wbgetentities") => "entities",
Some("query") => "query",
_ => "misc",
};
Some(CacheLocation {
directory: PathBuf::from("wikidata-cache").join(sub),
stem,
})
}
/// Turns an arbitrary string into a single path-segment-safe string.
///
/// Alphanumeric characters (Unicode-aware) and `-`, `_`, `.` are kept; spaces
/// and `+` become `_`; everything else becomes `-`. Results longer than 96
/// bytes are cut to at most 96 bytes and suffixed with `~` plus the first 8
/// hex digits of `cache_key(value)`, so long inputs that share a prefix still
/// get distinct names. An empty input yields `cache_key(value)`.
fn sanitize_segment(value: &str) -> String {
    const MAX_BYTES: usize = 96;

    let mut out: String = value
        .chars()
        .map(|ch| match ch {
            '-' | '_' | '.' => ch,
            ' ' | '+' => '_',
            _ if ch.is_alphanumeric() => ch,
            _ => '-',
        })
        .collect();

    if out.len() > MAX_BYTES {
        // `String::truncate` panics when the cut is not on a char boundary, and
        // `out` may hold multi-byte letters, so back up to the nearest boundary.
        // Index 0 is always a boundary, which bounds the loop.
        let mut cut = MAX_BYTES;
        while !out.is_char_boundary(cut) {
            cut -= 1;
        }
        out.truncate(cut);
        out.push('~');
        out.push_str(&cache_key(value)[..8]);
    }

    if out.is_empty() {
        cache_key(value)
    } else {
        out
    }
}
/// Decodes a URL query value: `%XX` escapes become the byte `XX`, and `+`
/// becomes a space.
///
/// A `%` that is not followed by two hex digits is kept literally. Decoded
/// bytes that are not valid UTF-8 are replaced with U+FFFD.
fn percent_decode(value: &str) -> String {
    let mut rest = value.as_bytes();
    let mut decoded: Vec<u8> = Vec::with_capacity(rest.len());
    while let Some((&first, tail)) = rest.split_first() {
        if first == b'%' {
            if let [hi_digit, lo_digit, ..] = tail {
                if let (Some(hi), Some(lo)) = (hex_nibble(*hi_digit), hex_nibble(*lo_digit)) {
                    decoded.push((hi << 4) | lo);
                    rest = &tail[2..];
                    continue;
                }
            }
        }
        decoded.push(if first == b'+' { b' ' } else { first });
        rest = tail;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
/// Returns the value (0–15) of an ASCII hexadecimal digit, accepting both
/// upper and lower case, or `None` for any other byte.
const fn hex_nibble(byte: u8) -> Option<u8> {
    if byte.is_ascii_digit() {
        return Some(byte - b'0');
    }
    // Fold `A`–`F` onto `a`–`f`; every other byte is left as it is.
    let lower = byte.to_ascii_lowercase();
    if matches!(lower, b'a'..=b'f') {
        Some(lower - b'a' + 10)
    } else {
        None
    }
}
/// Hashes `url` into a 16-digit lowercase hex string.
///
/// The hash is 64-bit FNV-1a over the UTF-8 bytes, so it is deterministic
/// across runs and platforms. It is not collision-resistant.
#[must_use]
pub fn cache_key(url: &str) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01B3;

    let digest = url.bytes().fold(FNV_OFFSET_BASIS, |acc, byte| {
        (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
    });
    format!("{digest:016x}")
}
/// Reports whether the `FORMAL_AI_LIVE_API` environment variable is set to a
/// truthy value: `1`, `true`, `yes` or `on`, ignoring ASCII case and
/// surrounding whitespace. Unset or unreadable values count as `false`.
fn live_api_enabled() -> bool {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    match std::env::var("FORMAL_AI_LIVE_API") {
        Ok(raw) => TRUTHY.contains(&raw.trim().to_ascii_lowercase().as_str()),
        Err(_) => false,
    }
}
impl<T: HttpClient> HttpClient for CachedHttpClient<T> {
    /// Resolves `url` through three layers, in order:
    ///
    /// 1. the bundled seed index (`seed_response`);
    /// 2. the on-disk cache body file;
    /// 3. the wrapped transport, but only when the client is online. The
    ///    fetched body and a `.url` marker file are written to the cache
    ///    before the body is returned.
    ///
    /// # Errors
    ///
    /// Returns `HttpError::Transport` on a cache miss in offline mode and when
    /// the cache directory or either cache file cannot be written, even if the
    /// fetch itself succeeded. Errors from the wrapped transport are passed
    /// through unchanged.
    fn get(&self, url: &str) -> Result<String, HttpError> {
        if let Some(seeded) = seed_response(url) {
            return Ok(seeded);
        }

        let (body_path, meta_path) = self.cache_paths(url);
        // Any read failure (missing file, invalid UTF-8, ...) is treated as a miss.
        if let Ok(cached) = fs::read_to_string(&body_path) {
            return Ok(cached);
        }

        if !self.online {
            return Err(HttpError::Transport(format!(
                "translation cache miss for {url} and offline mode is active; \
                 set FORMAL_AI_LIVE_API=1 to fetch and populate the cache",
            )));
        }

        let fetched = self.transport.get(url)?;

        let bucket = body_path.parent().unwrap_or(&self.cache_dir);
        fs::create_dir_all(bucket).map_err(|error| {
            HttpError::Transport(format!(
                "failed to create cache directory {}: {error}",
                bucket.display(),
            ))
        })?;
        fs::write(&body_path, &fetched).map_err(|error| {
            HttpError::Transport(format!(
                "failed to write cache body {}: {error}",
                body_path.display(),
            ))
        })?;
        fs::write(&meta_path, url).map_err(|error| {
            HttpError::Transport(format!(
                "failed to write cache url marker {}: {error}",
                meta_path.display(),
            ))
        })?;

        Ok(fetched)
    }
}
// Textually includes `seed_bundle_files.rs` from Cargo's `OUT_DIR` (presumably
// written by the crate's build script — confirm). `seed_files` below reads
// `SEED_BUNDLE_FILES`, which is not defined in this file and so is expected to
// come from this include.
include!(concat!(env!("OUT_DIR"), "/seed_bundle_files.rs"));
/// Returns a fresh vector holding every entry of `SEED_BUNDLE_FILES`.
///
/// Callers in this file treat each pair as `(file name, file contents)`.
#[must_use]
pub fn seed_files() -> Vec<(&'static str, &'static str)> {
    SEED_BUNDLE_FILES.iter().copied().collect()
}
/// Returns the bundled seed response body for `url`, if the seed index holds
/// an entry whose key equals `url` exactly. The body is cloned out of the index.
#[must_use]
pub fn seed_response(url: &str) -> Option<String> {
seed_index().get(url).cloned()
}
/// Returns the process-wide URL → body index built from the bundled seed files.
///
/// The index is built once, on first use, and cached for the life of the
/// process. Every `(url, body)` chunk yielded by `parse_seed_chunks` is
/// appended to that URL's entry, so a response split across several records
/// or files is reassembled in the order `seed_files` yields it.
fn seed_index() -> &'static HashMap<String, String> {
    static INDEX: OnceLock<HashMap<String, String>> = OnceLock::new();
    INDEX.get_or_init(|| {
        let mut index: HashMap<String, String> = HashMap::new();
        for (_name, contents) in seed_files() {
            for (url, body) in parse_seed_chunks(contents) {
                // One lookup per chunk; the key is moved in, not cloned.
                index.entry(url).or_default().push_str(&body);
            }
        }
        index
    })
}
/// Parses seed-bundle text into `(url, body)` pairs.
///
/// This is a direct alias for `parse_seed_chunks`: it returns one pair per
/// record and does not merge records that share a URL.
#[must_use]
pub fn parse_seed_bundle(text: &str) -> Vec<(String, String)> {
parse_seed_chunks(text)
}
/// Parses seed-bundle text into one `(url, body)` pair per record.
///
/// A record starts at an unindented line beginning with `response_` and ends
/// at the next unindented line or at end of input. Inside a record, indented
/// `url "..."` lines set the URL (the last one wins) and indented `body "..."`
/// lines are unescaped and concatenated in order. Only leading spaces count as
/// indentation. Blank lines are skipped, indented lines outside a record are
/// ignored, and records that collected no body text are dropped.
#[must_use]
pub fn parse_seed_chunks(text: &str) -> Vec<(String, String)> {
    /// Moves a finished record into `records` unless its body is empty.
    fn emit(record: Option<(String, String)>, records: &mut Vec<(String, String)>) {
        if let Some((url, body)) = record {
            if !body.is_empty() {
                records.push((url, body));
            }
        }
    }

    let mut records: Vec<(String, String)> = Vec::new();
    // `(url, body)` of the record currently being read, if any.
    let mut open: Option<(String, String)> = None;

    for raw_line in text.lines() {
        let line = raw_line.trim_end_matches(['\r', '\n']);
        if line.trim().is_empty() {
            continue;
        }

        let content = line.trim_start_matches(' ');
        let is_top_level = content.len() == line.len();
        if is_top_level {
            // Any unindented line closes the current record; only a
            // `response_` header opens a new one.
            emit(open.take(), &mut records);
            if content.starts_with("response_") {
                open = Some((String::new(), String::new()));
            }
            continue;
        }

        let Some((url, body)) = open.as_mut() else {
            continue;
        };
        if let Some(value) = strip_kv(content, "url") {
            *url = unescape_lino_string(value);
        } else if let Some(value) = strip_kv(content, "body") {
            body.push_str(&unescape_lino_string(value));
        }
    }

    emit(open, &mut records);
    records
}
/// Matches a line of the exact form `<key> "<value>"` and returns `<value>`
/// without unescaping it.
///
/// Exactly one space must separate the key from the opening quote, and the
/// closing quote must be the last character of `content`.
fn strip_kv<'a>(content: &'a str, key: &str) -> Option<&'a str> {
    content
        .strip_prefix(key)
        .and_then(|after_key| after_key.strip_prefix(" \""))
        .and_then(|quoted| quoted.strip_suffix('"'))
}
/// Target number of characters per `body` line written by `write_seed_record`.
/// A line can be longer when the split point is moved past `"` characters.
pub const SEED_BODY_CHUNK_CHARS: usize = 200;
/// Escapes `input` for embedding between double quotes in a seed record by
/// doubling every `"`. All other characters, backslashes included, pass
/// through unchanged.
#[must_use]
pub fn escape_lino_string(input: &str) -> String {
    input.replace('"', "\"\"")
}
/// Reverses `escape_lino_string`: each doubled quote (`""`), scanning left to
/// right without overlap, collapses to a single `"`. A lone quote is kept as is.
#[must_use]
pub fn unescape_lino_string(input: &str) -> String {
    input.replace("\"\"", "\"")
}
/// Splits `body` into consecutive pieces of about `chars` characters each.
///
/// Pieces are measured in `char`s, never bytes, so multi-byte characters are
/// not split. A boundary that would fall directly before one or more `"`
/// characters is moved past them, so no piece after the first starts with a
/// quote; such a piece can therefore be longer than `chars`. Concatenating
/// the pieces gives back `body` exactly.
///
/// A `chars` of `0` is treated as `1`. Previously it looped forever emitting
/// empty pieces. An empty `body` yields an empty vector.
#[must_use]
pub fn split_body_into_chunks(body: &str, chars: usize) -> Vec<String> {
    let step = chars.max(1);
    let symbols: Vec<char> = body.chars().collect();
    let total = symbols.len();

    let mut out: Vec<String> = Vec::new();
    let mut start = 0usize;
    while start < total {
        // Saturate so a huge `chars` cannot overflow the index arithmetic.
        let mut end = start.saturating_add(step).min(total);
        while end < total && symbols[end] == '"' {
            end += 1;
        }
        out.push(symbols[start..end].iter().collect());
        start = end;
    }
    out
}
/// Appends one seed record to `out`.
///
/// The record is a `response_<short_id>` header line, one indented `url "..."`
/// line and one indented `body "..."` line per chunk of `body`, split at
/// `SEED_BODY_CHUNK_CHARS`. Quotes in the URL and body are doubled via
/// `escape_lino_string`. An empty body produces no `body` lines.
///
/// NOTE(review): `short_id` is written verbatim and line breaks inside `url`
/// or `body` are not escaped — presumably callers never pass such input; confirm.
pub fn write_seed_record(out: &mut String, short_id: &str, url: &str, body: &str) {
    let header = format!(
        "response_{short_id}\n url \"{}\"\n",
        escape_lino_string(url)
    );
    out.push_str(&header);

    for piece in split_body_into_chunks(body, SEED_BODY_CHUNK_CHARS) {
        let line = format!(" body \"{}\"\n", escape_lino_string(&piece));
        out.push_str(&line);
    }
}
// Unit tests for cache path classification, the cached client's
// seed -> disk -> transport lookup order, and the seed-bundle text format.
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap as StdHashMap;
use std::sync::Mutex;
// In-memory `HttpClient` stub: serves canned bodies by exact URL and records
// every URL it was asked for.
struct StubHttp {
responses: Mutex<StdHashMap<String, String>>,
calls: Mutex<Vec<String>>,
}
impl StubHttp {
// Builds a stub from `(url, body)` pairs.
fn new(responses: &[(&str, &str)]) -> Self {
Self {
responses: Mutex::new(
responses
.iter()
.map(|(k, v)| ((*k).to_owned(), (*v).to_owned()))
.collect(),
),
calls: Mutex::new(Vec::new()),
}
}
}
impl HttpClient for StubHttp {
// Returns the canned body for `url`, or a 404 `HttpError::Status` when none is registered.
fn get(&self, url: &str) -> Result<String, HttpError> {
self.calls.lock().unwrap().push(url.to_owned());
self.responses
.lock()
.unwrap()
.get(url)
.cloned()
.ok_or_else(|| HttpError::Status {
status: 404,
body: format!("stub had no response for {url}"),
})
}
}
// Creates (best-effort) a directory under the system temp dir whose name mixes
// `slug` with the process id XOR a time-derived value. The directory is not
// removed by these tests.
fn temp_dir(slug: &str) -> PathBuf {
let mut dir = std::env::temp_dir();
dir.push(format!(
"formal-ai-cache-{slug}-{}",
std::process::id() ^ rand_u32()
));
let _ = fs::create_dir_all(&dir);
dir
}
// Not a real RNG: returns the sub-second nanoseconds of the current system
// time, or 0 when the clock reads earlier than the Unix epoch.
fn rand_u32() -> u32 {
use std::time::{SystemTime, UNIX_EPOCH};
u32::try_from(
SystemTime::now()
.duration_since(UNIX_EPOCH)
.map_or(0, |d| u128::from(d.subsec_nanos())),
)
.unwrap_or(0)
}
// Same input hashes to the same key; different inputs hash to different keys.
#[test]
fn cache_key_is_stable_across_runs() {
let one = cache_key("https://example.com/foo");
let two = cache_key("https://example.com/foo");
assert_eq!(one, two);
let other = cache_key("https://example.com/bar");
assert_ne!(one, other);
}
// A body file already on disk is returned while offline with an empty stub,
// so the transport cannot have supplied it.
#[test]
fn cache_hit_short_circuits_transport() {
let dir = temp_dir("hit");
let cache = CachedHttpClient::new(&dir, StubHttp::new(&[])).with_online(false);
let url = "https://example.com/cached";
let (body_path, meta_path) = cache.cache_paths(url);
fs::create_dir_all(body_path.parent().unwrap()).unwrap();
fs::write(&body_path, "cached body").unwrap();
fs::write(&meta_path, url).unwrap();
assert_eq!(cache.get(url).unwrap(), "cached body");
}
// Offline miss must be a `Transport` error that names the opt-in env variable.
#[test]
fn cache_miss_offline_returns_transport_error() {
let dir = temp_dir("offline-miss");
let cache = CachedHttpClient::new(&dir, StubHttp::new(&[])).with_online(false);
let error = cache.get("https://example.com/missing").unwrap_err();
match error {
HttpError::Transport(message) => {
assert!(message.contains("cache miss"), "got: {message}");
assert!(message.contains("FORMAL_AI_LIVE_API"), "got: {message}");
}
other @ HttpError::Status { .. } => {
panic!("expected Transport error, got {other:?}")
}
}
}
// Online miss fetches through the stub and persists the body: a second,
// offline client over the same directory then serves it from disk.
#[test]
fn cache_miss_online_populates_and_returns_body() {
let dir = temp_dir("online-miss");
let url = "https://example.com/foo";
let stub = StubHttp::new(&[(url, "fetched body")]);
let cache = CachedHttpClient::new(&dir, stub).with_online(true);
assert_eq!(cache.get(url).unwrap(), "fetched body");
let again = CachedHttpClient::new(&dir, StubHttp::new(&[])).with_online(false);
assert_eq!(again.get(url).unwrap(), "fetched body");
}
// Unclassified URLs get `.body` / `.url` files under `http-cache/misc`.
#[test]
fn cache_paths_use_semantic_subdirectories() {
let dir = PathBuf::from("cache-root");
let cache = CachedHttpClient::new(&dir, StubHttp::new(&[])).with_online(false);
let (body, meta) = cache.cache_paths("https://example.com/x");
assert!(
body.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("body")),
"got: {}",
body.display()
);
assert!(
meta.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("url")),
"got: {}",
meta.display()
);
let location = cache_location("https://example.com/x");
assert_eq!(location.directory, PathBuf::from("http-cache").join("misc"),);
assert!(!location.stem.is_empty());
}
// Wiktionary URLs are bucketed by language subdomain and keyed by `page=`.
#[test]
fn wiktionary_url_lands_under_per_language_subdirectory() {
let location = cache_location(
"https://en.wiktionary.org/w/api.php?action=parse&page=apple&prop=wikitext&formatversion=2&format=json&redirects=1",
);
assert_eq!(
location.directory,
PathBuf::from("wiktionary-cache").join("en")
);
assert_eq!(location.stem, "apple");
}
// Wikidata entity searches are keyed by the search term.
#[test]
fn wikidata_search_url_keyed_by_search_term() {
let location = cache_location(
"https://www.wikidata.org/w/api.php?action=wbsearchentities&format=json&language=en&type=lexeme&srsearch=apple&limit=3",
);
assert_eq!(
location.directory,
PathBuf::from("wikidata-cache").join("search")
);
assert_eq!(location.stem, "apple");
}
// SPARQL endpoint URLs are keyed by the 16-hex-digit hash of the whole URL.
#[test]
fn wikidata_sparql_url_lands_in_sparql_bucket() {
let location = cache_location(
"https://query.wikidata.org/sparql?format=json&query=SELECT%20%3Flemma%20WHERE%20%7B%20%7D",
);
assert_eq!(
location.directory,
PathBuf::from("wikidata-cache").join("sparql")
);
assert_eq!(location.stem.len(), 16);
}
// A record produced by `write_seed_record` parses back to the same URL and body.
#[test]
fn parse_seed_bundle_round_trips_through_write_seed_record() {
let body = r#"{"parse":{"title":"apple","wikitext":"* Russian: {{t+|ru|яблоко}}"}}"#;
let url = "https://en.wiktionary.org/w/api.php?action=parse&page=apple&prop=wikitext&formatversion=2&format=json&redirects=1";
let mut buf = String::new();
write_seed_record(&mut buf, "wiktionary_en_apple", url, body);
let parsed = parse_seed_bundle(&buf);
assert_eq!(parsed.len(), 1);
assert_eq!(parsed[0].0, url);
assert_eq!(parsed[0].1, body);
}
// Several `body` lines inside one record are joined in order.
#[test]
fn parse_seed_bundle_concatenates_chunked_body() {
let bundle =
"response_chunky\n url \"https://example.org/x\"\n body \"hel\"\n body \"lo\"\n";
let parsed = parse_seed_bundle(bundle);
assert_eq!(parsed.len(), 1);
assert_eq!(parsed[0].1, "hello");
}
// Two records with the same URL stay separate at this layer.
#[test]
fn parse_seed_chunks_yields_one_pair_per_record() {
let bundle = "response_a\n url \"https://example.org/x\"\n body \"hel\"\n\nresponse_b\n url \"https://example.org/x\"\n body \"lo\"\n";
let chunks = parse_seed_chunks(bundle);
assert_eq!(chunks.len(), 2);
assert_eq!(chunks[0].0, "https://example.org/x");
assert_eq!(chunks[0].1, "hel");
assert_eq!(chunks[1].0, "https://example.org/x");
assert_eq!(chunks[1].1, "lo");
}
// `unescape_lino_string` inverts `escape_lino_string` for each sample string.
#[test]
fn escape_round_trips_quotes_backslashes_and_unicode() {
let cases: &[&str] = &[
"",
"plain ascii",
"with \"quotes\"",
"with \\backslash\\",
"{\"wikitext\":\"== {{-ru-}} ==\\n=== {{з|}} ===\"}",
"яблоко 苹果 🍎",
"trailing-quote\"",
"leading-quote: \"abc",
"double\"\"middle",
];
for case in cases {
let escaped = escape_lino_string(case);
let back = unescape_lino_string(&escaped);
assert_eq!(back, *case, "round trip failed for {case:?}");
}
}
// Chunks are measured in characters, so multi-byte text is never cut mid-character.
#[test]
fn split_body_into_chunks_respects_char_boundaries() {
let body = "яблоко 苹果 🍎"; let chunks = split_body_into_chunks(body, 4);
let recombined: String = chunks.concat();
assert_eq!(recombined, body);
for chunk in &chunks {
assert!(chunk.chars().count() <= 4);
}
}
// A quote sitting on a chunk boundary is kept with the preceding chunk.
#[test]
fn split_body_into_chunks_never_starts_chunk_with_quote() {
let body = "aaaa\"bbbb\"cccc";
let chunks = split_body_into_chunks(body, 4);
for (idx, chunk) in chunks.iter().enumerate() {
assert!(
!chunk.starts_with('"'),
"chunk[{idx}] starts with a quote: {chunk:?}",
);
}
let recombined: String = chunks.concat();
assert_eq!(recombined, body);
}
// A written record must be accepted by the external `lino_objects_codec` parser.
#[test]
fn escaped_record_parses_as_links_notation() {
let mut buf = String::new();
write_seed_record(
&mut buf,
"demo",
"https://example.org/q",
r#"{"key":"value with \"escaped\" quotes","arr":[""]}"#,
);
lino_objects_codec::format::parse_indented(buf.trim()).unwrap_or_else(|error| {
panic!("record should be valid Links Notation: {error}\nbuffer:\n{buf}");
});
}
// Every bundled seed file must stay within `MAX_SEED_LINES_PER_FILE` lines.
#[test]
fn seed_files_stay_under_per_file_line_cap() {
for (name, contents) in seed_files() {
let lines = contents.lines().count();
assert!(
lines <= MAX_SEED_LINES_PER_FILE,
"{name} has {lines} lines, exceeds MAX_SEED_LINES_PER_FILE={MAX_SEED_LINES_PER_FILE}",
);
}
}
// A seeded URL resolves with an empty cache dir, offline, and an empty stub.
// The test passes vacuously when the seed index is empty.
#[test]
fn seed_response_is_consulted_before_disk_or_transport() {
let any_url = seed_index().keys().next().cloned();
let Some(url) = any_url else {
return;
};
let dir = temp_dir("seed-precedence");
let cache = CachedHttpClient::new(&dir, StubHttp::new(&[])).with_online(false);
let body = cache.get(&url).expect("seeded response must hit");
assert!(!body.is_empty());
}
}