use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use super::http::{HttpClient, HttpError};
/// Relative directory name `"data"`.
///
/// NOTE(review): not referenced in the code visible here; presumably the default
/// cache root handed to `CachedHttpClient::new` — confirm against callers.
pub const DEFAULT_CACHE_DIR: &str = "data";
/// Relative path `"data/seed/api-cache"`.
///
/// NOTE(review): not referenced in the code visible here; presumably where the
/// seed bundle files live on disk — confirm against the build script.
pub const SEED_CACHE_DIR: &str = "data/seed/api-cache";
/// Line limit for a single seed file.
///
/// NOTE(review): nothing visible here enforces it; presumably consumed by the
/// tooling that writes seed files — verify.
pub const MAX_SEED_LINES_PER_FILE: usize = 1500;
/// Record limit for a single seed bucket.
///
/// NOTE(review): nothing visible here enforces it; presumably consumed by the
/// tooling that writes seed files — verify.
pub const MAX_SEED_RECORDS_PER_BUCKET: usize = 128;
/// [`HttpClient`] wrapper that answers GET requests from the bundled seed index
/// or an on-disk cache, and only falls back to the wrapped transport when
/// `online` is set (see the `HttpClient` impl for this type in this file).
pub struct CachedHttpClient<T: HttpClient> {
/// Root directory under which the cached `.body` / `.url` files are placed.
cache_dir: PathBuf,
/// Wrapped client used to fetch a URL on a cache miss.
transport: T,
/// Whether a cache miss may be forwarded to `transport`; when `false`, a miss
/// is reported as an error instead.
online: bool,
}
impl<T: HttpClient> CachedHttpClient<T> {
pub fn new(cache_dir: impl Into<PathBuf>, transport: T) -> Self {
Self {
cache_dir: cache_dir.into(),
transport,
online: live_api_enabled(),
}
}
#[must_use]
#[allow(dead_code)]
pub const fn with_online(mut self, online: bool) -> Self {
self.online = online;
self
}
#[must_use]
#[allow(dead_code)]
pub fn cache_dir(&self) -> &Path {
&self.cache_dir
}
#[must_use]
#[allow(dead_code)]
pub const fn is_online(&self) -> bool {
self.online
}
fn cache_paths(&self, url: &str) -> (PathBuf, PathBuf) {
let location = cache_location(url);
let mut body = self.cache_dir.clone();
body.push(&location.directory);
body.push(format!("{}.body", location.stem));
let mut meta = self.cache_dir.clone();
meta.push(&location.directory);
meta.push(format!("{}.url", location.stem));
(body, meta)
}
}
/// Where the cached response for a URL is stored, relative to the cache root.
///
/// Produced by [`cache_location`]; `CachedHttpClient::cache_paths` joins
/// `directory` onto the cache root and appends `.body` / `.url` to `stem`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CacheLocation {
/// Sub-directory below the cache root (e.g. `http-cache/misc`).
pub directory: PathBuf,
/// File name without extension: a sanitized request parameter or the hash
/// from [`cache_key`].
pub stem: String,
}
/// Maps `url` to the cache bucket and file stem used to store its response.
///
/// Wiktionary URLs are classified first, then Wikidata URLs; anything else
/// lands in `http-cache/misc` under the hash of the full URL.
#[must_use]
pub fn cache_location(url: &str) -> CacheLocation {
    classify_wiktionary(url)
        .or_else(|| classify_wikidata(url))
        .unwrap_or_else(|| CacheLocation {
            directory: Path::new("http-cache").join("misc"),
            stem: cache_key(url),
        })
}
/// Classifies `<lang>.wiktionary.org` URLs.
///
/// Returns `None` when `url` has no `://`, when its host contains no `.`, or
/// when the part of the host after the first `.` does not start with
/// `wiktionary.org`. Otherwise the bucket is `wiktionary-cache/<lang>` and the
/// stem is the sanitized `page=` query parameter, falling back to the hash of
/// the full URL when no usable `page=` is present.
fn classify_wiktionary(url: &str) -> Option<CacheLocation> {
    let (_, after_scheme) = url.split_once("://")?;
    // Isolate the host before looking for the first '.'. A '.' that only
    // occurs in the path (e.g. `http://localhost/file.json`) must not be used
    // as the language separator: slicing from it back to the end of the host
    // is an inverted range and would panic.
    let host = after_scheme
        .find('/')
        .map_or(after_scheme, |end| &after_scheme[..end]);
    let (lang_label, domain) = host.split_once('.')?;
    if !domain.starts_with("wiktionary.org") {
        return None;
    }
    let lang = sanitize_segment(lang_label);
    let page = wiktionary_page_from_url(url).unwrap_or_else(|| cache_key(url));
    Some(CacheLocation {
        directory: PathBuf::from("wiktionary-cache").join(lang),
        stem: page,
    })
}
fn wiktionary_page_from_url(url: &str) -> Option<String> {
let query_start = url.find('?')?;
let query = &url[query_start + 1..];
for pair in query.split('&') {
if let Some(value) = pair.strip_prefix("page=") {
let decoded = percent_decode(value);
if !decoded.is_empty() {
return Some(sanitize_segment(&decoded));
}
}
}
None
}
/// Classifies Wikidata URLs into a `wikidata-cache/<kind>` bucket.
///
/// Returns `None` when `url` does not contain `wikidata.org` or has no query
/// string. SPARQL requests (a `query=` parameter, `/sparql` in the URL, or the
/// `query.wikidata.org` host) go to `sparql` under the URL hash. Otherwise the
/// `action=` parameter selects the bucket:
///
/// * `wbsearchentities` → `search`, stem from `srsearch=` or else `search=`
/// * `wbgetentities` → `entities`, stem from `ids=` or else `titles=`
/// * `query` → `query`, stem from `ids=` or else `titles=`
/// * anything else → `misc`, stem is the URL hash
///
/// When the naming parameter is absent the stem falls back to the URL hash.
/// If a parameter is repeated, the last occurrence wins.
fn classify_wikidata(url: &str) -> Option<CacheLocation> {
    /// Sanitizes the first available name, or hashes the URL when both are absent.
    fn stem_or_hash(url: &str, primary: Option<&str>, fallback: Option<&str>) -> String {
        primary
            .or(fallback)
            .map_or_else(|| cache_key(url), sanitize_segment)
    }

    if !url.contains("wikidata.org") {
        return None;
    }
    let (_, query) = url.split_once('?')?;

    let mut action: Option<String> = None;
    let mut srsearch: Option<String> = None;
    let mut search_term: Option<String> = None;
    let mut ids: Option<String> = None;
    let mut sparql: Option<String> = None;
    let mut titles: Option<String> = None;
    for pair in query.split('&') {
        if let Some((key, raw)) = pair.split_once('=') {
            let slot = match key {
                "action" => &mut action,
                "srsearch" => &mut srsearch,
                "search" => &mut search_term,
                "ids" => &mut ids,
                "query" => &mut sparql,
                "titles" => &mut titles,
                _ => continue,
            };
            *slot = Some(percent_decode(raw));
        }
    }

    let is_sparql =
        sparql.is_some() || url.contains("/sparql") || url.contains("query.wikidata.org");
    let (sub, stem) = if is_sparql {
        ("sparql", cache_key(url))
    } else {
        match action.as_deref() {
            Some("wbsearchentities") => (
                "search",
                stem_or_hash(url, srsearch.as_deref(), search_term.as_deref()),
            ),
            Some("wbgetentities") => (
                "entities",
                stem_or_hash(url, ids.as_deref(), titles.as_deref()),
            ),
            Some("query") => (
                "query",
                stem_or_hash(url, ids.as_deref(), titles.as_deref()),
            ),
            _ => ("misc", cache_key(url)),
        }
    };
    Some(CacheLocation {
        directory: PathBuf::from("wikidata-cache").join(sub),
        stem,
    })
}
/// Turns an arbitrary string into a single safe path segment.
///
/// Alphanumeric characters (Unicode-aware) and `-`, `_`, `.` are kept; a space
/// or `+` becomes `_`; every other character becomes `-`. Results longer than
/// 96 bytes are cut to at most 96 bytes and suffixed with `~` plus the first
/// eight hex digits of `cache_key(value)` so that distinct long inputs stay
/// distinct. An empty input yields `cache_key(value)`.
fn sanitize_segment(value: &str) -> String {
    const MAX_SEGMENT_BYTES: usize = 96;

    let mut out: String = value
        .chars()
        .map(|ch| match ch {
            '-' | '_' | '.' => ch,
            ' ' | '+' => '_',
            _ if ch.is_alphanumeric() => ch,
            _ => '-',
        })
        .collect();

    if out.len() > MAX_SEGMENT_BYTES {
        // Kept characters may be multi-byte, and `String::truncate` panics
        // when the cut is not on a char boundary, so back off to the nearest
        // boundary at or below the limit (index 0 is always a boundary).
        let mut cut = MAX_SEGMENT_BYTES;
        while !out.is_char_boundary(cut) {
            cut -= 1;
        }
        out.truncate(cut);
        out.push('~');
        out.push_str(&cache_key(value)[..8]);
    }

    if out.is_empty() {
        cache_key(value)
    } else {
        out
    }
}
/// Decodes a URL query value: `%XX` escapes become the byte `XX` and `+`
/// becomes a space.
///
/// A `%` that is not followed by two hex digits is copied through unchanged.
/// The decoded bytes are converted to a `String` lossily, so invalid UTF-8 is
/// replaced rather than rejected.
fn percent_decode(value: &str) -> String {
    let mut rest = value.as_bytes();
    let mut decoded: Vec<u8> = Vec::with_capacity(rest.len());
    loop {
        match rest {
            [] => break,
            [b'%', hi, lo, tail @ ..] => {
                if let (Some(hi), Some(lo)) = (hex_nibble(*hi), hex_nibble(*lo)) {
                    decoded.push((hi << 4) | lo);
                    rest = tail;
                } else {
                    // Malformed escape: keep the '%' and rescan from the next byte.
                    decoded.push(b'%');
                    rest = &rest[1..];
                }
            }
            [b'+', tail @ ..] => {
                decoded.push(b' ');
                rest = tail;
            }
            [other, tail @ ..] => {
                decoded.push(*other);
                rest = tail;
            }
        }
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
/// Returns the value (0–15) of an ASCII hexadecimal digit, accepting both
/// cases, or `None` for any other byte.
const fn hex_nibble(byte: u8) -> Option<u8> {
    if byte.is_ascii_digit() {
        return Some(byte - b'0');
    }
    // Fold A–F onto a–f; bytes outside the ASCII letters are left untouched.
    let lowered = byte.to_ascii_lowercase();
    if matches!(lowered, b'a'..=b'f') {
        Some(lowered - b'a' + 10)
    } else {
        None
    }
}
/// Hashes `url` with 64-bit FNV-1a and returns the digest as 16 lowercase,
/// zero-padded hex digits.
#[must_use]
pub fn cache_key(url: &str) -> String {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0000_0100_0000_01B3;

    let digest = url.bytes().fold(OFFSET_BASIS, |acc, byte| {
        (acc ^ u64::from(byte)).wrapping_mul(PRIME)
    });
    format!("{digest:016x}")
}
/// Reports whether the `FORMAL_AI_LIVE_API` environment variable opts in to
/// live network access.
///
/// The value is trimmed and compared case-insensitively against `1`, `true`,
/// `yes` and `on`; an unset or unreadable variable counts as disabled.
fn live_api_enabled() -> bool {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    match std::env::var("FORMAL_AI_LIVE_API") {
        Ok(raw) => {
            let flag = raw.trim();
            TRUTHY
                .iter()
                .any(|accepted| flag.eq_ignore_ascii_case(accepted))
        }
        Err(_) => false,
    }
}
impl<T: HttpClient> HttpClient for CachedHttpClient<T> {
    /// Resolves `url` through three layers, in order:
    ///
    /// 1. the bundled seed index (`seed_response`),
    /// 2. the on-disk cache file for the URL,
    /// 3. the wrapped transport — only when the client is online.
    ///
    /// A freshly fetched body is written to the cache together with a `.url`
    /// marker file holding the original URL.
    ///
    /// # Errors
    ///
    /// Returns `HttpError::Transport` on a cache miss while offline, or when
    /// the cache directory or either cache file cannot be written; errors from
    /// the wrapped transport are propagated unchanged.
    fn get(&self, url: &str) -> Result<String, HttpError> {
        if let Some(seeded) = seed_response(url) {
            return Ok(seeded);
        }

        let (body_path, meta_path) = self.cache_paths(url);
        // Any read failure (missing file, invalid UTF-8, …) counts as a miss.
        if let Ok(cached) = fs::read_to_string(&body_path) {
            return Ok(cached);
        }

        if !self.online {
            return Err(HttpError::Transport(format!(
                "translation cache miss for {url} and offline mode is active; \
                 set FORMAL_AI_LIVE_API=1 to fetch and populate the cache",
            )));
        }

        let fetched = self.transport.get(url)?;

        let parent = body_path.parent().unwrap_or(&self.cache_dir);
        fs::create_dir_all(parent).map_err(|error| {
            HttpError::Transport(format!(
                "failed to create cache directory {}: {error}",
                parent.display(),
            ))
        })?;
        fs::write(&body_path, &fetched).map_err(|error| {
            HttpError::Transport(format!(
                "failed to write cache body {}: {error}",
                body_path.display(),
            ))
        })?;
        fs::write(&meta_path, url).map_err(|error| {
            HttpError::Transport(format!(
                "failed to write cache url marker {}: {error}",
                meta_path.display(),
            ))
        })?;
        Ok(fetched)
    }
}
// Splices in the build-generated `seed_bundle_files.rs` from Cargo's `OUT_DIR`.
// NOTE(review): presumably this is what defines `SEED_BUNDLE_FILES`, which
// `seed_files` below reads — confirm in the build script.
include!(concat!(env!("OUT_DIR"), "/seed_bundle_files.rs"));
/// Returns an owned copy of the `SEED_BUNDLE_FILES` table.
///
/// Each entry is a pair of static strings; `seed_index` treats them as
/// `(name, contents)`.
#[must_use]
pub fn seed_files() -> Vec<(&'static str, &'static str)> {
    SEED_BUNDLE_FILES.iter().copied().collect()
}
/// Looks up the bundled response body for exactly `url`.
///
/// Returns a clone of the stored body, or `None` when the seed index has no
/// entry for that URL.
#[must_use]
pub fn seed_response(url: &str) -> Option<String> {
    let index = seed_index();
    index.get(url).map(String::clone)
}
/// Returns the process-wide URL → body index built from every seed file.
///
/// The index is built once, on first use. Records that share a URL — within
/// one file or across files — have their bodies concatenated in the order
/// they are encountered.
fn seed_index() -> &'static HashMap<String, String> {
    static INDEX: OnceLock<HashMap<String, String>> = OnceLock::new();
    INDEX.get_or_init(|| {
        // A single map is enough: the result is an unordered `HashMap`, so a
        // separate insertion-order list and a second copy pass add nothing.
        let mut index: HashMap<String, String> = HashMap::new();
        for (_name, contents) in seed_files() {
            for (url, body) in parse_seed_chunks(contents) {
                index.entry(url).or_default().push_str(&body);
            }
        }
        index
    })
}
/// Parses seed-bundle text into `(url, body)` pairs.
///
/// Thin alias: forwards `text` unchanged to [`parse_seed_chunks`] and returns
/// its result.
#[must_use]
pub fn parse_seed_bundle(text: &str) -> Vec<(String, String)> {
parse_seed_chunks(text)
}
/// Parses seed text into `(url, body)` pairs, one per record, in file order.
///
/// A record starts at an unindented line beginning with `response_`; any other
/// unindented line just ends the current record. Space-indented `url "…"`
/// lines set the record's URL (the last one wins) and `body "…"` lines are
/// unescaped and appended to its body. Blank lines and indented lines outside
/// a record are ignored. Records whose body is empty are dropped; a record
/// with body lines but no `url` line is emitted with an empty URL.
#[must_use]
pub fn parse_seed_chunks(text: &str) -> Vec<(String, String)> {
    /// Emits the record in progress, if any, unless its body is empty.
    fn finish(record: &mut Option<(String, String)>, records: &mut Vec<(String, String)>) {
        if let Some((url, body)) = record.take() {
            if !body.is_empty() {
                records.push((url, body));
            }
        }
    }

    let mut records: Vec<(String, String)> = Vec::new();
    // `Some((url, body))` while inside a `response_` record.
    let mut record: Option<(String, String)> = None;

    for raw_line in text.lines() {
        let line = raw_line.trim_end_matches(['\r', '\n']);
        if line.trim().is_empty() {
            continue;
        }
        // Only leading spaces count as indentation.
        let content = line.trim_start_matches(' ');
        let is_top_level = content.len() == line.len();
        if is_top_level {
            finish(&mut record, &mut records);
            if content.starts_with("response_") {
                record = Some((String::new(), String::new()));
            }
            continue;
        }
        if let Some((url, body)) = record.as_mut() {
            if let Some(value) = strip_kv(content, "url") {
                *url = unescape_lino_string(value);
            } else if let Some(value) = strip_kv(content, "body") {
                body.push_str(&unescape_lino_string(value));
            }
        }
    }

    finish(&mut record, &mut records);
    records
}
/// Matches a line of the exact form `<key> "<value>"` and returns `<value>`
/// still escaped.
///
/// Exactly one space must separate the key from the opening quote and the
/// line must end with the closing quote; otherwise `None` is returned.
fn strip_kv<'a>(content: &'a str, key: &str) -> Option<&'a str> {
    content
        .strip_prefix(key)
        .and_then(|after_key| after_key.strip_prefix(" \""))
        .and_then(|quoted| quoted.strip_suffix('"'))
}
/// Chunk size, in characters, that `write_seed_record` passes to
/// `split_body_into_chunks` when splitting a body across `body` lines.
pub const SEED_BODY_CHUNK_CHARS: usize = 200;
/// Escapes `input` for a double-quoted seed value by doubling every `"`.
///
/// All other characters are copied through unchanged.
#[must_use]
pub fn escape_lino_string(input: &str) -> String {
    input.replace('"', "\"\"")
}
/// Reverses [`escape_lino_string`]: every `""` pair collapses to a single `"`.
///
/// Pairs are matched left to right without overlap, so a lone `"` (or the odd
/// one out in a run) is kept as-is.
#[must_use]
pub fn unescape_lino_string(input: &str) -> String {
    input.replace("\"\"", "\"")
}
/// Splits `body` into consecutive pieces of `chars` characters each.
///
/// A piece is extended past the nominal length to swallow any `"` characters
/// that immediately follow it, so no piece after the first starts with a
/// quote. Lengths are counted in `char`s, not bytes, and concatenating the
/// pieces reproduces `body`. An empty `body` yields no pieces.
///
/// A `chars` of 0 is treated as 1 so that every piece makes progress; without
/// that the loop would never terminate on a non-empty body.
#[must_use]
pub fn split_body_into_chunks(body: &str, chars: usize) -> Vec<String> {
    let step = chars.max(1);
    let mut out: Vec<String> = Vec::new();
    let mut rest = body;
    while !rest.is_empty() {
        // Byte offset just past the first `step` chars, or the end of `rest`.
        let mut cut = rest
            .char_indices()
            .nth(step)
            .map_or(rest.len(), |(offset, _)| offset);
        // `"` is a single ASCII byte and never occurs inside a multi-byte
        // sequence, so counting bytes from a char boundary is safe here.
        cut += rest[cut..].bytes().take_while(|b| *b == b'"').count();
        let (piece, tail) = rest.split_at(cut);
        out.push(piece.to_owned());
        rest = tail;
    }
    out
}
/// Appends one seed record to `out`.
///
/// The record is a `response_<short_id>` header line, one indented
/// `url "<escaped url>"` line, and one indented `body "<escaped chunk>"` line
/// per chunk of `body` (chunks of `SEED_BODY_CHUNK_CHARS` characters). An
/// empty `body` produces no `body` lines.
///
/// NOTE(review): only `"` is escaped, so a line break inside `url` or `body`
/// is written verbatim.
pub fn write_seed_record(out: &mut String, short_id: &str, url: &str, body: &str) {
    out.extend(["response_", short_id, "\n"]);

    let escaped_url = escape_lino_string(url);
    out.extend([" url \"", escaped_url.as_str(), "\"\n"]);

    for piece in split_body_into_chunks(body, SEED_BODY_CHUNK_CHARS) {
        let escaped_piece = escape_lino_string(&piece);
        out.extend([" body \"", escaped_piece.as_str(), "\"\n"]);
    }
}