use crate::ocr::{OcrAttempt, OcrOutput};
use std::collections::{HashMap, VecDeque};
use std::sync::Mutex;
/// Bounded, mutex-guarded cache of OCR outcomes keyed by a 32-byte digest.
///
/// When an insert pushes the entry count above `capacity`, entries are
/// removed starting from the least recently used one; both `get` and `put`
/// mark the touched key as most recently used.
pub struct OcrCache {
/// Maximum number of entries retained; `new` raises a value of 0 to 1.
capacity: usize,
/// Entry storage plus recency order, guarded together by a single mutex.
inner: Mutex<Inner>,
}
/// Mutable cache state that lives behind the mutex in `OcrCache`.
struct Inner {
/// Cached snapshots, looked up by key.
map: HashMap<[u8; 32], OcrAttemptSnapshot>,
/// Keys ordered by recency: front is the next eviction candidate, back is
/// the most recently used key.
order: VecDeque<[u8; 32]>,
}
/// Cloneable representation of an `OcrAttempt` outcome, used as the value
/// type stored in `OcrCache`.
///
/// Each variant corresponds to the `OcrAttempt` variant of the same name; see
/// `attempt_to_snapshot` and `snapshot_to_attempt` for the conversions.
#[derive(Clone)]
pub enum OcrAttemptSnapshot {
/// Counterpart of `OcrAttempt::Disabled`.
Disabled,
/// Counterpart of `OcrAttempt::NoTextFound`; both fields are copied verbatim.
NoTextFound {
mean_confidence: f32,
regions: usize,
},
/// Counterpart of `OcrAttempt::Error`, carrying the error message.
Error(String),
/// Counterpart of `OcrAttempt::Recognized`, stored as a full `OcrOutput`.
Recognized(OcrOutput),
}
impl OcrCache {
pub fn new(capacity: usize) -> Self {
Self {
capacity: capacity.max(1),
inner: Mutex::new(Inner {
map: HashMap::new(),
order: VecDeque::new(),
}),
}
}
pub fn key(bytes: &[u8]) -> [u8; 32] {
#[cfg(feature = "ocr")]
{
sha256_like(bytes)
}
#[cfg(not(feature = "ocr"))]
{
let _ = bytes;
[0u8; 32]
}
}
pub fn get(&self, key: &[u8; 32]) -> Option<OcrAttemptSnapshot> {
let mut guard = self.inner.lock().unwrap_or_else(|p| p.into_inner());
if guard.map.contains_key(key) {
guard.order.retain(|k| k != key);
guard.order.push_back(*key);
return guard.map.get(key).cloned();
}
None
}
pub fn put(&self, key: [u8; 32], value: OcrAttemptSnapshot) {
let mut guard = self.inner.lock().unwrap_or_else(|p| p.into_inner());
if guard.map.insert(key, value).is_none() {
guard.order.push_back(key);
while guard.order.len() > self.capacity {
if let Some(evict) = guard.order.pop_front() {
guard.map.remove(&evict);
}
}
} else {
guard.order.retain(|k| k != &key);
guard.order.push_back(key);
}
}
pub fn len(&self) -> usize {
self.inner
.lock()
.map(|g| g.map.len())
.unwrap_or(0)
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
}
pub fn attempt_to_snapshot(attempt: &OcrAttempt) -> OcrAttemptSnapshot {
match attempt {
OcrAttempt::Disabled => OcrAttemptSnapshot::Disabled,
OcrAttempt::NoTextFound { mean_confidence, regions } => {
OcrAttemptSnapshot::NoTextFound {
mean_confidence: *mean_confidence,
regions: *regions,
}
}
OcrAttempt::Error(msg) => OcrAttemptSnapshot::Error(msg.clone()),
OcrAttempt::Recognized { text, mean_confidence } => {
OcrAttemptSnapshot::Recognized(OcrOutput {
text: text.clone(),
lines: Vec::new(),
mean_confidence: *mean_confidence,
detected_script: crate::ocr::script::dominant_script(text),
})
}
}
}
pub fn snapshot_to_attempt(snap: &OcrAttemptSnapshot) -> OcrAttempt {
match snap {
OcrAttemptSnapshot::Disabled => OcrAttempt::Disabled,
OcrAttemptSnapshot::NoTextFound { mean_confidence, regions } => OcrAttempt::NoTextFound {
mean_confidence: *mean_confidence,
regions: *regions,
},
OcrAttemptSnapshot::Error(msg) => OcrAttempt::Error(msg.clone()),
OcrAttemptSnapshot::Recognized(out) => OcrAttempt::Recognized {
text: out.text.clone(),
mean_confidence: out.mean_confidence,
},
}
}
/// Returns the process-wide OCR cache, or `None` when caching is switched off.
///
/// The cache is configured from the environment on the first call only; later
/// changes to these variables have no effect:
///
/// * `OMNIPARSE_OCR_CACHE` — `0` or `false` (ASCII case-insensitive) disables
///   the cache. Any other value, or an unset or unreadable variable, leaves
///   it enabled.
/// * `OMNIPARSE_OCR_CACHE_SIZE` — maximum number of entries. Falls back to 64
///   when unset or not parseable as `usize`.
pub fn shared_cache() -> Option<&'static OcrCache> {
    use std::sync::OnceLock;

    const DEFAULT_CAPACITY: usize = 64;
    static CACHE: OnceLock<Option<OcrCache>> = OnceLock::new();

    let slot = CACHE.get_or_init(|| {
        let switched_off = match std::env::var("OMNIPARSE_OCR_CACHE") {
            Ok(flag) => flag == "0" || flag.eq_ignore_ascii_case("false"),
            Err(_) => false,
        };
        if switched_off {
            return None;
        }

        let capacity = match std::env::var("OMNIPARSE_OCR_CACHE_SIZE") {
            Ok(raw) => raw.parse::<usize>().unwrap_or(DEFAULT_CAPACITY),
            Err(_) => DEFAULT_CAPACITY,
        };
        Some(OcrCache::new(capacity))
    });
    slot.as_ref()
}
/// Produces a 32-byte digest of `bytes` for use as a cache key.
///
/// Despite the name this is not SHA-256 and is not cryptographic. A single
/// 64-bit running hash (xor-then-multiply, using the FNV-1a 64-bit offset
/// basis and prime) is advanced over the input in 8-byte chunks. After each
/// chunk the running hash is XORed into one of four 64-bit lanes, chosen
/// round-robin. The input length is finally XORed into the last lane, and the
/// lanes are written out little-endian.
fn sha256_like(bytes: &[u8]) -> [u8; 32] {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x100_0000_01b3;

    let mut lanes = [0u64; 4];
    let mut running = OFFSET_BASIS;
    for (index, block) in bytes.chunks(8).enumerate() {
        running = block
            .iter()
            .fold(running, |acc, &b| (acc ^ u64::from(b)).wrapping_mul(PRIME));
        lanes[index % 4] ^= running;
    }
    // The length shares the final lane (bytes 24..32) with every fourth chunk.
    lanes[3] ^= bytes.len() as u64;

    let mut digest = [0u8; 32];
    for (dst, lane) in digest.chunks_exact_mut(8).zip(lanes.iter()) {
        dst.copy_from_slice(&lane.to_le_bytes());
    }
    digest
}