use std::collections::HashSet;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::SystemTime;
use anyhow::{Context, Result};
use bytemuck::{Pod, Zeroable};
use heed::types::{Bytes, U64};
use heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn};
use read_fonts::types::Tag;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use crate::query::{FamilyClassFilter, Query};
use crate::search::{TypgFontFaceMatch, TypgFontFaceMeta};
/// Identifier assigned to every indexed font face; it is the key of the metadata database.
pub type FontID = u64;
/// Memory-map size handed to LMDB via `map_size` when the environment is opened (10 GiB).
/// NOTE(review): this value does not fit in a 32-bit `usize`.
const MAX_DB_SIZE: usize = 10 * 1024 * 1024 * 1024;
/// Upper bound on named databases passed to `max_dbs`; `FontIndex::open` creates three.
const MAX_DBS: u32 = 10;
/// Per-face record that `IndexWriter::add_font` serializes with bincode and stores in the
/// metadata database, keyed by the face's `FontID`.
///
/// NOTE(review): bincode presumably encodes fields positionally, so reordering or removing
/// fields would change the stored format — confirm before touching the layout.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct IndexedFontMeta {
/// Font file path, rendered with `Path::display()` at indexing time.
pub path: String,
/// Face index passed to `add_font`; `None` when the caller supplied none.
pub ttc_index: Option<u32>,
/// Names copied from the source metadata; name patterns in a query are matched against these.
pub names: Vec<String>,
/// Whether the source metadata flagged the face as variable.
pub is_variable: bool,
/// Weight class copied from the source metadata, if present.
pub weight_class: Option<u16>,
/// Width class copied from the source metadata, if present.
pub width_class: Option<u16>,
/// Family class as `(major, sub)`, compared against the query's family-class filter.
pub family_class: Option<(u8, u8)>,
/// Serialized `RoaringBitmap` of the face's codepoints; empty when the face has none.
pub cmap_bitmap: Vec<u8>,
// The five optional names below are copied verbatim from `TypgFontFaceMeta`.
// NOTE(review): presumably PostScript / typographic / legacy family and subfamily
// names — confirm against `TypgFontFaceMeta`.
#[serde(default)]
pub psname: Option<String>,
#[serde(default)]
pub tfname: Option<String>,
#[serde(default)]
pub lfname: Option<String>,
#[serde(default)]
pub tsname: Option<String>,
#[serde(default)]
pub lsname: Option<String>,
}
/// Fixed-size value stored in the `path_to_id` database under the hash of a font path.
///
/// It is written and read as raw bytes through bytemuck, so its `repr(C)` layout
/// (two `u64` fields, 16 bytes) is the stored format.
#[derive(Copy, Clone, Pod, Zeroable)]
#[repr(C)]
struct PathEntry {
// Id of the metadata record registered for the path.
font_id: u64,
// File modification time in whole seconds since the Unix epoch (0 if it predates the epoch).
mtime_secs: u64,
}
/// Handle to the on-disk font index: one LMDB environment with three named databases.
pub struct FontIndex {
// LMDB environment that owns all transactions.
env: Env,
// `FontID` -> bincode-encoded `IndexedFontMeta`.
db_metadata: Database<U64<byteorder::NativeEndian>, Bytes>,
// Native-endian bytes of a `u32` tag -> serialized `RoaringBitmap` of font ids.
db_inverted: Database<Bytes, Bytes>,
// 64-bit hash of a font path -> raw bytes of a `PathEntry`.
db_path_to_id: Database<U64<byteorder::NativeEndian>, Bytes>,
// Next id handed out by `alloc_id`; seeded in `open` from the largest stored id.
next_id: AtomicU64,
}
impl FontIndex {
/// Opens the index stored in `index_dir`, creating the directory and the three named
/// databases when they do not exist yet.
///
/// Once the databases are available, the metadata database is scanned a single time to
/// find the largest stored id, so that freshly allocated ids continue right after it.
///
/// # Errors
/// Fails when the directory cannot be created, the LMDB environment cannot be opened,
/// or any of the setup transactions fails.
pub fn open(index_dir: &Path) -> Result<Self> {
    fs::create_dir_all(index_dir)
        .with_context(|| format!("creating index directory {}", index_dir.display()))?;

    // SAFETY: NOTE(review) — heed marks `open` as unsafe; soundness presumably relies on
    // the environment files not being modified or truncated behind LMDB's back while
    // they are memory-mapped. Confirm against the heed documentation.
    let env = unsafe {
        EnvOpenOptions::new()
            .map_size(MAX_DB_SIZE)
            .max_dbs(MAX_DBS)
            .open(index_dir)
    }
    .with_context(|| format!("opening LMDB at {}", index_dir.display()))?;

    // All named databases are created inside one write transaction.
    let mut setup_txn = env.write_txn()?;
    let metadata = env.create_database(&mut setup_txn, Some("metadata"))?;
    let inverted = env.create_database(&mut setup_txn, Some("inverted"))?;
    let path_to_id = env.create_database(&mut setup_txn, Some("path_to_id"))?;
    setup_txn.commit()?;

    // The read transaction lives only for the duration of the scan.
    let first_free_id = {
        let scan_txn = env.read_txn()?;
        let mut highest: u64 = 0;
        for entry in metadata.iter(&scan_txn)? {
            let (id, _) = entry?;
            highest = highest.max(id);
        }
        highest + 1
    };

    Ok(Self {
        env,
        db_metadata: metadata,
        db_inverted: inverted,
        db_path_to_id: path_to_id,
        next_id: AtomicU64::new(first_free_id),
    })
}
/// Returns the number of records currently stored in the metadata database.
///
/// # Errors
/// Fails when the read transaction cannot be opened or the length query fails.
pub fn count(&self) -> Result<usize> {
    let txn = self.env.read_txn()?;
    let stored = self.db_metadata.len(&txn)?;
    Ok(stored as usize)
}
/// Starts a write transaction and wraps it in an [`IndexWriter`].
///
/// # Errors
/// Fails when the write transaction cannot be started.
pub fn writer(&self) -> Result<IndexWriter<'_>> {
    let txn = self.env.write_txn()?;
    let writer = IndexWriter {
        index: self,
        wtxn: txn,
        modified_tags: HashSet::new(),
    };
    Ok(writer)
}
/// Starts a read transaction and wraps it in an [`IndexReader`].
///
/// # Errors
/// Fails when the read transaction cannot be started.
pub fn reader(&self) -> Result<IndexReader<'_>> {
    let txn = self.env.read_txn()?;
    let reader = IndexReader {
        index: self,
        rtxn: txn,
    };
    Ok(reader)
}
/// Hands out the next unused font id and advances the counter by one.
fn alloc_id(&self) -> FontID {
    // `fetch_add` returns the value held *before* the increment, which is the id to use.
    let allocated = self.next_id.fetch_add(1, Ordering::Relaxed);
    allocated
}
}
/// Write session over a [`FontIndex`]; all changes live in one write transaction until
/// `commit` or `abort` consumes the writer.
pub struct IndexWriter<'a> {
// Index whose databases are being modified.
index: &'a FontIndex,
// The open write transaction.
wtxn: RwTxn<'a>,
// Tags touched by `add_to_inverted_index`; nothing in this file reads it back.
modified_tags: HashSet<u32>,
}
impl<'a> IndexWriter<'a> {
/// Reports whether the font at `path` has to be (re)indexed.
///
/// Returns `false` only when a well-formed entry is stored for the path and its
/// modification time (whole seconds since the Unix epoch) equals `mtime`. A missing or
/// malformed entry yields `true`. Times before the epoch are treated as `0`.
///
/// # Errors
/// Fails when the database lookup fails.
pub fn needs_update(&self, path: &Path, mtime: SystemTime) -> Result<bool> {
    let path_hash = hash_path(path);
    let mtime_secs = mtime
        .duration_since(SystemTime::UNIX_EPOCH)
        .map(|d| d.as_secs())
        .unwrap_or(0);

    // LMDB gives no 8-byte alignment guarantee for value slices, and
    // `bytemuck::from_bytes` panics on a misaligned slice. The unaligned read copies the
    // bytes instead and also rejects values of the wrong size.
    let stored = self
        .index
        .db_path_to_id
        .get(&self.wtxn, &path_hash)?
        .and_then(|bytes| bytemuck::try_pod_read_unaligned::<PathEntry>(bytes).ok());

    Ok(match stored {
        Some(entry) => entry.mtime_secs != mtime_secs,
        None => true,
    })
}
/// Stores one font face in the index and returns the id allocated for it.
///
/// Steps, all inside the writer's transaction:
/// 1. If an entry is already registered for `path`, its metadata record is removed.
/// 2. A fresh id is allocated and the metadata record is written under it.
/// 3. The path entry (id + modification time in seconds) is written under the path hash.
/// 4. The id is added to the inverted index for every axis, feature, script and table
///    tag, plus the `_VAR` marker when the face is variable.
///
/// NOTE(review): path entries are keyed by the path alone, not by `ttc_index`. Adding a
/// second face of the same file therefore removes the metadata of the face added before
/// it. Confirm how callers index font collections.
///
/// # Errors
/// Fails when serialization or any database operation fails.
pub fn add_font(
    &mut self,
    path: &Path,
    ttc_index: Option<u32>,
    mtime: SystemTime,
    meta: &TypgFontFaceMeta,
) -> Result<FontID> {
    let path_hash = hash_path(path);
    let mtime_secs = mtime
        .duration_since(SystemTime::UNIX_EPOCH)
        .map(|d| d.as_secs())
        .unwrap_or(0);

    // LMDB gives no 8-byte alignment guarantee for value slices, and
    // `bytemuck::from_bytes` panics on a misaligned slice. The unaligned read copies the
    // bytes instead and also rejects values of the wrong size.
    let previous = self
        .index
        .db_path_to_id
        .get(&self.wtxn, &path_hash)?
        .and_then(|bytes| bytemuck::try_pod_read_unaligned::<PathEntry>(bytes).ok());
    if let Some(entry) = previous {
        self.remove_font_by_id(entry.font_id)?;
    }

    let font_id = self.index.alloc_id();
    let cmap_bitmap = build_cmap_bitmap(&meta.codepoints);
    let meta_to_store = IndexedFontMeta {
        path: path.display().to_string(),
        ttc_index,
        names: meta.names.clone(),
        is_variable: meta.is_variable,
        weight_class: meta.weight_class,
        width_class: meta.width_class,
        family_class: meta.family_class,
        cmap_bitmap,
        psname: meta.psname.clone(),
        tfname: meta.tfname.clone(),
        lfname: meta.lfname.clone(),
        tsname: meta.tsname.clone(),
        lsname: meta.lsname.clone(),
    };
    let meta_bytes = bincode::serialize(&meta_to_store)
        .map_err(|e| anyhow::anyhow!("bincode serialize: {e}"))?;
    self.index
        .db_metadata
        .put(&mut self.wtxn, &font_id, &meta_bytes)?;

    let path_entry = PathEntry {
        font_id,
        mtime_secs,
    };
    self.index.db_path_to_id.put(
        &mut self.wtxn,
        &path_hash,
        bytemuck::bytes_of(&path_entry),
    )?;

    for tag in meta
        .axis_tags
        .iter()
        .chain(&meta.feature_tags)
        .chain(&meta.script_tags)
        .chain(&meta.table_tags)
    {
        self.add_to_inverted_index(tag_to_u32(*tag), font_id)?;
    }
    if meta.is_variable {
        self.add_to_inverted_index(tag_marker(b"_VAR"), font_id)?;
    }
    Ok(font_id)
}
/// Deletes the metadata record stored under `font_id`.
///
/// Only the metadata database is touched; inverted-index bitmaps and path entries that
/// mention the id are left as they are.
///
/// # Errors
/// Fails when the delete operation fails.
fn remove_font_by_id(&mut self, font_id: FontID) -> Result<()> {
    let index = self.index;
    // Whether a record actually existed is irrelevant to the caller.
    let _was_present = index.db_metadata.delete(&mut self.wtxn, &font_id)?;
    Ok(())
}
pub fn prune_missing(&mut self) -> Result<(usize, usize)> {
let mut to_remove = Vec::new();
let before = self.index.db_metadata.len(&self.wtxn)? as usize;
for result in self.index.db_metadata.iter(&self.wtxn)? {
let (font_id, bytes) = result?;
let meta = deserialize_meta(bytes)?;
let path = Path::new(&meta.path);
if !path.exists() {
to_remove.push(font_id);
}
}
for font_id in &to_remove {
self.index.db_metadata.delete(&mut self.wtxn, font_id)?;
}
let mut stale_hashes = Vec::new();
for result in self.index.db_path_to_id.iter(&self.wtxn)? {
let (hash, bytes) = result?;
if bytes.len() == std::mem::size_of::<PathEntry>() {
let entry: PathEntry = *bytemuck::from_bytes(bytes);
if to_remove.contains(&entry.font_id) {
stale_hashes.push(hash);
}
}
}
for hash in stale_hashes {
self.index.db_path_to_id.delete(&mut self.wtxn, &hash)?;
}
let after = self.index.db_metadata.len(&self.wtxn)? as usize;
Ok((before, after))
}
fn add_to_inverted_index(&mut self, tag: u32, font_id: FontID) -> Result<()> {
let tag_bytes = tag.to_ne_bytes();
let mut bitmap = if let Some(bytes) = self.index.db_inverted.get(&self.wtxn, &tag_bytes)? {
RoaringBitmap::deserialize_from(bytes)?
} else {
RoaringBitmap::new()
};
bitmap.insert(font_id as u32);
self.modified_tags.insert(tag);
let mut buf = Vec::new();
bitmap.serialize_into(&mut buf)?;
self.index
.db_inverted
.put(&mut self.wtxn, &tag_bytes, &buf)?;
Ok(())
}
pub fn commit(self) -> Result<()> {
self.wtxn.commit()?;
Ok(())
}
pub fn abort(self) {
self.wtxn.abort();
}
}
/// Read session over a [`FontIndex`]; every lookup runs inside one read transaction.
pub struct IndexReader<'a> {
// Index whose databases are being queried.
index: &'a FontIndex,
// The open read transaction.
rtxn: RoTxn<'a>,
}
impl<'a> IndexReader<'a> {
/// Returns every indexed face that satisfies `query`, ordered by path and then by
/// collection index.
///
/// Candidate ids come from the inverted index. Each candidate is then checked against
/// the metadata filters and the script requirements. Candidates without a metadata
/// record are skipped.
///
/// # Errors
/// Fails when a database lookup or a record/bitmap decode fails.
pub fn find(&self, query: &Query) -> Result<Vec<TypgFontFaceMatch>> {
    let candidates = self.get_candidate_bitmap(query)?;
    let ot_all = self.ot_all_bitmap(query)?;

    let mut hits = Vec::new();
    for id in candidates.iter() {
        let meta = match self.get_metadata(id as u64)? {
            Some(meta) => meta,
            // Id is listed in a bitmap but its record is gone.
            None => continue,
        };
        if !self.passes_filters(&meta, query)? {
            continue;
        }
        if !self.scripts_pass(id, &meta, query, ot_all.as_ref())? {
            continue;
        }
        hits.push(hydrate_match(&meta));
    }

    hits.sort_by(|lhs, rhs| {
        let left = (&lhs.source.path, lhs.source.ttc_index);
        let right = (&rhs.source.path, rhs.source.ttc_index);
        left.cmp(&right)
    });
    Ok(hits)
}
/// Returns every face stored in the index, ordered by path and then by collection index.
///
/// # Errors
/// Fails when iteration fails or a record cannot be decoded.
pub fn list_all(&self) -> Result<Vec<TypgFontFaceMatch>> {
    let mut everything = self
        .index
        .db_metadata
        .iter(&self.rtxn)?
        .map(|entry| -> Result<TypgFontFaceMatch> {
            let (_, bytes) = entry?;
            let meta = deserialize_meta(bytes)?;
            Ok(hydrate_match(&meta))
        })
        .collect::<Result<Vec<_>>>()?;

    everything.sort_by(|lhs, rhs| {
        let left = (&lhs.source.path, lhs.source.ttc_index);
        let right = (&rhs.source.path, rhs.source.ttc_index);
        left.cmp(&right)
    });
    Ok(everything)
}
/// Builds the set of font ids worth inspecting for `query`.
///
/// The result is the intersection of the bitmaps of every axis, feature and table tag
/// the query names, plus the `_VAR` marker when a variable font is required. When the
/// query names none of these, every id in the metadata database is returned.
///
/// # Errors
/// Fails when a database lookup or a bitmap decode fails.
fn get_candidate_bitmap(&self, query: &Query) -> Result<RoaringBitmap> {
    // Gather the inverted-index keys in the order they are intersected.
    let mut required: Vec<u32> = Vec::new();
    for tag in query.axes() {
        required.push(tag_to_u32(*tag));
    }
    for tag in query.features() {
        required.push(tag_to_u32(*tag));
    }
    for tag in query.tables() {
        required.push(tag_to_u32(*tag));
    }
    if query.requires_variable() {
        required.push(tag_marker(b"_VAR"));
    }

    let mut keys = required.into_iter();
    let first = match keys.next() {
        Some(key) => key,
        None => {
            // No tag constraint at all: every stored face is a candidate.
            let mut everyone = RoaringBitmap::new();
            for entry in self.index.db_metadata.iter(&self.rtxn)? {
                let (id, _) = entry?;
                everyone.insert(id as u32);
            }
            return Ok(everyone);
        }
    };

    let mut narrowed = self.get_tag_bitmap(first)?;
    for key in keys {
        let bitmap = self.get_tag_bitmap(key)?;
        narrowed &= &bitmap;
    }
    Ok(narrowed)
}
/// Loads the bitmap of font ids stored for `tag`; an unknown tag yields an empty bitmap.
///
/// # Errors
/// Fails when the lookup fails or the stored bitmap cannot be decoded.
fn get_tag_bitmap(&self, tag: u32) -> Result<RoaringBitmap> {
    let key = tag.to_ne_bytes();
    match self.index.db_inverted.get(&self.rtxn, &key)? {
        Some(bytes) => Ok(RoaringBitmap::deserialize_from(bytes)?),
        None => Ok(RoaringBitmap::new()),
    }
}
/// Fetches and decodes the metadata record stored under `font_id`, if there is one.
///
/// # Errors
/// Fails when the lookup fails or the record cannot be decoded.
fn get_metadata(&self, font_id: FontID) -> Result<Option<IndexedFontMeta>> {
    let stored = self.index.db_metadata.get(&self.rtxn, &font_id)?;
    match stored {
        None => Ok(None),
        Some(bytes) => {
            let meta = deserialize_meta(bytes)?;
            Ok(Some(meta))
        }
    }
}
/// Computes the ids of faces that satisfy every script requirement through their
/// OpenType tags.
///
/// For each requirement the bitmaps of its tags are unioned; the unions of all
/// requirements are then intersected. Returns `None` when the query has no script
/// requirements.
///
/// # Errors
/// Fails when a database lookup or a bitmap decode fails.
fn ot_all_bitmap(&self, query: &Query) -> Result<Option<RoaringBitmap>> {
    let requirements = query.scripts();
    if requirements.is_empty() {
        return Ok(None);
    }

    let mut combined: Option<RoaringBitmap> = None;
    for requirement in requirements {
        // Any one of the requirement's tags is enough to satisfy it.
        let mut any_tag = RoaringBitmap::new();
        for tag in requirement.ot_tags() {
            any_tag |= self.get_tag_bitmap(tag_to_u32(*tag))?;
        }
        combined = match combined {
            None => Some(any_tag),
            Some(so_far) => Some(so_far & any_tag),
        };
    }
    Ok(combined)
}
/// Decides whether a face satisfies the query's script requirements.
///
/// A face passes when the query has no script requirements, when its id is contained in
/// `ot_all`, or when its stored codepoint bitmap satisfies the Unicode side of every
/// requirement. An empty or undecodable codepoint bitmap cannot satisfy the latter.
fn scripts_pass(
    &self,
    font_id: u32,
    meta: &IndexedFontMeta,
    query: &Query,
    ot_all: Option<&RoaringBitmap>,
) -> Result<bool> {
    let requirements = query.scripts();
    if requirements.is_empty() {
        return Ok(true);
    }

    // OpenType route: the precomputed bitmap already covers all requirements.
    if let Some(bitmap) = ot_all {
        if bitmap.contains(font_id) {
            return Ok(true);
        }
    }

    // Unicode route: needs a usable codepoint bitmap.
    if meta.cmap_bitmap.is_empty() {
        return Ok(false);
    }
    let cmap = match RoaringBitmap::deserialize_from(meta.cmap_bitmap.as_slice()) {
        Ok(cmap) => cmap,
        Err(_) => return Ok(false),
    };
    let covered = requirements
        .iter()
        .all(|req| req.unicode_satisfied(cmap.iter().filter_map(char::from_u32)));
    Ok(covered)
}
/// Checks a face's stored metadata against the non-tag filters of `query`.
///
/// Filters applied, each only when the query specifies it:
/// - name patterns: at least one stored name must match at least one pattern;
/// - weight range, width range, family class: the face must carry the value and the
///   value must match;
/// - codepoints: the face's codepoint bitmap must contain every requested codepoint.
///
/// A face whose codepoint bitmap is empty or cannot be decoded fails the codepoint
/// filter. Previously such a face skipped the check and matched any codepoint query,
/// unlike `scripts_pass`, where a missing bitmap never satisfies a requirement.
fn passes_filters(&self, meta: &IndexedFontMeta, query: &Query) -> Result<bool> {
    let patterns = query.name_patterns();
    if !patterns.is_empty() {
        let matches_any = meta
            .names
            .iter()
            .any(|name| patterns.iter().any(|p| p.is_match(name)));
        if !matches_any {
            return Ok(false);
        }
    }

    if let Some(range) = query.weight_range() {
        match meta.weight_class {
            Some(weight) if range.contains(&weight) => {}
            _ => return Ok(false),
        }
    }

    if let Some(range) = query.width_range() {
        match meta.width_class {
            Some(width) if range.contains(&width) => {}
            _ => return Ok(false),
        }
    }

    if let Some(filter) = query.family_class() {
        match meta.family_class {
            Some((major, sub)) if matches_family_class(major, sub, filter) => {}
            _ => return Ok(false),
        }
    }

    if !query.codepoints().is_empty() {
        // No stored coverage means none of the requested codepoints can be present.
        if meta.cmap_bitmap.is_empty() {
            return Ok(false);
        }
        let cmap = match RoaringBitmap::deserialize_from(meta.cmap_bitmap.as_slice()) {
            Ok(cmap) => cmap,
            // Coverage that cannot be read cannot prove a match either.
            Err(_) => return Ok(false),
        };
        for &cp in query.codepoints() {
            if !cmap.contains(cp as u32) {
                return Ok(false);
            }
        }
    }

    Ok(true)
}
}
/// Decodes a bincode-encoded [`IndexedFontMeta`] record.
///
/// # Errors
/// Returns an error carrying the bincode failure message when decoding fails.
fn deserialize_meta(bytes: &[u8]) -> Result<IndexedFontMeta> {
    match bincode::deserialize::<IndexedFontMeta>(bytes) {
        Ok(meta) => Ok(meta),
        Err(e) => Err(anyhow::anyhow!("bincode deserialize: {e}")),
    }
}
/// Converts a stored record back into a search result.
///
/// Only the fields kept in the index are restored. Tag lists, codepoints, creator names
/// and license names are not stored and therefore come back empty.
fn hydrate_match(meta: &IndexedFontMeta) -> TypgFontFaceMatch {
    use crate::search::{TypgFontFaceMeta, TypgFontSource};

    let source = TypgFontSource {
        path: PathBuf::from(&meta.path),
        ttc_index: meta.ttc_index,
    };

    let metadata = TypgFontFaceMeta {
        // Restored from the index.
        names: meta.names.clone(),
        is_variable: meta.is_variable,
        weight_class: meta.weight_class,
        width_class: meta.width_class,
        family_class: meta.family_class,
        psname: meta.psname.clone(),
        tfname: meta.tfname.clone(),
        lfname: meta.lfname.clone(),
        tsname: meta.tsname.clone(),
        lsname: meta.lsname.clone(),
        // Not kept in the index.
        axis_tags: Vec::new(),
        feature_tags: Vec::new(),
        script_tags: Vec::new(),
        table_tags: Vec::new(),
        codepoints: Vec::new(),
        creator_names: Vec::new(),
        license_names: Vec::new(),
    };

    TypgFontFaceMatch { source, metadata }
}
/// Hashes a path to the 64-bit key used by the `path_to_id` database.
///
/// The hash is taken over the lossy UTF-8 rendering of the path.
fn hash_path(path: &Path) -> u64 {
    use xxhash_rust::xxh3::xxh3_64;
    let rendered = path.to_string_lossy();
    xxh3_64(rendered.as_bytes())
}
/// Packs a four-byte tag into a `u32`, first byte most significant.
fn tag_to_u32(tag: Tag) -> u32 {
    let raw: [u8; 4] = tag.into_bytes();
    u32::from_be_bytes(raw)
}
/// Packs a four-byte marker name (such as `b"_VAR"`) into a `u32`, first byte most
/// significant, so that it can be used as an inverted-index key alongside real tags.
fn tag_marker(name: &[u8; 4]) -> u32 {
    let [b0, b1, b2, b3] = *name;
    (u32::from(b0) << 24) | (u32::from(b1) << 16) | (u32::from(b2) << 8) | u32::from(b3)
}
fn build_cmap_bitmap(codepoints: &[char]) -> Vec<u8> {
if codepoints.is_empty() {
return Vec::new();
}
let mut bitmap = RoaringBitmap::new();
for &cp in codepoints {
bitmap.insert(cp as u32);
}
let mut buf = Vec::new();
bitmap.serialize_into(&mut buf).unwrap_or_default();
buf
}
/// Intersects `other` into an optional accumulator.
///
/// With no accumulator yet, `other` becomes the result; otherwise the result is the
/// intersection of both bitmaps.
fn intersect_optional(opt: Option<RoaringBitmap>, other: RoaringBitmap) -> RoaringBitmap {
    if let Some(mut narrowed) = opt {
        narrowed &= &other;
        narrowed
    } else {
        other
    }
}
/// Tests a face's family class against a filter.
///
/// The major class must equal the filter's. The subclass is compared only when the
/// filter specifies one.
fn matches_family_class(major: u8, sub: u8, filter: &FamilyClassFilter) -> bool {
    let major_ok = major == filter.major;
    let sub_ok = filter.subclass.map_or(true, |wanted| wanted == sub);
    major_ok && sub_ok
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
/// Builds a `TypgFontFaceMeta` fixture from the values a test cares about.
///
/// Axis tags, table tags, creator/license names and the optional name fields are left
/// empty or `None`.
fn make_mock_meta(
    names: Vec<String>,
    feature_tags: &[Tag],
    script_tags: &[Tag],
    codepoints: &[char],
    is_variable: bool,
    weight_class: Option<u16>,
    width_class: Option<u16>,
    family_class: Option<(u8, u8)>,
) -> TypgFontFaceMeta {
    let feature_tags = Vec::from(feature_tags);
    let script_tags = Vec::from(script_tags);
    let codepoints = Vec::from(codepoints);

    TypgFontFaceMeta {
        names,
        feature_tags,
        script_tags,
        codepoints,
        is_variable,
        weight_class,
        width_class,
        family_class,
        axis_tags: vec![],
        table_tags: vec![],
        creator_names: vec![],
        license_names: vec![],
        psname: None,
        tfname: None,
        lfname: None,
        tsname: None,
        lsname: None,
    }
}
/// A freshly created index opens successfully and contains no records.
#[test]
fn test_index_open_and_close() {
    let scratch = TempDir::new().unwrap();
    let fresh = FontIndex::open(scratch.path()).unwrap();
    let stored = fresh.count().unwrap();
    assert_eq!(stored, 0);
}
/// A face added with a feature tag is found by a query for that feature.
#[test]
fn test_add_and_query_font() {
    let scratch = TempDir::new().unwrap();
    let index = FontIndex::open(scratch.path()).unwrap();

    let font_path = Path::new("/test/font.ttf");
    let small_caps = Tag::new(b"smcp");
    let fixture = make_mock_meta(
        vec![String::from("Test Font")],
        &[small_caps],
        &[Tag::new(b"latn")],
        &['a', 'b', 'c'],
        false,
        Some(400),
        Some(5),
        Some((8, 1)),
    );

    let mut writer = index.writer().unwrap();
    writer
        .add_font(font_path, None, SystemTime::UNIX_EPOCH, &fixture)
        .unwrap();
    writer.commit().unwrap();
    assert_eq!(index.count().unwrap(), 1);

    let reader = index.reader().unwrap();
    let by_feature = Query::new().with_features(vec![small_caps]);
    let found = reader.find(&by_feature).unwrap();
    assert_eq!(found.len(), 1);
    assert_eq!(found[0].source.path, font_path);
}
/// `needs_update` is true for an unknown path, false once the path is indexed with the
/// same modification time, and true again for a different modification time.
#[test]
fn test_incremental_update() {
    let scratch = TempDir::new().unwrap();
    let index = FontIndex::open(scratch.path()).unwrap();
    let font_path = Path::new("/test/font.ttf");
    let indexed_at = SystemTime::UNIX_EPOCH;
    let touched_at = SystemTime::UNIX_EPOCH + std::time::Duration::from_secs(1000);

    // First pass: the path is unknown, so it must be indexed.
    {
        let mut first_pass = index.writer().unwrap();
        assert!(first_pass.needs_update(font_path, indexed_at).unwrap());
        let fixture = make_mock_meta(
            vec![String::from("V1")],
            &[],
            &[],
            &[],
            false,
            None,
            None,
            None,
        );
        first_pass
            .add_font(font_path, None, indexed_at, &fixture)
            .unwrap();
        first_pass.commit().unwrap();
    }

    // Same modification time: nothing to do.
    {
        let unchanged = index.writer().unwrap();
        let stale = unchanged.needs_update(font_path, indexed_at).unwrap();
        assert!(!stale);
        unchanged.abort();
    }

    // Different modification time: the entry is out of date.
    {
        let changed = index.writer().unwrap();
        let stale = changed.needs_update(font_path, touched_at).unwrap();
        assert!(stale);
        changed.abort();
    }
}
/// Querying several features returns only the faces that carry all of them.
#[test]
fn test_bitmap_intersection() {
    let scratch = TempDir::new().unwrap();
    let index = FontIndex::open(scratch.path()).unwrap();
    let small_caps = Tag::new(b"smcp");
    let ligatures = Tag::new(b"liga");

    {
        let mut writer = index.writer().unwrap();

        // Font1 carries both features, Font2 only small caps.
        let both = make_mock_meta(
            vec![String::from("Font1")],
            &[small_caps, ligatures],
            &[],
            &[],
            false,
            None,
            None,
            None,
        );
        writer
            .add_font(Path::new("/font1.ttf"), None, SystemTime::UNIX_EPOCH, &both)
            .unwrap();

        let only_smcp = make_mock_meta(
            vec![String::from("Font2")],
            &[small_caps],
            &[],
            &[],
            false,
            None,
            None,
            None,
        );
        writer
            .add_font(
                Path::new("/font2.ttf"),
                None,
                SystemTime::UNIX_EPOCH,
                &only_smcp,
            )
            .unwrap();

        writer.commit().unwrap();
    }

    let reader = index.reader().unwrap();

    let wants_smcp = Query::new().with_features(vec![small_caps]);
    let smcp_hits = reader.find(&wants_smcp).unwrap();
    assert_eq!(smcp_hits.len(), 2);

    let wants_both = Query::new().with_features(vec![small_caps, ligatures]);
    let both_hits = reader.find(&wants_both).unwrap();
    assert_eq!(both_hits.len(), 1);
    assert_eq!(both_hits[0].source.path, Path::new("/font1.ttf"));
}
/// The encoded codepoint bitmap round-trips: it contains every input codepoint and
/// nothing that was not supplied.
#[test]
fn test_cmap_bitmap() {
    let supplied = ['a', 'b', 'c', 'ñ', '中'];
    let encoded = build_cmap_bitmap(&supplied);
    let decoded = RoaringBitmap::deserialize_from(encoded.as_slice()).unwrap();
    assert!(supplied.iter().all(|&ch| decoded.contains(ch as u32)));
    assert!(!decoded.contains('z' as u32));
}
/// Pruning removes the record whose path does not exist and keeps the one whose path does.
#[test]
fn test_prune_missing() {
    let scratch = TempDir::new().unwrap();
    let index = FontIndex::open(scratch.path()).unwrap();

    {
        let mut writer = index.writer().unwrap();

        let gone = make_mock_meta(
            vec![String::from("Missing")],
            &[],
            &[],
            &[],
            false,
            None,
            None,
            None,
        );
        writer
            .add_font(
                Path::new("/nonexistent/font.ttf"),
                None,
                SystemTime::UNIX_EPOCH,
                &gone,
            )
            .unwrap();

        // The crate's manifest directory is a path that certainly exists during tests.
        let present_path = Path::new(env!("CARGO_MANIFEST_DIR"));
        let present = make_mock_meta(
            vec![String::from("Existing")],
            &[],
            &[],
            &[],
            false,
            None,
            None,
            None,
        );
        writer
            .add_font(present_path, None, SystemTime::UNIX_EPOCH, &present)
            .unwrap();

        writer.commit().unwrap();
    }
    assert_eq!(index.count().unwrap(), 2);

    {
        let mut writer = index.writer().unwrap();
        let (before, after) = writer.prune_missing().unwrap();
        writer.commit().unwrap();
        assert_eq!(before, 2);
        assert_eq!(after, 1);
    }
    assert_eq!(index.count().unwrap(), 1);

    let reader = index.reader().unwrap();
    let survivors = reader.list_all().unwrap();
    assert_eq!(survivors.len(), 1);
    let kept_existing = survivors[0]
        .metadata
        .names
        .iter()
        .any(|name| name.contains("Existing"));
    assert!(kept_existing);
}
}