use crate::api::wikipedia::{WikiArticle, WikiLifeHistoryFallback};
use crate::curated_animals::CURATED_ANIMAL_SPECIES;
use crate::species::UnifiedSpecies;
use rusqlite::{params, Connection, OptionalExtension};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
const SCHEMA_VERSION: i32 = 5;
const DEFAULT_CACHE_TTL_SECS: i64 = 60 * 60 * 24 * 30; const MAP_CACHE_VERSION: u32 = 3;
const APP_DATA_DIR: &str = "biodex";
const LEGACY_APP_DATA_DIR: &str = "ncbi_poketext";
const BUNDLED_SEED_DB: &[u8] = include_bytes!("../assets/biodex.seed.db");
fn curated_species_sql_filter() -> String {
let names = CURATED_ANIMAL_SPECIES
.iter()
.map(|name| format!("'{}'", name.to_ascii_lowercase().replace('\'', "''")))
.collect::<Vec<_>>()
.join(", ");
format!(
"lower(json_extract(data_json, '$.scientific_name')) IN ({})",
names
)
}
pub struct LocalDatabase {
conn: Connection,
cache_ttl_secs: i64,
}
#[derive(Debug, Clone)]
pub struct CachedSpecies {
pub species: UnifiedSpecies,
pub species_image: Option<Vec<u8>>,
pub map_image: Option<Vec<u8>>,
pub cached_at: i64,
}
#[derive(Debug, Clone, Default)]
pub struct CachedMedia {
pub species_image: Option<Vec<u8>>,
pub map_image: Option<Vec<u8>>,
}
#[derive(Debug, Clone)]
pub struct TaxonName {
pub gbif_key: u64,
pub scientific_name: String,
pub canonical_name: Option<String>,
pub rank: String,
pub kingdom: Option<String>,
pub phylum: Option<String>,
pub class: Option<String>,
pub order: Option<String>,
pub family: Option<String>,
pub genus: Option<String>,
}
#[derive(Debug, Clone, Default)]
pub struct DatabaseStats {
pub species_count: u64,
pub rich_species_count: u64,
pub taxon_names_count: u64,
pub images_count: u64,
pub total_size_bytes: u64,
}
impl LocalDatabase {
pub fn open() -> rusqlite::Result<Self> {
Self::migrate_legacy_data_dir_if_needed();
let db_path = Self::db_path();
if let Some(parent) = db_path.parent() {
fs::create_dir_all(parent).ok();
}
Self::install_bundled_seed_if_needed(&db_path);
let conn = Connection::open(&db_path)?;
Self::configure_connection(&conn, true)?;
let mut db = Self {
conn,
cache_ttl_secs: DEFAULT_CACHE_TTL_SECS,
};
db.init_schema()?;
Ok(db)
}
#[cfg(test)]
pub fn open_in_memory() -> rusqlite::Result<Self> {
let conn = Connection::open_in_memory()?;
Self::configure_connection(&conn, false)?;
let mut db = Self {
conn,
cache_ttl_secs: DEFAULT_CACHE_TTL_SECS,
};
db.init_schema()?;
Ok(db)
}
fn db_path() -> PathBuf {
Self::data_root_dir().join("species_cache.db")
}
fn legacy_db_path() -> PathBuf {
Self::legacy_data_root_dir().join("species_cache.db")
}
fn data_root_dir() -> PathBuf {
dirs::data_dir()
.unwrap_or_else(|| PathBuf::from("."))
.join(APP_DATA_DIR)
}
fn legacy_data_root_dir() -> PathBuf {
dirs::data_dir()
.unwrap_or_else(|| PathBuf::from("."))
.join(LEGACY_APP_DATA_DIR)
}
fn migrate_legacy_data_dir_if_needed() {
let current_root = Self::data_root_dir();
let legacy_root = Self::legacy_data_root_dir();
let current_db = Self::db_path();
let legacy_db = Self::legacy_db_path();
if !legacy_db.exists() {
return;
}
if Self::db_has_cached_content(¤t_db) || !Self::db_has_cached_content(&legacy_db) {
return;
}
if current_root.exists() && !Self::dir_can_be_replaced(¤t_root, ¤t_db) {
return;
}
if current_root.exists() {
let _ = std::fs::remove_dir_all(¤t_root);
}
if let Some(parent) = current_root.parent() {
let _ = std::fs::create_dir_all(parent);
}
if std::fs::rename(&legacy_root, ¤t_root).is_ok() {
return;
}
let _ = std::fs::create_dir_all(¤t_root);
let _ = std::fs::copy(&legacy_db, ¤t_db);
for suffix in ["-wal", "-shm"] {
let legacy_sidecar = PathBuf::from(format!("{}{}", legacy_db.display(), suffix));
let current_sidecar = PathBuf::from(format!("{}{}", current_db.display(), suffix));
if legacy_sidecar.exists() {
let _ = std::fs::copy(legacy_sidecar, current_sidecar);
}
}
}
fn dir_can_be_replaced(dir: &Path, current_db: &Path) -> bool {
let Ok(entries) = std::fs::read_dir(dir) else {
return false;
};
for entry in entries.flatten() {
let path = entry.path();
if path == current_db {
continue;
}
return false;
}
true
}
fn db_has_cached_content(path: &Path) -> bool {
if !path.exists() {
return false;
}
let Ok(conn) = Connection::open(path) else {
return false;
};
for table in ["species", "rich_species", "taxon_names", "images"] {
let sql = format!("SELECT COUNT(*) FROM {table}");
let count: rusqlite::Result<i64> = conn.query_row(&sql, [], |row| row.get(0));
if count.unwrap_or(0) > 0 {
return true;
}
}
false
}
fn install_bundled_seed_if_needed(db_path: &Path) {
if db_path.exists() || BUNDLED_SEED_DB.is_empty() {
return;
}
let Some(parent) = db_path.parent() else {
return;
};
if fs::create_dir_all(parent).is_err() {
return;
}
let temp_path = parent.join("species_cache.seed.tmp");
if fs::write(&temp_path, BUNDLED_SEED_DB).is_err() {
let _ = fs::remove_file(&temp_path);
return;
}
if fs::rename(&temp_path, db_path).is_err() {
let _ = fs::remove_file(&temp_path);
}
}
fn configure_connection(conn: &Connection, enable_wal: bool) -> rusqlite::Result<()> {
conn.busy_timeout(Duration::from_secs(2))?;
if enable_wal {
conn.execute_batch(
"PRAGMA journal_mode = WAL;
PRAGMA synchronous = NORMAL;
PRAGMA temp_store = MEMORY;
PRAGMA cache_size = -8192;",
)?;
} else {
conn.execute_batch(
"PRAGMA synchronous = NORMAL;
PRAGMA temp_store = MEMORY;
PRAGMA cache_size = -4096;",
)?;
}
Ok(())
}
fn init_schema(&mut self) -> rusqlite::Result<()> {
let version: i32 = self
.conn
.query_row("PRAGMA user_version", [], |row| row.get(0))
.unwrap_or(0);
if version < SCHEMA_VERSION {
self.create_tables()?;
self.conn
.execute(&format!("PRAGMA user_version = {}", SCHEMA_VERSION), [])?;
}
self.cleanup_legacy_map_cache()?;
Ok(())
}
fn create_tables(&self) -> rusqlite::Result<()> {
self.conn.execute(
"CREATE TABLE IF NOT EXISTS species (
scientific_name TEXT PRIMARY KEY,
data_json TEXT NOT NULL,
cached_at INTEGER NOT NULL,
last_accessed INTEGER NOT NULL
)",
[],
)?;
self.conn.execute(
"CREATE TABLE IF NOT EXISTS images (
url TEXT PRIMARY KEY,
image_data BLOB NOT NULL,
content_type TEXT,
cached_at INTEGER NOT NULL
)",
[],
)?;
self.conn.execute(
"CREATE TABLE IF NOT EXISTS wiki_articles (
title TEXT PRIMARY KEY,
extract TEXT NOT NULL,
wikitext TEXT NOT NULL,
cached_at INTEGER NOT NULL
)",
[],
)?;
self.conn.execute(
"CREATE TABLE IF NOT EXISTS wiki_life_history (
title TEXT PRIMARY KEY,
data_json TEXT NOT NULL,
cached_at INTEGER NOT NULL
)",
[],
)?;
self.conn.execute(
"CREATE TABLE IF NOT EXISTS rich_species (
scientific_name TEXT PRIMARY KEY,
data_json TEXT NOT NULL,
enriched_at INTEGER NOT NULL
)",
[],
)?;
self.conn.execute(
"CREATE TABLE IF NOT EXISTS taxon_names (
gbif_key INTEGER PRIMARY KEY,
scientific_name TEXT NOT NULL,
canonical_name TEXT,
rank TEXT NOT NULL,
kingdom TEXT,
phylum TEXT,
class TEXT,
order_name TEXT,
family TEXT,
genus TEXT
)",
[],
)?;
self.conn.execute(
"CREATE INDEX IF NOT EXISTS idx_taxon_scientific ON taxon_names(scientific_name COLLATE NOCASE)",
[],
)?;
self.conn.execute(
"CREATE INDEX IF NOT EXISTS idx_taxon_canonical ON taxon_names(canonical_name COLLATE NOCASE)",
[],
)?;
self.conn.execute(
"CREATE INDEX IF NOT EXISTS idx_taxon_rank ON taxon_names(rank)",
[],
)?;
self.conn.execute(
"CREATE INDEX IF NOT EXISTS idx_taxon_genus ON taxon_names(genus, rank)",
[],
)?;
self.conn.execute(
"CREATE INDEX IF NOT EXISTS idx_taxon_family ON taxon_names(family, rank)",
[],
)?;
self.conn.execute(
"CREATE INDEX IF NOT EXISTS idx_taxon_order ON taxon_names(order_name, rank)",
[],
)?;
self.conn.execute(
"CREATE INDEX IF NOT EXISTS idx_taxon_class ON taxon_names(class, rank)",
[],
)?;
self.conn.execute(
"CREATE INDEX IF NOT EXISTS idx_taxon_phylum ON taxon_names(phylum, rank)",
[],
)?;
self.conn.execute(
"CREATE INDEX IF NOT EXISTS idx_taxon_kingdom ON taxon_names(kingdom, rank)",
[],
)?;
self.conn.execute(
"CREATE TABLE IF NOT EXISTS ncbi_taxonomy (
tax_id INTEGER PRIMARY KEY,
scientific_name TEXT NOT NULL,
common_name TEXT,
rank TEXT
)",
[],
)?;
self.conn.execute(
"CREATE INDEX IF NOT EXISTS idx_ncbi_scientific ON ncbi_taxonomy(scientific_name COLLATE NOCASE)",
[],
)?;
self.conn.execute(
"CREATE INDEX IF NOT EXISTS idx_ncbi_common ON ncbi_taxonomy(common_name COLLATE NOCASE)",
[],
)?;
self.conn.execute(
"CREATE TABLE IF NOT EXISTS user_stats (
key TEXT PRIMARY KEY,
value TEXT NOT NULL
)",
[],
)?;
self.conn.execute(
"CREATE TABLE IF NOT EXISTS view_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
scientific_name TEXT NOT NULL,
viewed_at INTEGER NOT NULL
)",
[],
)?;
self.conn.execute(
"CREATE INDEX IF NOT EXISTS idx_history_time ON view_history(viewed_at DESC)",
[],
)?;
self.conn.execute(
"CREATE TABLE IF NOT EXISTS favorites (
scientific_name TEXT PRIMARY KEY,
added_at INTEGER NOT NULL,
notes TEXT
)",
[],
)?;
Ok(())
}
fn current_timestamp() -> i64 {
SystemTime::now()
.duration_since(UNIX_EPOCH)
.map(|d| d.as_secs() as i64)
.unwrap_or(0)
}
pub fn get_species(&self, scientific_name: &str) -> rusqlite::Result<Option<CachedSpecies>> {
let now = Self::current_timestamp();
let cutoff = now - self.cache_ttl_secs;
let result: Option<(String, i64)> = self
.conn
.query_row(
"SELECT data_json, cached_at FROM species
WHERE scientific_name = ?1 AND cached_at > ?2",
params![scientific_name.to_lowercase(), cutoff],
|row| Ok((row.get(0)?, row.get(1)?)),
)
.optional()?;
if let Some((json, cached_at)) = result {
self.conn.execute(
"UPDATE species SET last_accessed = ?1 WHERE scientific_name = ?2",
params![now, scientific_name.to_lowercase()],
)?;
if let Ok(species) = serde_json::from_str::<UnifiedSpecies>(&json) {
let species_image = self.get_species_image(&species)?;
let map_image = self.get_map_image(&species)?;
return Ok(Some(CachedSpecies {
species,
species_image,
map_image,
cached_at,
}));
}
}
Ok(None)
}
pub fn cache_species(&self, species: &UnifiedSpecies) -> rusqlite::Result<()> {
let now = Self::current_timestamp();
let json = serde_json::to_string(species).unwrap_or_default();
self.conn.execute(
"INSERT OR REPLACE INTO species (scientific_name, data_json, cached_at, last_accessed)
VALUES (?1, ?2, ?3, ?3)",
params![species.scientific_name.to_lowercase(), json, now],
)?;
self.conn.execute(
"INSERT INTO view_history (scientific_name, viewed_at) VALUES (?1, ?2)",
params![species.scientific_name, now],
)?;
Ok(())
}
pub fn get_rich_species(
&self,
scientific_name: &str,
) -> rusqlite::Result<Option<UnifiedSpecies>> {
let data_json: Option<String> = self
.conn
.query_row(
"SELECT data_json FROM rich_species WHERE scientific_name = ?1",
params![scientific_name.to_lowercase()],
|row| row.get(0),
)
.optional()?;
Ok(data_json.and_then(|json| serde_json::from_str(&json).ok()))
}
pub fn cache_rich_species(&self, species: &UnifiedSpecies) -> rusqlite::Result<()> {
let now = Self::current_timestamp();
let json = serde_json::to_string(species).unwrap_or_default();
self.conn.execute(
"INSERT OR REPLACE INTO rich_species (scientific_name, data_json, enriched_at)
VALUES (?1, ?2, ?3)",
params![species.scientific_name.to_lowercase(), json, now],
)?;
Ok(())
}
fn get_species_image(&self, species: &UnifiedSpecies) -> rusqlite::Result<Option<Vec<u8>>> {
if let Some(url) = species.preferred_image_url() {
self.get_image(url)
} else {
Ok(None)
}
}
fn get_map_image(&self, species: &UnifiedSpecies) -> rusqlite::Result<Option<Vec<u8>>> {
if let Some(gbif_key) = species.ids.gbif_key {
self.get_map_image_by_key(gbif_key)
} else {
Ok(None)
}
}
pub fn get_cached_media(
&self,
species_image_url: Option<&str>,
gbif_key: Option<u64>,
) -> rusqlite::Result<CachedMedia> {
let species_image = match species_image_url {
Some(url) => self.get_image(url)?,
None => None,
};
let map_image = match gbif_key {
Some(key) => self.get_map_image_by_key(key)?,
None => None,
};
Ok(CachedMedia {
species_image,
map_image,
})
}
fn get_map_image_by_key(&self, gbif_key: u64) -> rusqlite::Result<Option<Vec<u8>>> {
let url = Self::map_cache_key(gbif_key);
if let Some(image) = self.get_image(&url)? {
return Ok(Some(image));
}
self.delete_image(&Self::legacy_map_cache_key(gbif_key))?;
Ok(None)
}
pub fn get_image(&self, url: &str) -> rusqlite::Result<Option<Vec<u8>>> {
self.conn
.query_row(
"SELECT image_data FROM images WHERE url = ?1",
params![url],
|row| row.get(0),
)
.optional()
}
pub fn cache_image(
&self,
url: &str,
data: &[u8],
content_type: Option<&str>,
) -> rusqlite::Result<()> {
let now = Self::current_timestamp();
self.conn.execute(
"INSERT OR REPLACE INTO images (url, image_data, content_type, cached_at)
VALUES (?1, ?2, ?3, ?4)",
params![url, data, content_type, now],
)?;
Ok(())
}
pub fn delete_image(&self, url: &str) -> rusqlite::Result<()> {
self.conn
.execute("DELETE FROM images WHERE url = ?1", params![url])?;
Ok(())
}
pub fn cache_species_image(
&self,
species: &UnifiedSpecies,
data: &[u8],
) -> rusqlite::Result<()> {
if let Some(url) = species.preferred_image_url() {
self.cache_image(url, data, Some("image/jpeg"))?;
}
Ok(())
}
pub fn cache_map_image(&self, gbif_key: u64, data: &[u8]) -> rusqlite::Result<()> {
let url = Self::map_cache_key(gbif_key);
self.cache_image(&url, data, Some("image/png"))
}
pub fn invalidate_map_image(&self, gbif_key: u64) -> rusqlite::Result<()> {
self.delete_image(&Self::map_cache_key(gbif_key))?;
self.delete_image(&Self::legacy_map_cache_key(gbif_key))?;
Ok(())
}
pub fn get_wiki_article(&self, title: &str) -> rusqlite::Result<Option<WikiArticle>> {
let cutoff = Self::current_timestamp() - self.cache_ttl_secs;
self.conn
.query_row(
"SELECT extract, wikitext FROM wiki_articles
WHERE title = ?1 AND cached_at > ?2",
params![title.to_lowercase(), cutoff],
|row| {
Ok(WikiArticle {
extract: row.get(0)?,
wikitext: row.get(1)?,
})
},
)
.optional()
}
pub fn cache_wiki_article(&self, title: &str, article: &WikiArticle) -> rusqlite::Result<()> {
let now = Self::current_timestamp();
self.conn.execute(
"INSERT OR REPLACE INTO wiki_articles (title, extract, wikitext, cached_at)
VALUES (?1, ?2, ?3, ?4)",
params![
title.to_lowercase(),
&article.extract,
&article.wikitext,
now
],
)?;
Ok(())
}
pub fn get_wiki_life_history(
&self,
title: &str,
) -> rusqlite::Result<Option<WikiLifeHistoryFallback>> {
let cutoff = Self::current_timestamp() - self.cache_ttl_secs;
let data_json: Option<String> = self
.conn
.query_row(
"SELECT data_json FROM wiki_life_history
WHERE title = ?1 AND cached_at > ?2",
params![title.to_lowercase(), cutoff],
|row| row.get(0),
)
.optional()?;
Ok(data_json.and_then(|json| serde_json::from_str(&json).ok()))
}
pub fn cache_wiki_life_history(
&self,
title: &str,
fallback: &WikiLifeHistoryFallback,
) -> rusqlite::Result<()> {
let now = Self::current_timestamp();
let data_json = serde_json::to_string(fallback).unwrap_or_default();
self.conn.execute(
"INSERT OR REPLACE INTO wiki_life_history (title, data_json, cached_at)
VALUES (?1, ?2, ?3)",
params![title.to_lowercase(), data_json, now],
)?;
Ok(())
}
fn map_cache_key(gbif_key: u64) -> String {
format!("gbif_map_v{}_{}", MAP_CACHE_VERSION, gbif_key)
}
fn legacy_map_cache_key(gbif_key: u64) -> String {
format!("gbif_map_{}", gbif_key)
}
fn cleanup_legacy_map_cache(&self) -> rusqlite::Result<()> {
self.conn.execute(
"DELETE FROM images
WHERE url GLOB 'gbif_map_[0-9]*'
AND url NOT GLOB 'gbif_map_v*'",
[],
)?;
Ok(())
}
pub fn search_taxon_names(&self, query: &str, limit: u32) -> rusqlite::Result<Vec<TaxonName>> {
let normalized = query.trim().to_ascii_lowercase();
if normalized.is_empty() {
return Ok(Vec::new());
}
let sql = format!(
"SELECT data_json FROM rich_species WHERE {}",
curated_species_sql_filter()
);
let mut stmt = self.conn.prepare(&sql)?;
let rows = stmt.query_map([], |row| row.get::<_, String>(0))?;
let mut matches: HashMap<(String, String), (u8, usize, TaxonName)> = HashMap::new();
let push_match = |matches: &mut HashMap<(String, String), (u8, usize, TaxonName)>,
name: String,
rank: &str,
priority: u8,
taxonomy: &crate::species::Taxonomy,
scientific_name: &str| {
let key = (name.to_ascii_lowercase(), rank.to_ascii_uppercase());
let candidate = TaxonName {
gbif_key: 0,
scientific_name: name.clone(),
canonical_name: Some(name.clone()),
rank: rank.to_ascii_uppercase(),
kingdom: taxonomy.kingdom.clone(),
phylum: taxonomy.phylum.clone(),
class: taxonomy.class.clone(),
order: taxonomy.order.clone(),
family: taxonomy.family.clone(),
genus: taxonomy.genus.clone(),
};
let length = scientific_name.len().min(name.len());
match matches.get(&key) {
Some((existing_priority, existing_length, _))
if (*existing_priority, *existing_length) <= (priority, length) => {}
_ => {
matches.insert(key, (priority, length, candidate));
}
}
};
for row in rows {
let json = row?;
let species: UnifiedSpecies = match serde_json::from_str(&json) {
Ok(species) => species,
Err(_) => continue,
};
let scientific = species.scientific_name.clone();
let scientific_lower = scientific.to_ascii_lowercase();
let common_match = species
.common_names
.iter()
.any(|name| name.to_ascii_lowercase().contains(&normalized));
let scientific_match = scientific_lower.contains(&normalized);
if scientific_match || common_match {
let priority = if scientific_lower.starts_with(&normalized) {
0
} else if species
.common_names
.iter()
.any(|name| name.to_ascii_lowercase().starts_with(&normalized))
{
1
} else {
2
};
push_match(
&mut matches,
scientific.clone(),
"SPECIES",
priority,
&species.taxonomy,
&scientific,
);
}
for (rank, value) in [
("KINGDOM", species.taxonomy.kingdom.as_deref()),
("PHYLUM", species.taxonomy.phylum.as_deref()),
("CLASS", species.taxonomy.class.as_deref()),
("ORDER", species.taxonomy.order.as_deref()),
("FAMILY", species.taxonomy.family.as_deref()),
("GENUS", species.taxonomy.genus.as_deref()),
] {
let Some(value) = value else {
continue;
};
let lower = value.to_ascii_lowercase();
if lower.contains(&normalized) {
let priority = if lower.starts_with(&normalized) { 0 } else { 2 };
push_match(
&mut matches,
value.to_string(),
rank,
priority,
&species.taxonomy,
&scientific,
);
}
}
}
let mut results = matches.into_values().collect::<Vec<_>>();
results.sort_by(|a, b| {
a.0.cmp(&b.0)
.then(a.1.cmp(&b.1))
.then_with(|| a.2.scientific_name.cmp(&b.2.scientific_name))
.then_with(|| a.2.rank.cmp(&b.2.rank))
});
results.truncate(limit as usize);
Ok(results.into_iter().map(|(_, _, taxon)| taxon).collect())
}
pub fn taxon_names_count(&self) -> rusqlite::Result<u64> {
self.conn
.query_row("SELECT COUNT(*) FROM taxon_names", [], |row| row.get(0))
}
pub fn species_rank_count(&self) -> rusqlite::Result<u64> {
self.conn.query_row(
"SELECT COUNT(*) FROM taxon_names WHERE rank = 'SPECIES'",
[],
|row| row.get(0),
)
}
pub fn insert_taxon_names_batch(&self, names: &[TaxonName]) -> rusqlite::Result<()> {
let tx = self.conn.unchecked_transaction()?;
{
let mut stmt = tx.prepare(
"INSERT OR IGNORE INTO taxon_names
(gbif_key, scientific_name, canonical_name, rank, kingdom, phylum, class, order_name, family, genus)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10)"
)?;
for name in names {
stmt.execute(params![
name.gbif_key,
name.scientific_name,
name.canonical_name,
name.rank,
name.kingdom,
name.phylum,
name.class,
name.order,
name.family,
name.genus,
])?;
}
}
tx.commit()
}
pub fn has_backbone(&self) -> bool {
self.taxon_names_count().unwrap_or(0) > 100_000
}
pub fn has_ncbi_taxonomy(&self) -> bool {
self.ncbi_taxonomy_count().unwrap_or(0) > 100_000
}
pub fn ncbi_taxonomy_count(&self) -> rusqlite::Result<u64> {
self.conn
.query_row("SELECT COUNT(*) FROM ncbi_taxonomy", [], |row| row.get(0))
}
pub fn insert_ncbi_taxonomy_batch(
&self,
entries: &[crate::bulk_import::NcbiTaxonEntry],
) -> rusqlite::Result<()> {
let tx = self.conn.unchecked_transaction()?;
{
let mut stmt = tx.prepare(
"INSERT OR REPLACE INTO ncbi_taxonomy (tax_id, scientific_name, common_name, rank)
VALUES (?1, ?2, ?3, ?4)",
)?;
for entry in entries {
stmt.execute(params![
entry.tax_id,
entry.scientific_name,
entry.common_name,
entry.rank,
])?;
}
}
tx.commit()
}
pub fn get_taxon_by_name(&self, name: &str) -> rusqlite::Result<Option<TaxonName>> {
self.conn
.query_row(
"SELECT gbif_key, scientific_name, canonical_name, rank,
kingdom, phylum, class, order_name, family, genus
FROM taxon_names
WHERE scientific_name = ?1 COLLATE NOCASE
OR canonical_name = ?1 COLLATE NOCASE
LIMIT 1",
params![name],
|row| {
Ok(TaxonName {
gbif_key: row.get(0)?,
scientific_name: row.get(1)?,
canonical_name: row.get(2)?,
rank: row.get(3)?,
kingdom: row.get(4)?,
phylum: row.get(5)?,
class: row.get(6)?,
order: row.get(7)?,
family: row.get(8)?,
genus: row.get(9)?,
})
},
)
.optional()
}
pub fn get_species_batch_after(
&self,
after_gbif_key: u64,
limit: u32,
) -> rusqlite::Result<Vec<TaxonName>> {
let mut stmt = self.conn.prepare(
"SELECT gbif_key, scientific_name, canonical_name, rank,
kingdom, phylum, class, order_name, family, genus
FROM taxon_names
WHERE rank = 'SPECIES'
AND gbif_key > ?1
AND (EXISTS (SELECT 1 FROM rich_species rs WHERE rs.scientific_name = IFNULL(taxon_names.canonical_name, taxon_names.scientific_name) COLLATE NOCASE)
OR EXISTS (SELECT 1 FROM species sp WHERE sp.scientific_name = IFNULL(taxon_names.canonical_name, taxon_names.scientific_name) COLLATE NOCASE))
ORDER BY gbif_key
LIMIT ?2",
)?;
let rows = stmt.query_map(params![after_gbif_key, limit], |row| {
Ok(TaxonName {
gbif_key: row.get(0)?,
scientific_name: row.get(1)?,
canonical_name: row.get(2)?,
rank: row.get(3)?,
kingdom: row.get(4)?,
phylum: row.get(5)?,
class: row.get(6)?,
order: row.get(7)?,
family: row.get(8)?,
genus: row.get(9)?,
})
})?;
rows.collect()
}
pub fn get_cached_species_names(&self, limit: u32) -> rusqlite::Result<Vec<String>> {
let mut stmt = self.conn.prepare(&format!(
"SELECT json_extract(data_json, '$.scientific_name') AS scientific_name
FROM rich_species
WHERE lower(json_extract(data_json, '$.rank')) = 'species'
AND {}
AND COALESCE(json_extract(data_json, '$.description'), '') != 'Locally materialized from the offline taxonomy cache.'
ORDER BY scientific_name COLLATE NOCASE
LIMIT ?1",
curated_species_sql_filter()
))?;
let rows = stmt.query_map(params![limit], |row| row.get::<_, String>(0))?;
rows.collect()
}
pub fn get_cached_kingdoms(&self) -> rusqlite::Result<Vec<String>> {
let mut stmt = self.conn.prepare(
&format!(
"SELECT DISTINCT json_extract(data_json, '$.taxonomy.kingdom')\n FROM rich_species\n WHERE json_extract(data_json, '$.taxonomy.kingdom') IS NOT NULL\n AND {}\n ORDER BY 1 COLLATE NOCASE",
curated_species_sql_filter()
),
)?;
let rows = stmt.query_map([], |row| row.get::<_, String>(0))?;
rows.collect()
}
pub fn get_cached_parent_taxon(
&self,
child_rank: &str,
child_value: &str,
) -> rusqlite::Result<Option<(String, String)>> {
let parent_rank = match child_rank.to_ascii_uppercase().as_str() {
"GENUS" => "FAMILY",
"FAMILY" => "ORDER",
"ORDER" => "CLASS",
"CLASS" => "PHYLUM",
"PHYLUM" => "KINGDOM",
_ => return Ok(None),
};
let child_path = match child_rank.to_ascii_uppercase().as_str() {
"KINGDOM" => "$.taxonomy.kingdom",
"PHYLUM" => "$.taxonomy.phylum",
"CLASS" => "$.taxonomy.class",
"ORDER" => "$.taxonomy.order",
"FAMILY" => "$.taxonomy.family",
"GENUS" => "$.taxonomy.genus",
_ => return Ok(None),
};
let parent_path = match parent_rank {
"KINGDOM" => "$.taxonomy.kingdom",
"PHYLUM" => "$.taxonomy.phylum",
"CLASS" => "$.taxonomy.class",
"ORDER" => "$.taxonomy.order",
"FAMILY" => "$.taxonomy.family",
_ => return Ok(None),
};
let sql = format!(
"SELECT DISTINCT json_extract(data_json, '{}')
FROM rich_species
WHERE json_extract(data_json, '{}') = ?1
AND json_extract(data_json, '{}') IS NOT NULL
AND {}
LIMIT 1",
parent_path,
child_path,
parent_path,
curated_species_sql_filter()
);
self.conn
.query_row(&sql, params![child_value], |row| row.get::<_, String>(0))
.optional()
.map(|value| value.map(|name| (name, parent_rank.to_string())))
}
pub fn get_siblings(
&self,
parent_rank: &str,
parent_value: &str,
child_rank: &str,
limit: u32,
) -> rusqlite::Result<Vec<TaxonName>> {
let parent_path = match parent_rank.to_ascii_uppercase().as_str() {
"KINGDOM" => "$.taxonomy.kingdom",
"PHYLUM" => "$.taxonomy.phylum",
"CLASS" => "$.taxonomy.class",
"ORDER" => "$.taxonomy.order",
"FAMILY" => "$.taxonomy.family",
"GENUS" => "$.taxonomy.genus",
_ => return Ok(Vec::new()),
};
let target_rank = if child_rank.is_empty() {
match parent_rank.to_ascii_uppercase().as_str() {
"KINGDOM" => "PHYLUM",
"PHYLUM" => "CLASS",
"CLASS" => "ORDER",
"ORDER" => "FAMILY",
"FAMILY" => "GENUS",
"GENUS" => "SPECIES",
_ => return Ok(Vec::new()),
}
} else {
child_rank
};
let child_path = match target_rank.to_ascii_uppercase().as_str() {
"KINGDOM" => "$.taxonomy.kingdom",
"PHYLUM" => "$.taxonomy.phylum",
"CLASS" => "$.taxonomy.class",
"ORDER" => "$.taxonomy.order",
"FAMILY" => "$.taxonomy.family",
"GENUS" => "$.taxonomy.genus",
"SPECIES" => "$.scientific_name",
_ => return Ok(Vec::new()),
};
let sql = format!(
"SELECT DISTINCT json_extract(data_json, '{child_path}')
FROM rich_species
WHERE json_extract(data_json, '{parent_path}') = ?1
AND json_extract(data_json, '{child_path}') IS NOT NULL
AND {}
ORDER BY 1 COLLATE NOCASE
LIMIT ?2",
curated_species_sql_filter()
);
let mut stmt = self.conn.prepare(&sql)?;
let rows = stmt.query_map(params![parent_value, limit], |row| {
let name: String = row.get(0)?;
Ok(TaxonName {
gbif_key: 0,
scientific_name: name.clone(),
canonical_name: Some(name),
rank: target_rank.to_ascii_uppercase(),
kingdom: None,
phylum: None,
class: None,
order: None,
family: None,
genus: None,
})
})?;
rows.collect()
}
pub fn get_species_in_genus(
&self,
genus: &str,
limit: u32,
) -> rusqlite::Result<Vec<TaxonName>> {
self.get_siblings("GENUS", genus, "SPECIES", limit)
}
pub fn get_genera_in_family(
&self,
family: &str,
limit: u32,
) -> rusqlite::Result<Vec<TaxonName>> {
self.get_siblings("FAMILY", family, "GENUS", limit)
}
pub fn get_stats(&self) -> rusqlite::Result<DatabaseStats> {
let species_count: u64 =
self.conn
.query_row("SELECT COUNT(*) FROM species", [], |row| row.get(0))?;
let rich_species_count: u64 =
self.conn
.query_row("SELECT COUNT(*) FROM rich_species", [], |row| row.get(0))?;
let taxon_names_count: u64 =
self.conn
.query_row("SELECT COUNT(*) FROM taxon_names", [], |row| row.get(0))?;
let images_count: u64 = self
.conn
.query_row("SELECT COUNT(*) FROM images", [], |row| row.get(0))?;
let db_path = Self::db_path();
let total_size_bytes = std::fs::metadata(&db_path).map(|m| m.len()).unwrap_or(0);
Ok(DatabaseStats {
species_count,
rich_species_count,
taxon_names_count,
images_count,
total_size_bytes,
})
}
pub fn get_user_stat(&self, key: &str) -> rusqlite::Result<Option<String>> {
self.conn
.query_row(
"SELECT value FROM user_stats WHERE key = ?1",
params![key],
|row| row.get(0),
)
.optional()
}
pub fn set_user_stat(&self, key: &str, value: &str) -> rusqlite::Result<()> {
self.conn.execute(
"INSERT OR REPLACE INTO user_stats (key, value) VALUES (?1, ?2)",
params![key, value],
)?;
Ok(())
}
pub fn delete_user_stat(&self, key: &str) -> rusqlite::Result<()> {
self.conn
.execute("DELETE FROM user_stats WHERE key = ?1", params![key])?;
Ok(())
}
pub fn get_recent_history(&self, limit: u32) -> rusqlite::Result<Vec<(String, i64)>> {
let mut stmt = self.conn.prepare(
"SELECT DISTINCT scientific_name, MAX(viewed_at) as last_viewed
FROM view_history
GROUP BY scientific_name
ORDER BY last_viewed DESC
LIMIT ?1",
)?;
let rows = stmt.query_map(params![limit], |row| Ok((row.get(0)?, row.get(1)?)))?;
rows.collect()
}
pub fn add_favorite(&self, scientific_name: &str, notes: Option<&str>) -> rusqlite::Result<()> {
let now = Self::current_timestamp();
self.conn.execute(
"INSERT OR REPLACE INTO favorites (scientific_name, added_at, notes)
VALUES (?1, ?2, ?3)",
params![scientific_name, now, notes],
)?;
Ok(())
}
pub fn remove_favorite(&self, scientific_name: &str) -> rusqlite::Result<()> {
self.conn.execute(
"DELETE FROM favorites WHERE scientific_name = ?1",
params![scientific_name],
)?;
Ok(())
}
pub fn is_favorite(&self, scientific_name: &str) -> rusqlite::Result<bool> {
let count: u64 = self.conn.query_row(
"SELECT COUNT(*) FROM favorites WHERE scientific_name = ?1",
params![scientific_name],
|row| row.get(0),
)?;
Ok(count > 0)
}
pub fn get_favorites(&self) -> rusqlite::Result<Vec<(String, i64, Option<String>)>> {
let mut stmt = self.conn.prepare(
"SELECT scientific_name, added_at, notes FROM favorites ORDER BY added_at DESC",
)?;
let rows = stmt.query_map([], |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)))?;
rows.collect()
}
pub fn invalidate_species(&self, scientific_name: &str) -> rusqlite::Result<()> {
self.conn.execute(
"DELETE FROM species WHERE scientific_name = ?1",
params![scientific_name.to_lowercase()],
)?;
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
#[test]
fn test_database_creation() {
let db = LocalDatabase::open_in_memory().unwrap();
let stats = db.get_stats().unwrap();
assert_eq!(stats.species_count, 0);
}
#[test]
fn installs_bundled_seed_into_missing_database_path() {
let unique = format!(
"biodex_seed_test_{}_{}",
std::process::id(),
LocalDatabase::current_timestamp()
);
let root = std::env::temp_dir().join(unique);
let db_path = root.join("species_cache.db");
if root.exists() {
let _ = fs::remove_dir_all(&root);
}
fs::create_dir_all(&root).unwrap();
LocalDatabase::install_bundled_seed_if_needed(&db_path);
assert!(db_path.exists());
let conn = Connection::open(&db_path).unwrap();
let species_count: i64 = conn
.query_row("SELECT COUNT(*) FROM species", [], |row| row.get(0))
.unwrap();
let rich_species_count: i64 = conn
.query_row("SELECT COUNT(*) FROM rich_species", [], |row| row.get(0))
.unwrap();
let hot_seed_version: String = conn
.query_row(
"SELECT value FROM user_stats WHERE key = 'hot_seed.version'",
[],
|row| row.get(0),
)
.unwrap();
assert_eq!(species_count, CURATED_ANIMAL_SPECIES.len() as i64);
assert_eq!(rich_species_count, CURATED_ANIMAL_SPECIES.len() as i64);
assert_eq!(hot_seed_version, "1");
let _ = fs::remove_dir_all(&root);
}
#[test]
fn test_map_cache_key_is_versioned() {
assert_eq!(LocalDatabase::map_cache_key(5219408), "gbif_map_v3_5219408");
}
#[test]
fn test_legacy_map_cache_key() {
assert_eq!(
LocalDatabase::legacy_map_cache_key(5219408),
"gbif_map_5219408"
);
}
#[test]
fn gets_cached_media_without_loading_species_json() {
let db = LocalDatabase::open_in_memory().unwrap();
db.cache_image(
"https://example.com/inat.jpg",
b"species-bytes",
Some("image/jpeg"),
)
.unwrap();
db.cache_map_image(42, b"map-bytes").unwrap();
let media = db
.get_cached_media(Some("https://example.com/inat.jpg"), Some(42))
.unwrap();
assert_eq!(media.species_image, Some(b"species-bytes".to_vec()));
assert_eq!(media.map_image, Some(b"map-bytes".to_vec()));
}
#[test]
fn caches_wiki_article_and_life_history() {
let db = LocalDatabase::open_in_memory().unwrap();
let article = WikiArticle {
extract: "Lion extract".to_string(),
wikitext: "Lion wikitext".to_string(),
};
let fallback = WikiLifeHistoryFallback {
lifespan_years: Some(20.0),
length_meters: Some(1.8),
height_meters: None,
mass_kilograms: Some(188.0),
reproduction_modes: vec!["Sexual".to_string()],
};
db.cache_wiki_article("Panthera leo", &article).unwrap();
db.cache_wiki_life_history("Panthera leo", &fallback)
.unwrap();
assert_eq!(
db.get_wiki_article("Panthera leo")
.unwrap()
.unwrap()
.extract,
"Lion extract"
);
assert_eq!(
db.get_wiki_life_history("Panthera leo")
.unwrap()
.unwrap()
.lifespan_years,
Some(20.0)
);
}
#[test]
fn caches_rich_species_without_ttl_gate() {
let db = LocalDatabase::open_in_memory().unwrap();
let species = UnifiedSpecies {
scientific_name: "Panthera leo".to_string(),
common_names: vec!["Lion".to_string()],
rank: "species".to_string(),
taxonomy: crate::species::Taxonomy::default(),
ids: crate::species::ExternalIds::default(),
genome: crate::species::GenomeStats::default(),
life_history: crate::species::LifeHistory {
extraction_version: crate::species::CURRENT_LIFE_HISTORY_VERSION,
..crate::species::LifeHistory::default()
},
description: Some("Big cat".to_string()),
wikipedia_extract: None,
wikipedia_url: None,
conservation_status: None,
iucn_status: None,
observations_count: None,
gbif_occurrences: None,
top_countries: Vec::new(),
distribution: crate::species::Distribution::default(),
images: Vec::new(),
};
db.cache_rich_species(&species).unwrap();
let cached = db.get_rich_species("Panthera leo").unwrap().unwrap();
assert_eq!(cached.scientific_name, "Panthera leo");
assert_eq!(cached.common_names, vec!["Lion"]);
}
#[test]
fn search_taxon_names_stays_within_curated_species() {
let db = LocalDatabase::open_in_memory().unwrap();
let curated = UnifiedSpecies {
scientific_name: "Panthera leo".to_string(),
common_names: vec!["Lion".to_string()],
rank: "species".to_string(),
taxonomy: crate::species::Taxonomy {
kingdom: Some("Animalia".to_string()),
phylum: Some("Chordata".to_string()),
class: Some("Mammalia".to_string()),
order: Some("Carnivora".to_string()),
family: Some("Felidae".to_string()),
genus: Some("Panthera".to_string()),
division: None,
lineage: Vec::new(),
},
ids: crate::species::ExternalIds::default(),
genome: crate::species::GenomeStats::default(),
life_history: crate::species::LifeHistory {
extraction_version: crate::species::CURRENT_LIFE_HISTORY_VERSION,
..crate::species::LifeHistory::default()
},
description: Some("Big cat".to_string()),
wikipedia_extract: None,
wikipedia_url: None,
conservation_status: None,
iucn_status: None,
observations_count: None,
gbif_occurrences: None,
top_countries: Vec::new(),
distribution: crate::species::Distribution::default(),
images: Vec::new(),
};
let non_curated = UnifiedSpecies {
scientific_name: "Dicentrarchus labrax".to_string(),
common_names: vec!["European Seabass".to_string()],
rank: "species".to_string(),
taxonomy: crate::species::Taxonomy {
kingdom: Some("Animalia".to_string()),
phylum: Some("Chordata".to_string()),
class: Some("Actinopteri".to_string()),
order: Some("Perciformes".to_string()),
family: Some("Moronidae".to_string()),
genus: Some("Dicentrarchus".to_string()),
division: None,
lineage: Vec::new(),
},
ids: crate::species::ExternalIds::default(),
genome: crate::species::GenomeStats::default(),
life_history: crate::species::LifeHistory {
extraction_version: crate::species::CURRENT_LIFE_HISTORY_VERSION,
..crate::species::LifeHistory::default()
},
description: Some("Seabass".to_string()),
wikipedia_extract: None,
wikipedia_url: None,
conservation_status: None,
iucn_status: None,
observations_count: None,
gbif_occurrences: None,
top_countries: Vec::new(),
distribution: crate::species::Distribution::default(),
images: Vec::new(),
};
db.cache_rich_species(&curated).unwrap();
db.cache_rich_species(&non_curated).unwrap();
let panthera_hits = db.search_taxon_names("panthera", 20).unwrap();
assert!(panthera_hits
.iter()
.any(|row| row.scientific_name == "Panthera leo"));
assert!(panthera_hits
.iter()
.any(|row| row.scientific_name == "Panthera" && row.rank == "GENUS"));
let seabass_hits = db.search_taxon_names("seabass", 20).unwrap();
assert!(seabass_hits.is_empty());
}
#[test]
fn get_cached_species_names_returns_only_curated_rich_species_with_data() {
let db = LocalDatabase::open_in_memory().unwrap();
let lion = UnifiedSpecies {
scientific_name: "Panthera leo".to_string(),
common_names: vec!["Lion".to_string()],
rank: "species".to_string(),
taxonomy: crate::species::Taxonomy::default(),
ids: crate::species::ExternalIds::default(),
genome: crate::species::GenomeStats::default(),
life_history: crate::species::LifeHistory {
extraction_version: crate::species::CURRENT_LIFE_HISTORY_VERSION,
..crate::species::LifeHistory::default()
},
description: None,
wikipedia_extract: None,
wikipedia_url: None,
conservation_status: None,
iucn_status: None,
observations_count: None,
gbif_occurrences: None,
top_countries: Vec::new(),
distribution: crate::species::Distribution::default(),
images: Vec::new(),
};
let wolf = UnifiedSpecies {
scientific_name: "Canis lupus".to_string(),
common_names: vec!["Wolf".to_string()],
rank: "species".to_string(),
taxonomy: crate::species::Taxonomy::default(),
ids: crate::species::ExternalIds::default(),
genome: crate::species::GenomeStats::default(),
life_history: crate::species::LifeHistory {
extraction_version: crate::species::CURRENT_LIFE_HISTORY_VERSION,
..crate::species::LifeHistory::default()
},
description: None,
wikipedia_extract: None,
wikipedia_url: None,
conservation_status: None,
iucn_status: None,
observations_count: None,
gbif_occurrences: None,
top_countries: Vec::new(),
distribution: crate::species::Distribution::default(),
images: Vec::new(),
};
let tax_only_cat = UnifiedSpecies {
scientific_name: "Felis catus".to_string(),
common_names: vec!["Cat".to_string()],
rank: "species".to_string(),
taxonomy: crate::species::Taxonomy::default(),
ids: crate::species::ExternalIds::default(),
genome: crate::species::GenomeStats::default(),
life_history: crate::species::LifeHistory {
extraction_version: crate::species::CURRENT_LIFE_HISTORY_VERSION,
..crate::species::LifeHistory::default()
},
description: Some("Locally materialized from the offline taxonomy cache.".to_string()),
wikipedia_extract: None,
wikipedia_url: None,
conservation_status: None,
iucn_status: None,
observations_count: None,
gbif_occurrences: None,
top_countries: Vec::new(),
distribution: crate::species::Distribution::default(),
images: Vec::new(),
};
let non_curated = UnifiedSpecies {
scientific_name: "Dicentrarchus labrax".to_string(),
common_names: vec!["European seabass".to_string()],
rank: "species".to_string(),
taxonomy: crate::species::Taxonomy::default(),
ids: crate::species::ExternalIds::default(),
genome: crate::species::GenomeStats::default(),
life_history: crate::species::LifeHistory {
extraction_version: crate::species::CURRENT_LIFE_HISTORY_VERSION,
..crate::species::LifeHistory::default()
},
description: Some("Non-curated test species".to_string()),
wikipedia_extract: None,
wikipedia_url: None,
conservation_status: None,
iucn_status: None,
observations_count: None,
gbif_occurrences: None,
top_countries: Vec::new(),
distribution: crate::species::Distribution::default(),
images: Vec::new(),
};
let genus = UnifiedSpecies {
scientific_name: "Panthera".to_string(),
common_names: vec!["Panthera".to_string()],
rank: "genus".to_string(),
taxonomy: crate::species::Taxonomy::default(),
ids: crate::species::ExternalIds::default(),
genome: crate::species::GenomeStats::default(),
life_history: crate::species::LifeHistory {
extraction_version: crate::species::CURRENT_LIFE_HISTORY_VERSION,
..crate::species::LifeHistory::default()
},
description: None,
wikipedia_extract: None,
wikipedia_url: None,
conservation_status: None,
iucn_status: None,
observations_count: None,
gbif_occurrences: None,
top_countries: Vec::new(),
distribution: crate::species::Distribution::default(),
images: Vec::new(),
};
db.cache_species(&lion).unwrap();
db.cache_rich_species(&lion).unwrap();
db.cache_rich_species(&wolf).unwrap();
db.cache_rich_species(&tax_only_cat).unwrap();
db.cache_rich_species(&non_curated).unwrap();
db.cache_species(&genus).unwrap();
let names = db.get_cached_species_names(10).unwrap();
assert_eq!(names, vec!["Canis lupus", "Panthera leo"]);
}
}