pub mod api;
use bincode::Error as BincodeError;
use serde::{Deserialize, Serialize};
use std::cmp::Reverse;
use std::collections::hash_map::DefaultHasher;
use std::convert::From;
use xorf::{Filter as XorfFilter, HashProxy, Xor8};
#[cfg(feature = "bin")]
use std::path::Path;
/// Identifying data kept for every indexed post.
///
/// It is the first element of each `PostFilter` entry and is what `search`
/// hands back for a matching post.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct PostId {
/// Post title. `score` tokenizes it and compares the tokens with the query terms.
pub title: String,
/// URL of the post. It is stored verbatim; nothing in this file interprets it.
pub url: String,
/// Extra metadata carried with the post.
/// NOTE(review): the format of this string is not defined in this file — confirm with the producer.
pub meta: String,
}
/// One index entry: a post's identifying data paired with the `Xor8` filter
/// that is queried for membership of search terms (see `Score`).
pub type PostFilter = (PostId, HashProxy<String, DefaultHasher, Xor8>);
/// The complete search index: one `PostFilter` per post.
pub type SearchIndex = Vec<PostFilter>;
pub use api::{BasicPost, Post, TinySearch};
/// Top-level shape of `tinysearch.toml` as deserialized by
/// `SearchSchema::load_from_file`: a single `schema` table.
#[cfg(feature = "bin")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchSchemaConfig {
/// Contents of the `[schema]` table.
pub schema: SearchSchema,
}
/// Names of the input fields the tool works with, grouped by role.
///
/// `validate` requires `indexed_fields` and `url_field` to be non-empty and
/// rejects any name that appears more than once across the three members.
#[cfg(feature = "bin")]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchSchema {
/// Field names to index (defaults to `title` and `body`).
/// NOTE(review): presumably the fields whose text becomes searchable — the indexing code is not in this file.
pub indexed_fields: Vec<String>,
/// Field names treated as metadata (empty by default).
/// NOTE(review): how these are stored is decided outside this file — confirm.
pub metadata_fields: Vec<String>,
/// Name of the field holding the post URL (defaults to `url`).
pub url_field: String,
}
#[cfg(feature = "bin")]
impl Default for SearchSchema {
    /// Builds the schema used when no `tinysearch.toml` is present:
    /// `title` and `body` are indexed, there are no metadata fields, and the
    /// URL is read from `url`.
    fn default() -> Self {
        let indexed_fields: Vec<String> = ["title", "body"]
            .iter()
            .map(|name| String::from(*name))
            .collect();
        Self {
            indexed_fields,
            metadata_fields: Vec::new(),
            url_field: String::from("url"),
        }
    }
}
#[cfg(feature = "bin")]
impl SearchSchema {
    /// Loads the schema from `tinysearch.toml` inside the directory `path`.
    ///
    /// Despite the method name, `path` is the *directory* expected to contain
    /// `tinysearch.toml`, not the file itself. When that file does not exist
    /// the default schema is returned.
    ///
    /// # Errors
    ///
    /// Returns a message when the file exists but cannot be read, cannot be
    /// parsed as a [`SearchSchemaConfig`], or does not pass [`Self::validate`].
    pub fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self, String> {
        let toml_path = path.as_ref().join("tinysearch.toml");
        if !toml_path.exists() {
            return Ok(Self::default());
        }
        let toml_content = std::fs::read_to_string(&toml_path)
            .map_err(|e| format!("Failed to read tinysearch.toml: {e}"))?;
        let config: SearchSchemaConfig = toml::from_str(&toml_content)
            .map_err(|e| format!("Failed to parse tinysearch.toml: {e}"))?;
        config.schema.validate()?;
        Ok(config.schema)
    }

    /// Checks that the schema is usable.
    ///
    /// # Errors
    ///
    /// Returns a message when `indexed_fields` is empty, when `url_field` is
    /// empty, or when a field name occurs more than once across
    /// `indexed_fields`, `metadata_fields` and `url_field`.
    pub fn validate(&self) -> Result<(), String> {
        if self.indexed_fields.is_empty() {
            return Err("indexed_fields cannot be empty".to_string());
        }
        if self.url_field.is_empty() {
            return Err("url_field cannot be empty".to_string());
        }
        // Walk the declared names lazily; no intermediate Vec is needed just
        // to detect the first repeated name.
        let declared = self
            .indexed_fields
            .iter()
            .chain(&self.metadata_fields)
            .chain(std::iter::once(&self.url_field));
        let mut seen = std::collections::HashSet::new();
        for field in declared {
            if !seen.insert(field.as_str()) {
                return Err(format!("Duplicate field definition: {field}"));
            }
        }
        Ok(())
    }

    /// Returns every field name of the schema as owned strings: the indexed
    /// fields, then the metadata fields, then `url_field` unless it is already
    /// among them.
    pub fn all_fields(&self) -> Vec<String> {
        // Clone element by element instead of cloning the whole metadata Vec
        // only to consume the copy.
        let mut fields: Vec<String> = self
            .indexed_fields
            .iter()
            .chain(&self.metadata_fields)
            .cloned()
            .collect();
        if !fields.contains(&self.url_field) {
            fields.push(self.url_field.clone());
        }
        fields
    }
}
/// Serializable container for a `SearchIndex`; `to_bytes` and `from_bytes`
/// convert it to and from its bincode representation.
#[derive(Serialize, Deserialize)]
pub struct Storage {
/// The index entries, one per post.
pub filters: SearchIndex,
}
impl From<SearchIndex> for Storage {
    /// Wraps an already-built index in a [`Storage`] without copying it.
    fn from(index: SearchIndex) -> Self {
        Self { filters: index }
    }
}
/// Something that can rate how well a list of search terms matches it.
pub trait Score {
/// Returns the score of `terms` against `self`. The implementation in this
/// file counts the terms the filter reports as contained.
fn score(&self, terms: &[String]) -> usize;
}
impl Score for HashProxy<String, DefaultHasher, Xor8> {
    /// Counts how many entries of `terms` this filter reports as contained.
    ///
    /// A term that occurs several times in `terms` is counted every time.
    fn score(&self, terms: &[String]) -> usize {
        terms
            .iter()
            .map(|term| usize::from(self.contains(term)))
            .sum()
    }
}
impl Storage {
    /// Encodes this storage with bincode.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by `bincode::serialize`.
    pub fn to_bytes(&self) -> Result<Vec<u8>, BincodeError> {
        bincode::serialize(self)
    }

    /// Decodes a storage from bytes produced by [`Storage::to_bytes`].
    ///
    /// The bytes are decoded as a bare `SearchIndex` and then wrapped; this
    /// relies on bincode writing a single-field struct exactly like the field
    /// itself, so both directions agree.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by `bincode::deserialize`.
    pub fn from_bytes(bytes: &[u8]) -> Result<Self, BincodeError> {
        bincode::deserialize::<SearchIndex>(bytes).map(|filters| Self { filters })
    }
}
/// The concrete filter type stored per post (same type as the second element of `PostFilter`).
pub type Filter = HashProxy<String, DefaultHasher, Xor8>;
/// Multiplier `score` applies to the number of query terms found in a post's title.
const TITLE_WEIGHT: usize = 3;
/// Rates one post against the search terms.
///
/// The result is `TITLE_WEIGHT` times the number of search terms that equal a
/// token of the post title, plus the number of search terms the post's filter
/// reports as contained. Both steps saturate instead of overflowing.
fn score(post_id: &PostId, search_terms: &[String], filter: &Filter) -> usize {
    let title_tokens = tokenize(&post_id.title);
    let title_hits = search_terms
        .iter()
        .filter(|term| title_tokens.iter().any(|token| token == *term))
        .count();
    let filter_hits = filter.score(search_terms);
    let weighted_title = TITLE_WEIGHT.saturating_mul(title_hits);
    weighted_title.saturating_add(filter_hits)
}
/// Splits `s` into lowercase, whitespace-separated tokens.
///
/// The whole input is lowercased first and then split on Unicode whitespace.
/// `split_whitespace` never yields empty substrings, so no extra emptiness
/// filter is required; blank or empty input produces an empty vector.
fn tokenize(s: &str) -> Vec<String> {
    s.to_lowercase()
        .split_whitespace()
        .map(str::to_owned)
        .collect()
}
/// Returns the posts of `index` that match `query`, best match first.
///
/// The query is tokenized, every post is rated with `score`, posts rated zero
/// are dropped, and at most `num_results` of the remaining posts are returned
/// in descending score order. The sort is stable, so posts with equal scores
/// keep their relative order from `index`.
pub fn search<'index>(
    index: &'index SearchIndex,
    query: &str,
    num_results: usize,
) -> Vec<&'index PostId> {
    let terms = tokenize(query);
    let mut ranked: Vec<(&PostId, usize)> = Vec::new();
    for (post_id, filter) in index {
        let relevance = score(post_id, &terms, filter);
        if relevance > 0 {
            ranked.push((post_id, relevance));
        }
    }
    ranked.sort_by_key(|&(_, relevance)| Reverse(relevance));
    ranked.truncate(num_results);
    ranked.into_iter().map(|(post_id, _)| post_id).collect()
}
// Tests for `SearchSchema` loading and validation. `panic!` and `unwrap` are
// allowed here because a failed setup step should abort the test immediately.
#[cfg(test)]
#[cfg(feature = "bin")]
#[allow(clippy::panic, clippy::unwrap_used)]
mod schema_tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates a temporary directory holding a `tinysearch.toml` with `contents`.
    /// The returned guard must stay alive for as long as the directory is used.
    fn dir_with_config(contents: &str) -> TempDir {
        let temp_dir = TempDir::new().unwrap();
        std::fs::write(temp_dir.path().join("tinysearch.toml"), contents).unwrap();
        temp_dir
    }

    #[test]
    fn test_default_schema() {
        let schema = SearchSchema::default();
        assert_eq!(schema.indexed_fields, vec!["title", "body"]);
        assert_eq!(schema.metadata_fields, Vec::<String>::new());
        assert_eq!(schema.url_field, "url");
        if let Err(e) = schema.validate() {
            panic!("Default schema validation failed: {e}");
        }
    }

    #[test]
    fn test_load_nonexistent_file() {
        // An empty directory has no tinysearch.toml, so the defaults apply.
        let temp_dir = TempDir::new().unwrap();
        let schema = SearchSchema::load_from_file(temp_dir.path()).unwrap();
        assert_eq!(schema.indexed_fields, vec!["title", "body"]);
        assert_eq!(schema.metadata_fields, Vec::<String>::new());
        assert_eq!(schema.url_field, "url");
    }

    #[test]
    fn test_load_valid_toml() {
        let toml_content = r#"
[schema]
indexed_fields = ["title", "description"]
metadata_fields = ["author", "date", "image_url"]
url_field = "permalink"
"#;
        let temp_dir = dir_with_config(toml_content);
        let schema = SearchSchema::load_from_file(temp_dir.path()).unwrap();
        assert_eq!(schema.indexed_fields, vec!["title", "description"]);
        assert_eq!(schema.metadata_fields, vec!["author", "date", "image_url"]);
        assert_eq!(schema.url_field, "permalink");
    }

    #[test]
    fn test_validation_empty_indexed_fields() {
        // No other rule is violated, so only the emptiness check can fail.
        let schema = SearchSchema {
            indexed_fields: vec![],
            metadata_fields: vec![],
            url_field: "url".to_string(),
        };
        assert_eq!(
            schema.validate(),
            Err("indexed_fields cannot be empty".to_string())
        );
    }

    #[test]
    fn test_validation_empty_url_field() {
        let schema = SearchSchema {
            indexed_fields: vec!["title".to_string()],
            metadata_fields: vec![],
            url_field: String::new(),
        };
        assert_eq!(
            schema.validate(),
            Err("url_field cannot be empty".to_string())
        );
    }

    #[test]
    fn test_validation_duplicate_fields() {
        let schema = SearchSchema {
            indexed_fields: vec!["title".to_string(), "body".to_string()],
            metadata_fields: vec!["title".to_string()],
            url_field: "url".to_string(),
        };
        assert_eq!(
            schema.validate(),
            Err("Duplicate field definition: title".to_string())
        );
    }

    #[test]
    fn test_all_fields_method() {
        let schema = SearchSchema {
            indexed_fields: vec!["title".to_string(), "body".to_string()],
            metadata_fields: vec!["author".to_string(), "date".to_string()],
            url_field: "permalink".to_string(),
        };
        let all_fields = schema.all_fields();
        for expected in ["title", "body", "author", "date", "permalink"] {
            assert!(
                all_fields.contains(&expected.to_string()),
                "missing field: {expected}"
            );
        }
        // Nothing beyond the five declared names may appear.
        assert_eq!(all_fields.len(), 5);
    }

    #[test]
    fn test_invalid_toml_format() {
        let temp_dir = dir_with_config("this is not valid toml [");
        let result = SearchSchema::load_from_file(temp_dir.path());
        assert!(result.is_err());
        assert!(result.unwrap_err().contains("Failed to parse"));
    }

    #[test]
    fn test_missing_schema_section() {
        let toml_content = r#"
[other]
value = "test"
"#;
        let temp_dir = dir_with_config(toml_content);
        let result = SearchSchema::load_from_file(temp_dir.path());
        assert!(result.is_err());
        // A missing `[schema]` table is reported by the TOML deserializer.
        assert!(result.unwrap_err().contains("Failed to parse"));
    }
}