use tantivy::schema::{
FAST, Field, IndexRecordOption, NumericOptions, STORED, STRING, Schema, SchemaBuilder,
TextFieldIndexing, TextOptions,
};
/// Typed handles to every field of the index schema.
///
/// Each member is the tantivy `Field` returned when `DocumentSchema::build`
/// registers the field of the same name, so callers can address fields
/// without looking them up by string.
#[derive(Debug)]
pub struct DocumentSchema {
/// Text field `doc_type`, registered with `STRING | STORED | FAST`.
pub doc_type: Field,
/// `u64` field `chunk_id`, registered as indexed, stored and fast.
pub chunk_id: Field,
/// Text field `collection_name`, registered with `STRING | STORED | FAST`.
pub collection_name: Field,
/// Text field `source_path`, registered with `STRING | STORED`.
pub source_path: Field,
/// Stored text field `heading_context`, indexed with the `"default"`
/// tokenizer and `IndexRecordOption::WithFreqsAndPositions`.
pub heading_context: Field,
/// Stored text field `content`, indexed with the `"default"` tokenizer and
/// `IndexRecordOption::WithFreqsAndPositions`.
pub content: Field,
/// Text field `content_preview`, registered with `STORED` only.
pub content_preview: Field,
/// `u64` field `byte_start`, registered with `STORED` only.
pub byte_start: Field,
/// `u64` field `byte_end`, registered with `STORED` only.
pub byte_end: Field,
/// `u64` field `char_count`, registered with `STORED` only.
pub char_count: Field,
/// Text field `file_hash`, registered with `STRING | STORED`.
pub file_hash: Field,
/// `u64` field `indexed_at`, registered with `STORED | FAST`.
/// NOTE(review): presumably a timestamp; the unit is not visible here — confirm.
pub indexed_at: Field,
/// Text field `meta_key`, registered with `STRING | STORED | FAST`.
pub meta_key: Field,
/// `u64` field `meta_value`, registered with `STORED | FAST`.
pub meta_value: Field,
}
impl DocumentSchema {
    /// Builds the tantivy [`Schema`] together with the typed field handles.
    ///
    /// Fields are registered in a fixed order (`doc_type`, `chunk_id`,
    /// `collection_name`, `source_path`, `heading_context`, `content`,
    /// `content_preview`, `byte_start`, `byte_end`, `char_count`, `file_hash`,
    /// `indexed_at`, `meta_key`, `meta_value`). Field handles are assigned by
    /// the builder as fields are added, so this order must not be changed
    /// casually.
    ///
    /// # Returns
    ///
    /// A tuple of the finished [`Schema`] and a [`DocumentSchema`] holding the
    /// [`Field`] handle for each registered field.
    pub fn build() -> (Schema, Self) {
        let mut builder = SchemaBuilder::default();

        let doc_type = builder.add_text_field("doc_type", STRING | STORED | FAST);

        // The options are needed exactly once, so they are moved into the
        // builder instead of being bound to a local and cloned.
        let chunk_id = builder.add_u64_field(
            "chunk_id",
            NumericOptions::default()
                .set_indexed()
                .set_stored()
                .set_fast(),
        );

        let collection_name = builder.add_text_field("collection_name", STRING | STORED | FAST);
        let source_path = builder.add_text_field("source_path", STRING | STORED);

        // Stored text indexed with the "default" tokenizer, recording
        // frequencies and positions; shared by `heading_context` and `content`.
        let text_options = TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer("default")
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored();
        let heading_context = builder.add_text_field("heading_context", text_options.clone());
        // Last use of `text_options`: move it rather than cloning again.
        let content = builder.add_text_field("content", text_options);

        // Stored-only fields: retrievable from a hit, but not indexed.
        let content_preview = builder.add_text_field("content_preview", STORED);
        let byte_start = builder.add_u64_field("byte_start", STORED);
        let byte_end = builder.add_u64_field("byte_end", STORED);
        let char_count = builder.add_u64_field("char_count", STORED);

        let file_hash = builder.add_text_field("file_hash", STRING | STORED);
        let indexed_at = builder.add_u64_field("indexed_at", STORED | FAST);
        let meta_key = builder.add_text_field("meta_key", STRING | STORED | FAST);
        let meta_value = builder.add_u64_field("meta_value", STORED | FAST);

        let schema = builder.build();
        let document_schema = Self {
            doc_type,
            chunk_id,
            collection_name,
            source_path,
            heading_context,
            content,
            content_preview,
            byte_start,
            byte_end,
            char_count,
            file_hash,
            indexed_at,
            meta_key,
            meta_value,
        };

        (schema, document_schema)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks that `DocumentSchema::build` registers all 14 fields and that
    /// every handle stored in the struct is the one the schema resolves for
    /// the field of the same name.
    #[test]
    fn test_schema_build() {
        let (schema, fields) = DocumentSchema::build();

        // Pair each expected field name with the handle the struct exposes,
        // so a handle accidentally bound to the wrong field is caught as well
        // as a missing field.
        let expected = [
            ("doc_type", fields.doc_type),
            ("chunk_id", fields.chunk_id),
            ("collection_name", fields.collection_name),
            ("source_path", fields.source_path),
            ("heading_context", fields.heading_context),
            ("content", fields.content),
            ("content_preview", fields.content_preview),
            ("byte_start", fields.byte_start),
            ("byte_end", fields.byte_end),
            ("char_count", fields.char_count),
            ("file_hash", fields.file_hash),
            ("indexed_at", fields.indexed_at),
            ("meta_key", fields.meta_key),
            ("meta_value", fields.meta_value),
        ];

        for &(name, handle) in &expected {
            assert_eq!(
                schema.get_field(name).ok(),
                Some(handle),
                "field `{}` is missing from the schema or bound to the wrong handle",
                name
            );
        }

        // No extra fields beyond the ones listed above.
        assert_eq!(schema.fields().count(), 14);
        assert_eq!(expected.len(), 14);
    }
}