use std::sync::Arc;
use ahash::AHashMap;
use crate::analysis::analyzer::analyzer::Analyzer;
use crate::analysis::analyzer::per_field::PerFieldAnalyzer;
use crate::analysis::token::Token;
use crate::error::Result;
use crate::lexical::core::analyzed::{AnalyzedDocument, AnalyzedTerm};
use crate::lexical::core::document::Document;
use crate::lexical::core::field::FieldValue;
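/// Converts a raw [`Document`] into an [`AnalyzedDocument`] by running text
/// fields through the configured [`Analyzer`] and indexing scalar fields
/// (numbers, booleans, datetimes, geo points) as single exact terms.
///
/// A minimal usage sketch (the analyzer choice here is illustrative; any
/// [`Analyzer`] implementation works):
///
/// ```ignore
/// let parser = DocumentParser::new(Arc::new(StandardAnalyzer::new()?));
/// let analyzed = parser.parse(doc)?;
/// ```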
pub struct DocumentParser {
analyzer: Arc<dyn Analyzer>,
}
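// `dyn Analyzer` is not required to implement `Debug`, so this manual impl
// identifies the analyzer by name instead of printing its internals.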
impl std::fmt::Debug for DocumentParser {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("DocumentParser")
.field("analyzer", &self.analyzer.name())
.finish()
}
}
impl DocumentParser {
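    /// Creates a parser that analyzes text fields with `analyzer` (routed
    /// per field when a [`PerFieldAnalyzer`] is supplied).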
pub fn new(analyzer: Arc<dyn Analyzer>) -> Self {
DocumentParser { analyzer }
}
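    /// Analyzes every field of `doc`, producing searchable terms, stored
    /// field values, per-field lengths, and numeric point values.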
pub fn parse(&self, doc: Document) -> Result<AnalyzedDocument> {
let mut field_terms = AHashMap::new();
let mut stored_fields = AHashMap::new();
let mut point_values = AHashMap::new();
for (field_name, field) in &doc.fields {
match field.clone() {
FieldValue::Text(text) => {
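                    // Route by field name when a PerFieldAnalyzer is
                    // configured; otherwise use the shared analyzer.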
let tokens = if let Some(per_field) =
self.analyzer.as_any().downcast_ref::<PerFieldAnalyzer>()
{
per_field.analyze_field(field_name.as_str(), text.as_str())?
} else {
self.analyzer.analyze(text.as_str())?
};
let token_vec: Vec<Token> = tokens.collect();
let analyzed_terms = self.tokens_to_analyzed_terms(token_vec);
field_terms.insert(field_name.clone(), analyzed_terms);
                    stored_fields.insert(field_name.clone(), FieldValue::Text(text));
}
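                // The scalar arms below index each value as a single exact
                // term; numeric kinds additionally produce point values.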
FieldValue::Int64(num) => {
let text = num.to_string();
let analyzed_term = AnalyzedTerm {
term: text.clone(),
position: 0,
frequency: 1,
offset: (0, text.len()),
};
field_terms.insert(field_name.clone(), vec![analyzed_term]);
stored_fields.insert(field_name.clone(), FieldValue::Int64(num));
point_values.insert(field_name.clone(), vec![num as f64]);
}
FieldValue::Float64(num) => {
let text = num.to_string();
let analyzed_term = AnalyzedTerm {
term: text.clone(),
position: 0,
frequency: 1,
offset: (0, text.len()),
};
field_terms.insert(field_name.clone(), vec![analyzed_term]);
stored_fields.insert(field_name.clone(), FieldValue::Float64(num));
point_values.insert(field_name.clone(), vec![num]);
}
FieldValue::Bool(b) => {
let text = b.to_string();
let analyzed_term = AnalyzedTerm {
term: text.clone(),
position: 0,
frequency: 1,
offset: (0, text.len()),
};
field_terms.insert(field_name.clone(), vec![analyzed_term]);
stored_fields.insert(field_name.clone(), FieldValue::Bool(b));
}
                FieldValue::Bytes(data, mime) => {
                    stored_fields.insert(field_name.clone(), FieldValue::Bytes(data, mime));
                }
FieldValue::DateTime(dt) => {
let text = dt.to_rfc3339();
let analyzed_term = AnalyzedTerm {
term: text.clone(),
position: 0,
frequency: 1,
offset: (0, text.len()),
};
field_terms.insert(field_name.clone(), vec![analyzed_term]);
stored_fields.insert(field_name.clone(), FieldValue::DateTime(dt));
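                    // Point value: seconds since the Unix epoch, with the
                    // nanosecond component as a fraction.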
let ts = dt.timestamp() as f64
+ dt.timestamp_subsec_nanos() as f64 / 1_000_000_000.0;
point_values.insert(field_name.clone(), vec![ts]);
}
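                // Geo fields index a "lat,lon" term plus a two-dimensional
                // point value, latitude first.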
FieldValue::Geo(lat, lon) => {
let text = format!("{},{}", lat, lon);
let analyzed_term = AnalyzedTerm {
term: text.clone(),
position: 0,
frequency: 1,
offset: (0, text.len()),
};
field_terms.insert(field_name.clone(), vec![analyzed_term]);
stored_fields.insert(field_name.clone(), FieldValue::Geo(lat, lon));
point_values.insert(field_name.clone(), vec![lat, lon]);
}
                FieldValue::Vector(v) => {
                    stored_fields.insert(field_name.clone(), FieldValue::Vector(v));
                }
FieldValue::Null => {
stored_fields.insert(field_name.clone(), FieldValue::Null);
}
}
}
        // Field length is the total number of token occurrences (the sum of
        // term frequencies), not the distinct-term count, so scoring can
        // normalize by document length.
        let mut field_lengths = AHashMap::new();
        for (field_name, terms) in &field_terms {
            let length: u32 = terms.iter().map(|t| t.frequency).sum();
            field_lengths.insert(field_name.clone(), length);
        }
Ok(AnalyzedDocument {
field_terms,
stored_fields,
field_lengths,
point_values,
})
}
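    /// Aggregates raw tokens into one [`AnalyzedTerm`] per distinct term,
    /// keeping the first occurrence's position and offset and recording the
    /// total occurrence count as the frequency. Output order follows
    /// hash-map iteration and is unspecified.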
fn tokens_to_analyzed_terms(&self, tokens: Vec<Token>) -> Vec<AnalyzedTerm> {
type TermPositionMap = AHashMap<String, Vec<(u32, (usize, usize))>>;
let mut term_positions: TermPositionMap = AHashMap::new();
for token in tokens {
term_positions.entry(token.text.clone()).or_default().push((
token.position as u32,
(token.start_offset, token.end_offset),
));
}
term_positions
.into_iter()
.map(|(term, positions)| {
let frequency = positions.len() as u32;
                let (position, offset) = positions[0];
AnalyzedTerm {
term,
position,
frequency,
offset,
}
})
.collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::analysis::analyzer::keyword::KeywordAnalyzer;
use crate::analysis::analyzer::standard::StandardAnalyzer;
#[test]
fn test_basic_parsing() {
let parser = DocumentParser::new(Arc::new(StandardAnalyzer::new().unwrap()));
let doc = Document::builder()
.add_text("title", "Rust Programming")
.add_text("body", "Learn Rust")
.build();
let analyzed = parser.parse(doc).unwrap();
assert!(analyzed.field_terms.contains_key("title"));
assert!(analyzed.field_terms.contains_key("body"));
}
#[test]
fn test_per_field_analyzer() {
let per_field = PerFieldAnalyzer::new(Arc::new(StandardAnalyzer::new().unwrap()));
per_field.add_analyzer("id", Arc::new(KeywordAnalyzer::new()));
let parser = DocumentParser::new(Arc::new(per_field));
let doc = Document::builder()
.add_text("title", "Rust Programming")
.add_text("id", "BOOK-001")
.build();
let analyzed = parser.parse(doc).unwrap();
assert!(!analyzed.field_terms.get("title").unwrap().is_empty());
assert_eq!(analyzed.field_terms.get("id").unwrap().len(), 1);
assert_eq!(analyzed.field_terms.get("id").unwrap()[0].term, "BOOK-001"); }
#[test]
fn test_numeric_fields() {
let parser = DocumentParser::new(Arc::new(StandardAnalyzer::new().unwrap()));
let doc = Document::builder()
.add_text("title", "Test")
.add_integer("year", 2024)
.add_float("price", 19.99)
.add_boolean("active", true)
.build();
let analyzed = parser.parse(doc).unwrap();
assert!(analyzed.field_terms.contains_key("year"));
assert!(analyzed.field_terms.contains_key("price"));
assert!(analyzed.field_terms.contains_key("active"));
}
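    // A minimal sketch of frequency aggregation, assuming StandardAnalyzer
    // turns "rust rust rust" into three identical tokens; adjust the input
    // if the analyzer pipeline normalizes differently.
    #[test]
    fn test_term_frequency_aggregation() {
        let parser = DocumentParser::new(Arc::new(StandardAnalyzer::new().unwrap()));
        let doc = Document::builder()
            .add_text("body", "rust rust rust")
            .build();
        let analyzed = parser.parse(doc).unwrap();
        let terms = analyzed.field_terms.get("body").unwrap();
        // Duplicate tokens collapse into one AnalyzedTerm with frequency 3.
        assert_eq!(terms.len(), 1);
        assert_eq!(terms[0].frequency, 3);
    }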
}