use super::*;
/// Stateless extractor of entities and relationships from free-text queries.
///
/// Extraction is attempted through an LLM first and falls back to a simple
/// rule-based heuristic when the LLM path returns an error.
pub struct EntityExtractor;
impl Default for EntityExtractor {
fn default() -> Self {
Self::new()
}
}
impl EntityExtractor {
pub fn new() -> Self {
Self
}
/// Extracts entities and relationships from `query`.
///
/// The LLM-backed extractor is tried first. If it returns an error, the error is logged
/// at warning level and the rule-based extractor is used instead.
///
/// # Errors
///
/// Returns an error only when the rule-based fallback itself fails.
pub async fn extract_entities_and_relationships(
    &self,
    query: &str,
) -> Result<(Vec<ExtractedEntity>, Vec<ExtractedRelationship>)> {
    match self.llm_extract_entities(query).await {
        Ok(extracted) => Ok(extracted),
        Err(err) => {
            // Include the cause so that a misconfigured or unreachable LLM can be
            // diagnosed from the logs instead of being silently masked by the fallback.
            warn!(
                "LLM extraction failed ({}), falling back to rule-based extraction",
                err
            );
            self.rule_based_extraction(query).await
        }
    }
}
/// Asks the LLM to extract entities and relationships from `query`.
///
/// Builds a JSON-oriented prompt around the query, sends it through a freshly created
/// `LLMManager` (default configuration) and hands the reply text to
/// `parse_extraction_response`.
///
/// # Errors
///
/// Returns an error when the manager cannot be created, when the LLM request fails, or
/// when the reply cannot be parsed.
async fn llm_extract_entities(
    &self,
    query: &str,
) -> Result<(Vec<ExtractedEntity>, Vec<ExtractedRelationship>)> {
    use crate::llm::{
        ChatMessage, ChatRole, LLMConfig, LLMManager, LLMRequest, Priority, UseCase,
    };

    // The advertised type list includes `Event` because the response parser maps that
    // label to `EntityType::Event`; without it the model is never told it may use it.
    // The raw string is kept flush-left on purpose: its leading whitespace is part of the prompt.
    let prompt = format!(
        r#"Extract entities and relationships from the following query. Return a JSON response with the following structure:
{{
"entities": [
{{
"text": "entity name",
"type": "Person|Organization|Location|Concept|Event|Other",
"confidence": 0.95
}}
],
"relationships": [
{{
"subject": "entity1",
"predicate": "relationship type",
"object": "entity2",
"confidence": 0.85
}}
]
}}
Query: "{query}"
Focus on:
- Named entities (people, places, organizations, concepts)
- Implicit relationships between entities
- Technical terms and domain-specific concepts
- Only extract explicit entities mentioned in the query
JSON Response:"#
    );

    let llm_config = LLMConfig::default();
    let mut llm_manager = LLMManager::new(llm_config)?;

    let chat_messages = vec![
        ChatMessage {
            role: ChatRole::System,
            content: "You are an expert at extracting entities and relationships from text. Always respond with valid JSON only.".to_string(),
            metadata: None,
        },
        ChatMessage {
            role: ChatRole::User,
            content: prompt,
            metadata: None,
        },
    ];

    let request = LLMRequest {
        messages: chat_messages,
        system_prompt: Some("Extract entities and relationships as JSON.".to_string()),
        use_case: UseCase::SimpleQuery,
        priority: Priority::Normal,
        max_tokens: Some(500),
        // Low temperature: the reply must be machine-parseable JSON.
        temperature: 0.1f32,
        timeout: Some(std::time::Duration::from_secs(15)),
    };

    let response = llm_manager.generate_response(request).await?;
    self.parse_extraction_response(&response.content)
}
/// Parses the raw LLM reply into entities and relationships.
///
/// The reply may be bare JSON or JSON wrapped in a Markdown code fence, with or without
/// a `json` language tag. After unwrapping, the text is parsed as JSON and the
/// `entities` / `relationships` arrays are read. Array items that lack a required field
/// (or carry it with the wrong JSON type) are skipped. Entity type labels other than
/// `Person`, `Organization`, `Location`, `Concept` and `Event` map to `EntityType::Other`.
/// Every relationship is tagged `RelationType::Other`.
///
/// # Errors
///
/// Returns an error when the unwrapped reply is not valid JSON.
fn parse_extraction_response(
    &self,
    response: &str,
) -> Result<(Vec<ExtractedEntity>, Vec<ExtractedRelationship>)> {
    // Each stripping step falls back to its *own* input. Falling back to the original
    // `response` would undo an earlier successful strip (e.g. an opening fence without a
    // closing one) and would also discard the initial trim.
    let trimmed = response.trim();
    let without_opening_fence = trimmed
        .strip_prefix("```json")
        .or_else(|| trimmed.strip_prefix("```"))
        .unwrap_or(trimmed);
    let cleaned_response = without_opening_fence
        .strip_suffix("```")
        .unwrap_or(without_opening_fence)
        .trim();

    let parsed: serde_json::Value = serde_json::from_str(cleaned_response)?;

    let entities: Vec<ExtractedEntity> = parsed
        .get("entities")
        .and_then(|e| e.as_array())
        .into_iter()
        .flatten()
        .filter_map(|entity_obj| {
            let text = entity_obj.get("text")?.as_str()?;
            let type_name = entity_obj.get("type")?.as_str()?;
            let confidence = entity_obj.get("confidence")?.as_f64()?;
            let entity_type = match type_name {
                "Person" => EntityType::Person,
                "Organization" => EntityType::Organization,
                "Location" => EntityType::Location,
                "Concept" => EntityType::Concept,
                "Event" => EntityType::Event,
                _ => EntityType::Other,
            };
            Some(ExtractedEntity {
                text: text.to_string(),
                entity_type,
                iri: None,
                confidence: confidence as f32,
                aliases: Vec::new(),
            })
        })
        .collect();

    let relationships: Vec<ExtractedRelationship> = parsed
        .get("relationships")
        .and_then(|r| r.as_array())
        .into_iter()
        .flatten()
        .filter_map(|rel_obj| {
            let subject = rel_obj.get("subject")?.as_str()?;
            let predicate = rel_obj.get("predicate")?.as_str()?;
            let object = rel_obj.get("object")?.as_str()?;
            let confidence = rel_obj.get("confidence")?.as_f64()?;
            Some(ExtractedRelationship {
                subject: subject.to_string(),
                predicate: predicate.to_string(),
                object: object.to_string(),
                confidence: confidence as f32,
                relation_type: RelationType::Other,
            })
        })
        .collect();

    debug!(
        "Extracted {} entities and {} relationships",
        entities.len(),
        relationships.len()
    );
    Ok((entities, relationships))
}
/// Heuristic extraction used when the LLM path is unavailable.
///
/// The query is split on whitespace and surrounding punctuation is trimmed from each
/// token (tokens that become empty are dropped). Then:
///
/// * a token that starts with an uppercase character, is longer than two bytes and is
///   not a stop word becomes an entity of type `EntityType::Other` with confidence `0.6`;
/// * a token equal (case-insensitively) to `is` or `has` that has a neighbour on both
///   sides becomes a relationship `previous -> token -> next` with confidence `0.5` and
///   type `RelationType::ConceptualRelation`.
///
/// # Errors
///
/// This implementation always returns `Ok`.
async fn rule_based_extraction(
    &self,
    query: &str,
) -> Result<(Vec<ExtractedEntity>, Vec<ExtractedRelationship>)> {
    // Punctuation glued to a token would otherwise defeat the stop-word check
    // ("What?" is not "what") and leak into entity text ("Microsoft?").
    // Leading '.'/'#'/'+' are deliberately NOT trimmed so that terms such as ".NET"
    // or "C++" survive intact.
    const LEADING_PUNCTUATION: &[char] = &['"', '\'', '(', '[', '{'];
    const TRAILING_PUNCTUATION: &[char] =
        &[',', '.', ';', ':', '!', '?', '"', '\'', ')', ']', '}'];

    let words: Vec<&str> = query
        .split_whitespace()
        .map(|token| {
            token
                .trim_start_matches(LEADING_PUNCTUATION)
                .trim_end_matches(TRAILING_PUNCTUATION)
        })
        .filter(|token| !token.is_empty())
        .collect();

    let mut entities = Vec::new();
    let mut relationships = Vec::new();

    for (i, word) in words.iter().enumerate() {
        // Lower-case once per token; it is needed by both checks below.
        let lowered = word.to_lowercase();

        let starts_uppercase = word.chars().next().is_some_and(char::is_uppercase);
        if starts_uppercase && word.len() > 2 && !self.is_stop_word(&lowered) {
            entities.push(ExtractedEntity {
                text: word.to_string(),
                entity_type: EntityType::Other,
                iri: None,
                confidence: 0.6,
                aliases: Vec::new(),
            });
        }

        // Only interior tokens have both a subject (left) and an object (right).
        let has_both_neighbours = i > 0 && i + 1 < words.len();
        if has_both_neighbours && matches!(lowered.as_str(), "is" | "has") {
            relationships.push(ExtractedRelationship {
                subject: words[i - 1].to_string(),
                predicate: word.to_string(),
                object: words[i + 1].to_string(),
                confidence: 0.5,
                relation_type: RelationType::ConceptualRelation,
            });
        }
    }

    debug!(
        "Rule-based extraction found {} entities and {} relationships",
        entities.len(),
        relationships.len()
    );
    Ok((entities, relationships))
}
/// Reports whether `word` is one of the English stop words ignored during
/// rule-based entity detection.
///
/// The comparison is exact, so callers are expected to pass an already lower-cased word.
fn is_stop_word(&self, word: &str) -> bool {
    const STOP_WORDS: &[&str] = &[
        // conjunctions and prepositions
        "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from",
        "up", "about", "into", "through", "during", "before", "after", "above", "below",
        "between", "among",
        // demonstratives
        "this", "that", "these", "those",
        // pronouns and possessives
        "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them", "my",
        "your", "his", "its", "our", "their",
        // auxiliaries and modals
        "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do",
        "does", "did", "will", "would", "could", "should", "may", "might", "must", "can",
        // interrogatives
        "what", "when", "where", "who", "why", "how", "which",
    ];
    STOP_WORDS.contains(&word)
}
}
/// Pairs a list of extracted entities with a list of extracted relationships.
///
/// NOTE(review): nothing in this file constructs or reads this type; judging by the name
/// it is presumably meant to hold the result of an LLM extraction — confirm against callers.
pub struct LLMEntityExtraction {
/// Entities found by the extraction.
pub entities: Vec<ExtractedEntity>,
/// Relationships found by the extraction.
pub relationships: Vec<ExtractedRelationship>,
}
use super::graph_traversal::{EntityType, ExtractedEntity, ExtractedRelationship, RelationType};