Skip to main content

sqlite_graphrag/
entity_type.rs

1//! Canonical entity type taxonomy used across extraction, storage and CLI.
2//!
3//! `EntityType` is the single source of truth for the 13 graph entity kinds.
4//! It derives `clap::ValueEnum` so CLI flags can use it directly, and derives
//! `serde::{Serialize, Deserialize}` with `rename_all = "snake_case"` so JSON
//! round-trips remain backward-compatible with the pre-enum string format.
7
8use crate::errors::AppError;
9
10/// The 13 canonical graph entity classifications.
11///
12/// Values are serialized as lowercase strings (`"person"`, `"organization"`,
13/// etc.) matching the pre-enum wire format and the SQLite `type` column.
14#[derive(
15    Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, clap::ValueEnum,
16)]
17#[serde(rename_all = "snake_case")]
18#[clap(rename_all = "snake_case")]
19pub enum EntityType {
20    Concept,
21    Date,
22    Dashboard,
23    Decision,
24    File,
25    Incident,
26    IssueTracker,
27    Location,
28    Memory,
29    Organization,
30    Person,
31    Project,
32    Tool,
33}
34
35impl EntityType {
36    /// Returns the canonical lowercase string representation stored in SQLite.
37    pub fn as_str(self) -> &'static str {
38        match self {
39            EntityType::Concept => "concept",
40            EntityType::Date => "date",
41            EntityType::Dashboard => "dashboard",
42            EntityType::Decision => "decision",
43            EntityType::File => "file",
44            EntityType::Incident => "incident",
45            EntityType::IssueTracker => "issue_tracker",
46            EntityType::Location => "location",
47            EntityType::Memory => "memory",
48            EntityType::Organization => "organization",
49            EntityType::Person => "person",
50            EntityType::Project => "project",
51            EntityType::Tool => "tool",
52        }
53    }
54
55    /// Maps an arbitrary type label to the closest canonical [`EntityType`],
56    /// never failing (GAP-SG-47).
57    ///
58    /// LLM extraction routinely emits type labels outside the 13 canonical
59    /// kinds (`platform`, `language`, `feature`, `framework`, ...). The old
60    /// parse path discarded those entities with a `WARN`, silently losing
61    /// legitimate graph nodes. This function PRESERVES them by folding each
62    /// label onto the nearest canonical kind. Anything it cannot place falls
63    /// back to [`EntityType::Concept`], the most general kind — so a label is
64    /// never dropped.
65    ///
66    /// Matching is case-insensitive and treats hyphens as underscores, so
67    /// `"Issue-Tracker"` resolves to [`EntityType::IssueTracker`].
68    pub fn map_to_canonical(s: &str) -> EntityType {
69        let key = s.trim().to_lowercase().replace('-', "_");
70        // Exact canonical (and case/hyphen-insensitive) match first.
71        if let Ok(et) = key.parse::<EntityType>() {
72            return et;
73        }
74        match key.as_str() {
75            // Concept-like: abstractions, technologies, capabilities, topics.
76            "platform" | "language" | "feature" | "framework" | "library" | "technology"
77            | "software" | "service" | "product" | "system" | "api" | "component" | "module"
78            | "package" | "dependency" | "protocol" | "standard" | "format" | "algorithm"
79            | "pattern" | "method" | "function" | "class" | "interface" | "command" | "flag"
80            | "option" | "config" | "setting" | "version" | "release" | "model" | "metric"
81            | "topic" | "skill" | "reference" | "note" | "feedback" | "url" | "link"
82            | "keyword" | "tag" | "category" => EntityType::Concept,
83            // File-like: documents, paths, code artifacts.
84            "document" | "doc" | "artifact" | "directory" | "folder" | "path" | "repository"
85            | "repo" | "codebase" | "script" => EntityType::File,
86            // Person-like roles.
87            "user" | "author" | "developer" | "maintainer" | "contributor" | "agent" | "owner"
88            | "assignee" => EntityType::Person,
89            // Organization-like collectives.
90            "company" | "org" | "vendor" | "group" | "team" | "department" | "institution" => {
91                EntityType::Organization
92            }
93            // Incident-like failures.
94            "bug" | "error" | "failure" | "outage" | "vulnerability" | "cve" | "regression"
95            | "defect" => EntityType::Incident,
96            // Decision-like records.
97            "adr" | "choice" | "policy" | "ruling" => EntityType::Decision,
98            // Date-like temporals.
99            "time" | "datetime" | "timestamp" | "day" | "month" | "year" | "deadline"
100            | "milestone" => EntityType::Date,
101            // Location-like places.
102            "city" | "country" | "region" | "place" | "address" | "site" => EntityType::Location,
103            // Issue-tracker-like.
104            "ticket" | "issue" | "jira" | "github_issue" | "pr" | "pull_request" => {
105                EntityType::IssueTracker
106            }
107            // Dashboard-like.
108            "panel" | "board" | "view" | "report" | "chart" => EntityType::Dashboard,
109            // Anything else: the most general canonical kind, never dropped.
110            _ => EntityType::Concept,
111        }
112    }
113}
114
115impl std::fmt::Display for EntityType {
116    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
117        f.write_str(self.as_str())
118    }
119}
120
121impl std::str::FromStr for EntityType {
122    type Err = AppError;
123
124    fn from_str(s: &str) -> Result<Self, Self::Err> {
125        match s.to_lowercase().as_str() {
126            "concept" => Ok(EntityType::Concept),
127            "date" => Ok(EntityType::Date),
128            "dashboard" => Ok(EntityType::Dashboard),
129            "decision" => Ok(EntityType::Decision),
130            "file" => Ok(EntityType::File),
131            "incident" => Ok(EntityType::Incident),
132            "issue_tracker" => Ok(EntityType::IssueTracker),
133            "location" => Ok(EntityType::Location),
134            "memory" => Ok(EntityType::Memory),
135            "organization" => Ok(EntityType::Organization),
136            "person" => Ok(EntityType::Person),
137            "project" => Ok(EntityType::Project),
138            "tool" => Ok(EntityType::Tool),
139            other => {
140                let hint = match other {
141                    "reference" | "skill" | "note" | "feedback" => Some("concept"),
142                    "document" => Some("file"),
143                    "user" => Some("person"),
144                    _ => None,
145                };
146                let msg = if let Some(suggested) = hint {
147                    format!(
148                        "invalid entity_type '{other}'; '{other}' is a MEMORY type, not an entity type. \
149                         Try '{suggested}' instead. Valid entity types: concept, date, dashboard, \
150                         decision, file, incident, issue_tracker, location, memory, organization, \
151                         person, project, tool"
152                    )
153                } else {
154                    format!(
155                        "invalid entity type: {other}; expected one of: concept, date, dashboard, \
156                         decision, file, incident, issue_tracker, location, memory, organization, \
157                         person, project, tool"
158                    )
159                };
160                Err(AppError::Validation(msg))
161            }
162        }
163    }
164}
165
166impl rusqlite::types::FromSql for EntityType {
167    fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
168        let s = String::column_result(value)?;
169        s.parse::<EntityType>().map_err(|e| {
170            rusqlite::types::FromSqlError::Other(Box::new(std::io::Error::other(e.to_string())))
171        })
172    }
173}
174
175impl rusqlite::types::ToSql for EntityType {
176    fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
177        Ok(rusqlite::types::ToSqlOutput::from(self.as_str()))
178    }
179}
180
#[cfg(test)]
mod tests {
    use super::*;

    /// Canonical lowercase names parse to their variants.
    #[test]
    fn from_str_lowercase_roundtrip() {
        let cases = [
            ("person", EntityType::Person),
            ("organization", EntityType::Organization),
            ("issue_tracker", EntityType::IssueTracker),
        ];
        for (label, expected) in cases {
            assert_eq!(label.parse::<EntityType>().unwrap(), expected);
        }
    }

    /// Letter case of the input does not matter.
    #[test]
    fn from_str_uppercase_is_case_insensitive() {
        let cases = [
            ("PERSON", EntityType::Person),
            ("Organization", EntityType::Organization),
        ];
        for (label, expected) in cases {
            assert_eq!(label.parse::<EntityType>().unwrap(), expected);
        }
    }

    /// Unknown names are rejected with a descriptive message.
    #[test]
    fn from_str_invalid_returns_err() {
        let outcome = "invalid".parse::<EntityType>();
        assert!(outcome.is_err());
        let text = outcome.unwrap_err().to_string();
        assert!(text.contains("invalid entity type"));
    }

    /// `as_str` yields the canonical lowercase names.
    #[test]
    fn as_str_returns_canonical_lowercase() {
        let cases = [
            (EntityType::Person, "person"),
            (EntityType::IssueTracker, "issue_tracker"),
        ];
        for (kind, expected) in cases {
            assert_eq!(kind.as_str(), expected);
        }
    }

    /// JSON output is the quoted canonical name.
    #[test]
    fn serde_json_serializes_as_lowercase_string() {
        let cases = [
            (EntityType::Person, "\"person\""),
            (EntityType::IssueTracker, "\"issue_tracker\""),
        ];
        for (kind, expected) in cases {
            let encoded = serde_json::to_string(&kind).unwrap();
            assert_eq!(encoded, expected);
        }
    }

    /// JSON input in the canonical form decodes to the variant.
    #[test]
    fn serde_json_deserializes_from_lowercase_string() {
        let decoded: EntityType = serde_json::from_str("\"person\"").unwrap();
        assert_eq!(decoded, EntityType::Person);
    }

    /// Canonical labels map to themselves, including hyphen and case variants.
    #[test]
    fn map_to_canonical_preserves_canonical_types() {
        let cases = [
            ("person", EntityType::Person),
            ("concept", EntityType::Concept),
            ("issue_tracker", EntityType::IssueTracker),
            // Hyphen + case variants normalize to the canonical kind.
            ("Issue-Tracker", EntityType::IssueTracker),
        ];
        for (label, expected) in cases {
            assert_eq!(EntityType::map_to_canonical(label), expected);
        }
    }

    /// GAP-SG-47: non-canonical labels are folded onto a canonical kind
    /// rather than being discarded.
    #[test]
    fn map_to_canonical_folds_non_canonical_instead_of_discarding() {
        let cases = [
            // platform/language/feature were previously DISCARDED.
            ("platform", EntityType::Concept),
            ("language", EntityType::Concept),
            ("feature", EntityType::Concept),
            // Role/collective folds.
            ("developer", EntityType::Person),
            ("company", EntityType::Organization),
            ("document", EntityType::File),
        ];
        for (label, expected) in cases {
            assert_eq!(EntityType::map_to_canonical(label), expected);
        }
    }

    /// Labels with no mapping, including the empty string, become `Concept`.
    #[test]
    fn map_to_canonical_unknown_falls_back_to_concept_never_dropped() {
        for label in ["totally-made-up-kind", ""] {
            assert_eq!(EntityType::map_to_canonical(label), EntityType::Concept);
        }
    }
}