Skip to main content

sqlite_graphrag/
entity_type.rs

//! Canonical entity type taxonomy used across extraction, storage and CLI.
//!
//! `EntityType` is the single source of truth for the 13 graph entity kinds.
//! It derives `clap::ValueEnum` so CLI flags can use it directly, and derives
//! `serde::Serialize` with `rename_all = "snake_case"`. `Deserialize` is
//! implemented by hand on top of `FromStr` (case-insensitive, with a
//! descriptive error), so JSON round-trips remain backward-compatible with
//! the pre-enum string format.
7
8use crate::errors::AppError;
9
10/// The 13 canonical graph entity classifications.
11///
12/// Values are serialized as lowercase strings (`"person"`, `"organization"`,
13/// etc.) matching the pre-enum wire format and the SQLite `type` column.
14#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, clap::ValueEnum)]
15#[serde(rename_all = "snake_case")]
16#[clap(rename_all = "snake_case")]
17pub enum EntityType {
18    Concept,
19    Date,
20    Dashboard,
21    Decision,
22    File,
23    Incident,
24    IssueTracker,
25    Location,
26    Memory,
27    Organization,
28    Person,
29    Project,
30    Tool,
31}
32
33impl EntityType {
34    /// Returns the canonical lowercase string representation stored in SQLite.
35    pub fn as_str(self) -> &'static str {
36        match self {
37            EntityType::Concept => "concept",
38            EntityType::Date => "date",
39            EntityType::Dashboard => "dashboard",
40            EntityType::Decision => "decision",
41            EntityType::File => "file",
42            EntityType::Incident => "incident",
43            EntityType::IssueTracker => "issue_tracker",
44            EntityType::Location => "location",
45            EntityType::Memory => "memory",
46            EntityType::Organization => "organization",
47            EntityType::Person => "person",
48            EntityType::Project => "project",
49            EntityType::Tool => "tool",
50        }
51    }
52
53    /// Maps an arbitrary type label to the closest canonical [`EntityType`],
54    /// never failing (GAP-SG-47).
55    ///
56    /// LLM extraction routinely emits type labels outside the 13 canonical
57    /// kinds (`platform`, `language`, `feature`, `framework`, ...). The old
58    /// parse path discarded those entities with a `WARN`, silently losing
59    /// legitimate graph nodes. This function PRESERVES them by folding each
60    /// label onto the nearest canonical kind. Anything it cannot place falls
61    /// back to [`EntityType::Concept`], the most general kind — so a label is
62    /// never dropped.
63    ///
64    /// Matching is case-insensitive and treats hyphens as underscores, so
65    /// `"Issue-Tracker"` resolves to [`EntityType::IssueTracker`].
66    pub fn map_to_canonical(s: &str) -> EntityType {
67        let key = s.trim().to_lowercase().replace('-', "_");
68        // Exact canonical (and case/hyphen-insensitive) match first.
69        if let Ok(et) = key.parse::<EntityType>() {
70            return et;
71        }
72        match key.as_str() {
73            // Concept-like: abstractions, technologies, capabilities, topics.
74            "platform" | "language" | "feature" | "framework" | "library" | "technology"
75            | "software" | "service" | "product" | "system" | "api" | "component" | "module"
76            | "package" | "dependency" | "protocol" | "standard" | "format" | "algorithm"
77            | "pattern" | "method" | "function" | "class" | "interface" | "command" | "flag"
78            | "option" | "config" | "setting" | "version" | "release" | "model" | "metric"
79            | "topic" | "skill" | "reference" | "note" | "feedback" | "url" | "link"
80            | "keyword" | "tag" | "category" => EntityType::Concept,
81            // File-like: documents, paths, code artifacts.
82            "document" | "doc" | "artifact" | "directory" | "folder" | "path" | "repository"
83            | "repo" | "codebase" | "script" => EntityType::File,
84            // Person-like roles.
85            "user" | "author" | "developer" | "maintainer" | "contributor" | "agent" | "owner"
86            | "assignee" => EntityType::Person,
87            // Organization-like collectives.
88            "company" | "org" | "vendor" | "group" | "team" | "department" | "institution" => {
89                EntityType::Organization
90            }
91            // Incident-like failures.
92            "bug" | "error" | "failure" | "outage" | "vulnerability" | "cve" | "regression"
93            | "defect" => EntityType::Incident,
94            // Decision-like records.
95            "adr" | "choice" | "policy" | "ruling" => EntityType::Decision,
96            // Date-like temporals.
97            "time" | "datetime" | "timestamp" | "day" | "month" | "year" | "deadline"
98            | "milestone" => EntityType::Date,
99            // Location-like places.
100            "city" | "country" | "region" | "place" | "address" | "site" => EntityType::Location,
101            // Issue-tracker-like.
102            "ticket" | "issue" | "jira" | "github_issue" | "pr" | "pull_request" => {
103                EntityType::IssueTracker
104            }
105            // Dashboard-like.
106            "panel" | "board" | "view" | "report" | "chart" => EntityType::Dashboard,
107            // Anything else: the most general canonical kind, never dropped.
108            _ => EntityType::Concept,
109        }
110    }
111}
112
113/// v1.1.1 (P7, Limitação 9): manual `Deserialize` that delegates to
114/// [`std::str::FromStr`], so EVERY JSON entry point (`--graph-stdin`,
115/// `--entities-file`, `--graph-file`, enrich payloads) rejects an invalid
116/// `entity_type` EARLY — before any embedding — with the full list of the 13
117/// valid values and the memory-type→entity-type hints (`reference`→`concept`,
118/// `document`→`file`, `user`→`person`), instead of serde's terse
119/// `unknown variant`. Also case-insensitive, matching the CLI parse path.
120impl<'de> serde::Deserialize<'de> for EntityType {
121    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
122    where
123        D: serde::Deserializer<'de>,
124    {
125        let s = String::deserialize(deserializer)?;
126        s.parse::<EntityType>().map_err(serde::de::Error::custom)
127    }
128}
129
130impl std::fmt::Display for EntityType {
131    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
132        f.write_str(self.as_str())
133    }
134}
135
136impl std::str::FromStr for EntityType {
137    type Err = AppError;
138
139    fn from_str(s: &str) -> Result<Self, Self::Err> {
140        match s.to_lowercase().as_str() {
141            "concept" => Ok(EntityType::Concept),
142            "date" => Ok(EntityType::Date),
143            "dashboard" => Ok(EntityType::Dashboard),
144            "decision" => Ok(EntityType::Decision),
145            "file" => Ok(EntityType::File),
146            "incident" => Ok(EntityType::Incident),
147            "issue_tracker" => Ok(EntityType::IssueTracker),
148            "location" => Ok(EntityType::Location),
149            "memory" => Ok(EntityType::Memory),
150            "organization" => Ok(EntityType::Organization),
151            "person" => Ok(EntityType::Person),
152            "project" => Ok(EntityType::Project),
153            "tool" => Ok(EntityType::Tool),
154            other => {
155                let hint = match other {
156                    "reference" | "skill" | "note" | "feedback" => Some("concept"),
157                    "document" => Some("file"),
158                    "user" => Some("person"),
159                    _ => None,
160                };
161                let msg = if let Some(suggested) = hint {
162                    format!(
163                        "invalid entity_type '{other}'; '{other}' is a MEMORY type, not an entity type. \
164                         Try '{suggested}' instead. Valid entity types: concept, date, dashboard, \
165                         decision, file, incident, issue_tracker, location, memory, organization, \
166                         person, project, tool"
167                    )
168                } else {
169                    format!(
170                        "invalid entity type: {other}; expected one of: concept, date, dashboard, \
171                         decision, file, incident, issue_tracker, location, memory, organization, \
172                         person, project, tool"
173                    )
174                };
175                Err(AppError::Validation(msg))
176            }
177        }
178    }
179}
180
181impl rusqlite::types::FromSql for EntityType {
182    fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
183        let s = String::column_result(value)?;
184        s.parse::<EntityType>().map_err(|e| {
185            rusqlite::types::FromSqlError::Other(Box::new(std::io::Error::other(e.to_string())))
186        })
187    }
188}
189
190impl rusqlite::types::ToSql for EntityType {
191    fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
192        Ok(rusqlite::types::ToSqlOutput::from(self.as_str()))
193    }
194}
195
#[cfg(test)]
mod tests {
    use super::*;

    /// Deserializes `json` into an [`EntityType`] through `serde_json`.
    fn from_json(json: &str) -> serde_json::Result<EntityType> {
        serde_json::from_str::<EntityType>(json)
    }

    /// Serializes `kind` to JSON text through `serde_json`.
    fn to_json(kind: EntityType) -> String {
        serde_json::to_string(&kind).unwrap()
    }

    // v1.1.1 (P7): serde delegates to FromStr, so an invalid entity_type is
    // rejected at the JSON boundary with the valid-values list and the hints.
    #[test]
    fn deserialize_invalid_entity_type_lists_valid_values_and_hint() {
        let msg = from_json("\"reference\"").unwrap_err().to_string();
        for needle in ["MEMORY type", "Try 'concept'", "issue_tracker"] {
            assert!(msg.contains(needle), "obtido: {msg}");
        }

        let msg = from_json("\"banana\"").unwrap_err().to_string();
        for needle in ["expected one of", "dashboard"] {
            assert!(msg.contains(needle), "obtido: {msg}");
        }
    }

    #[test]
    fn deserialize_valid_and_case_insensitive_entity_type() {
        assert_eq!(
            from_json("\"issue_tracker\"").unwrap(),
            EntityType::IssueTracker
        );
        // FromStr lowercases its input, so serde accepts mixed case too.
        assert_eq!(from_json("\"Tool\"").unwrap(), EntityType::Tool);
    }

    #[test]
    fn serialize_stays_snake_case() {
        assert_eq!(to_json(EntityType::IssueTracker), "\"issue_tracker\"");
    }

    #[test]
    fn from_str_lowercase_roundtrip() {
        let cases = [
            ("person", EntityType::Person),
            ("organization", EntityType::Organization),
            ("issue_tracker", EntityType::IssueTracker),
        ];
        for (text, expected) in cases {
            assert_eq!(text.parse::<EntityType>().unwrap(), expected);
        }
    }

    #[test]
    fn from_str_uppercase_is_case_insensitive() {
        let cases = [
            ("PERSON", EntityType::Person),
            ("Organization", EntityType::Organization),
        ];
        for (text, expected) in cases {
            assert_eq!(text.parse::<EntityType>().unwrap(), expected);
        }
    }

    #[test]
    fn from_str_invalid_returns_err() {
        let outcome = "invalid".parse::<EntityType>();
        assert!(outcome.is_err());
        let msg = outcome.unwrap_err().to_string();
        assert!(msg.contains("invalid entity type"));
    }

    #[test]
    fn as_str_returns_canonical_lowercase() {
        let cases = [
            (EntityType::Person, "person"),
            (EntityType::IssueTracker, "issue_tracker"),
        ];
        for (kind, expected) in cases {
            assert_eq!(kind.as_str(), expected);
        }
    }

    #[test]
    fn serde_json_serializes_as_lowercase_string() {
        let cases = [
            (EntityType::Person, "\"person\""),
            (EntityType::IssueTracker, "\"issue_tracker\""),
        ];
        for (kind, expected) in cases {
            let json = to_json(kind);
            assert_eq!(json, expected);
        }
    }

    #[test]
    fn serde_json_deserializes_from_lowercase_string() {
        let et: EntityType = from_json("\"person\"").unwrap();
        assert_eq!(et, EntityType::Person);
    }

    #[test]
    fn map_to_canonical_preserves_canonical_types() {
        let cases = [
            ("person", EntityType::Person),
            ("concept", EntityType::Concept),
            ("issue_tracker", EntityType::IssueTracker),
            // Hyphen + case variants normalize to the canonical kind.
            ("Issue-Tracker", EntityType::IssueTracker),
        ];
        for (label, expected) in cases {
            assert_eq!(EntityType::map_to_canonical(label), expected);
        }
    }

    #[test]
    fn map_to_canonical_folds_non_canonical_instead_of_discarding() {
        let cases = [
            // GAP-SG-47: platform/language/feature were previously DISCARDED.
            ("platform", EntityType::Concept),
            ("language", EntityType::Concept),
            ("feature", EntityType::Concept),
            // Role/collective folds.
            ("developer", EntityType::Person),
            ("company", EntityType::Organization),
            ("document", EntityType::File),
        ];
        for (label, expected) in cases {
            assert_eq!(EntityType::map_to_canonical(label), expected);
        }
    }

    #[test]
    fn map_to_canonical_unknown_falls_back_to_concept_never_dropped() {
        for label in ["totally-made-up-kind", ""] {
            assert_eq!(EntityType::map_to_canonical(label), EntityType::Concept);
        }
    }
}
334}