Skip to main content

sqlite_graphrag/parsers/
mod.rs

//! Input format parsers and normalizers (timestamps, `k` range, flexible booleans, relation labels, entity names).
2
3use chrono::DateTime;
4use unicode_normalization::UnicodeNormalization;
5
6/// Accepts a Unix epoch (integer >= 0) or RFC 3339 timestamp and returns the Unix epoch.
7pub fn parse_expected_updated_at(s: &str) -> Result<i64, String> {
8    if let Ok(secs) = s.parse::<i64>() {
9        if secs >= 0 {
10            return Ok(secs);
11        }
12    }
13    DateTime::parse_from_rfc3339(s)
14        .map(|dt| dt.timestamp())
15        .map_err(|e| {
16            format!(
17                "value must be a Unix epoch (integer >= 0) or RFC 3339 (e.g. 2026-04-19T12:00:00Z): {e}"
18            )
19        })
20}
21
/// Parses `-k`/`--k` for `recall` and `hybrid-search`, accepting only `1..=4096`.
///
/// The ceiling mirrors the `sqlite-vec` knn limit. Without this check a larger
/// value would surface a leaky engine error such as
/// `k value in knn query too large, provided 10000 and the limit is 4096`;
/// rejecting it at parse time yields a clean Clap error before any database work.
///
/// # Errors
///
/// * the input is not parseable as `usize` (this includes negative numbers);
/// * the parsed value lies outside `1..=4096`.
pub fn parse_k_range(s: &str) -> Result<usize, String> {
    const MIN_K: usize = 1;
    const MAX_K: usize = 4096;

    match s.parse::<usize>() {
        Ok(value) if (MIN_K..=MAX_K).contains(&value) => Ok(value),
        Ok(value) => Err(format!(
            "k must be between 1 and 4096 (inclusive); got {value}"
        )),
        Err(_) => Err(format!("'{s}' is not a valid non-negative integer")),
    }
}
38
/// Lenient boolean parser intended for Clap env var integration.
///
/// The input is lowercased and then compared against two fixed word lists:
///
/// * truthy: `1`, `true`, `yes`, `on`
/// * falsy: `0`, `false`, `no`, `off`, and the empty string
///
/// Matching is case-insensitive. Surrounding whitespace is not trimmed, so
/// `" true"` is rejected.
///
/// # Errors
///
/// Returns a message naming the accepted spellings when the input is in
/// neither list.
pub fn parse_bool_flexible(s: &str) -> Result<bool, String> {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    const FALSY: [&str; 5] = ["0", "false", "no", "off", ""];

    let lowered = s.to_lowercase();
    if TRUTHY.contains(&lowered.as_str()) {
        return Ok(true);
    }
    if FALSY.contains(&lowered.as_str()) {
        return Ok(false);
    }
    Err(format!(
        "invalid boolean value '{s}': expected true/false/1/0/yes/no/on/off"
    ))
}
53
#[cfg(test)]
mod tests {
    //! Unit tests for the timestamp, `k` range and flexible boolean parsers.

    use super::*;

    #[test]
    fn accepts_unix_epoch() {
        assert_eq!(parse_expected_updated_at("1700000000").unwrap(), 1700000000);
    }

    #[test]
    fn accepts_zero() {
        assert_eq!(parse_expected_updated_at("0").unwrap(), 0);
    }

    #[test]
    fn accepts_rfc_3339_utc() {
        let result = parse_expected_updated_at("2020-01-01T00:00:00Z");
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), 1577836800);
    }

    #[test]
    fn accepts_rfc_3339_with_offset() {
        // Pin the converted value, not just success, so a wrong conversion
        // cannot slip through.
        assert_eq!(
            parse_expected_updated_at("2026-04-19T12:00:00+00:00").unwrap(),
            1_776_600_000
        );
        // A non-zero offset must be folded back to the same UTC instant:
        // 01:00 at +01:00 is midnight UTC.
        assert_eq!(
            parse_expected_updated_at("2020-01-01T01:00:00+01:00").unwrap(),
            1_577_836_800
        );
    }

    #[test]
    fn rejects_invalid_string() {
        assert!(parse_expected_updated_at("bananas").is_err());
    }

    #[test]
    fn rejects_negative() {
        let result = parse_expected_updated_at("-1");
        assert!(result.is_err());
    }

    #[test]
    fn error_message_mentions_format() {
        // The message always names both accepted formats, so require both.
        let msg = parse_expected_updated_at("invalid").unwrap_err();
        assert!(msg.contains("RFC 3339"));
        assert!(msg.contains("Unix epoch"));
    }

    #[test]
    fn k_accepts_valid_range_endpoints() {
        assert_eq!(parse_k_range("1").unwrap(), 1);
        assert_eq!(parse_k_range("4096").unwrap(), 4096);
        assert_eq!(parse_k_range("10").unwrap(), 10);
    }

    #[test]
    fn k_rejects_zero() {
        let msg = parse_k_range("0").unwrap_err();
        assert!(msg.contains("between 1 and 4096"));
    }

    #[test]
    fn k_rejects_above_limit() {
        let msg = parse_k_range("10000").unwrap_err();
        assert!(msg.contains("between 1 and 4096"));
        // First value past the inclusive upper bound.
        let msg = parse_k_range("4097").unwrap_err();
        assert!(msg.contains("between 1 and 4096"));
    }

    #[test]
    fn k_rejects_non_integer() {
        let msg = parse_k_range("abc").unwrap_err();
        assert!(msg.contains("not a valid"));
    }

    #[test]
    fn k_rejects_negative() {
        // usize parser fails on negatives before range check
        assert!(parse_k_range("-5").is_err());
    }

    #[test]
    fn bool_flexible_truthy() {
        for v in &["1", "true", "True", "TRUE", "yes", "Yes", "on", "ON"] {
            assert!(parse_bool_flexible(v).unwrap(), "should be true: {v}");
        }
    }

    #[test]
    fn bool_flexible_falsy() {
        for v in &["0", "false", "False", "FALSE", "no", "No", "off", "OFF", ""] {
            assert!(!parse_bool_flexible(v).unwrap(), "should be false: {v}");
        }
    }

    #[test]
    fn bool_flexible_rejects_invalid() {
        assert!(parse_bool_flexible("banana").is_err());
        assert!(parse_bool_flexible("2").is_err());
        assert!(parse_bool_flexible("nope").is_err());
    }
}
150
/// Canonical relation vocabulary: the 12 well-known relation types from v1.0.0.
///
/// Every entry is already in normalized form (lowercase, underscores instead
/// of hyphens). Labels outside this list are still accepted by
/// [`parse_relation`] as long as they are well-formed;
/// [`warn_if_non_canonical`] emits a `tracing::warn!` for them.
pub const CANONICAL_RELATIONS: &[&str] = &[
    "applies_to",
    "uses",
    "depends_on",
    "causes",
    "fixes",
    "contradicts",
    "supports",
    "follows",
    "related",
    "mentions",
    "replaces",
    "tracked_in",
];
168
/// Returns `true` when the relation is one of the 12 canonical types.
///
/// The check is an exact, case-sensitive match against
/// [`CANONICAL_RELATIONS`]. `s` is not normalized here, so mixed-case or
/// hyphenated labels must go through [`normalize_relation`] first.
pub fn is_canonical_relation(s: &str) -> bool {
    CANONICAL_RELATIONS.contains(&s)
}
173
/// Brings a relation label into its normalized spelling.
///
/// The whole string is lowercased and every `-` is replaced by `_`; all other
/// characters are left untouched. No validation is performed.
pub fn normalize_relation(s: &str) -> String {
    s.to_lowercase()
        .chars()
        .map(|ch| if ch == '-' { '_' } else { ch })
        .collect()
}
178
179/// Normalizes an entity name to kebab-case ASCII.
180///
181/// Applies NFKD decomposition, filters to ASCII (transliterating by dropping
182/// diacritical combining marks), lowercases, converts spaces and underscores
183/// to hyphens, collapses consecutive hyphens, and trims leading/trailing hyphens.
184///
185/// # Examples
186///
187/// ```
188/// use sqlite_graphrag::parsers::normalize_entity_name;
189///
190/// assert_eq!(normalize_entity_name("Danilo Aguiar"), "danilo-aguiar");
191/// assert_eq!(normalize_entity_name("CANONICAL_RELATIONS"), "canonical-relations");
192/// assert_eq!(normalize_entity_name("  hello  world  "), "hello-world");
193/// assert_eq!(normalize_entity_name("danilo-aguiar"), "danilo-aguiar"); // idempotent
194/// ```
195pub fn normalize_entity_name(s: &str) -> String {
196    // NFKD: decompose precomposed characters into base + combining marks.
197    // Then keep only ASCII characters, effectively stripping diacritics.
198    let ascii: String = s.nfkd().filter(|c| c.is_ascii()).collect();
199    // Lowercase, then replace spaces and underscores with hyphens.
200    let hyphenated: String = ascii
201        .to_lowercase()
202        .chars()
203        .map(|c| if c.is_ascii_alphanumeric() { c } else { '-' })
204        .collect();
205    // Collapse consecutive hyphens and trim from both ends.
206    let mut result = String::with_capacity(hyphenated.len());
207    let mut prev_was_hyphen = false;
208    for ch in hyphenated.chars() {
209        if ch == '-' {
210            if !prev_was_hyphen {
211                result.push('-');
212            }
213            prev_was_hyphen = true;
214        } else {
215            result.push(ch);
216            prev_was_hyphen = false;
217        }
218    }
219    result.trim_matches('-').to_string()
220}
221
/// Checks that an already-normalized relation matches `^[a-z][a-z0-9_]*$`.
///
/// # Errors
///
/// * the string is empty;
/// * the first byte is not an ASCII lowercase letter;
/// * any byte is something other than an ASCII lowercase letter, an ASCII
///   digit or `_`.
pub fn validate_relation_format(s: &str) -> Result<(), String> {
    let mut bytes = s.bytes();

    let first = match bytes.next() {
        Some(byte) => byte,
        None => return Err("relation must not be empty".to_string()),
    };
    if !first.is_ascii_lowercase() {
        return Err(format!(
            "relation must start with a lowercase letter, got '{s}'"
        ));
    }

    // The first byte is already known to be a lowercase letter, so only the
    // remaining bytes need the wider character-class check.
    let rest_is_valid = bytes.all(|b| matches!(b, b'a'..=b'z' | b'0'..=b'9' | b'_'));
    if rest_is_valid {
        Ok(())
    } else {
        Err(format!(
            "relation must contain only lowercase letters, digits and underscores, got '{s}'"
        ))
    }
}
242
243/// Maps an arbitrary relation label to its canonical form, never producing a
244/// non-canonical value (GAP-SG-48).
245///
246/// Relation handling used to be inconsistent: non-canonical relations were
247/// accepted raw (with only a `WARN`) while non-canonical entity types were
248/// rejected outright. This unifies the policy — extraction never persists a
249/// label outside the canonical vocabulary. Known aliases are rewritten via a
250/// fixed table; values that are already canonical pass through unchanged;
251/// anything else falls back to the generic `related`.
252///
253/// Alias table (mirrors the project's canonical relation map):
254/// `adds`/`creates` → `causes`, `implements` → `supports`,
255/// `blocks` → `contradicts`, `tested_by` → `related`, `part_of` → `applies_to`.
256pub fn map_to_canonical_relation(s: &str) -> String {
257    let normalized = normalize_relation(s);
258    if is_canonical_relation(&normalized) {
259        return normalized;
260    }
261    match normalized.as_str() {
262        "adds" | "creates" => "causes",
263        "implements" => "supports",
264        "blocks" => "contradicts",
265        "tested_by" => "related",
266        "part_of" => "applies_to",
267        // Any other non-canonical relation folds onto the generic canonical
268        // kind rather than being persisted raw.
269        _ => "related",
270    }
271    .to_string()
272}
273
274/// Emits a `tracing::warn!` when the relation is not in [`CANONICAL_RELATIONS`].
275pub fn warn_if_non_canonical(relation: &str) {
276    if !is_canonical_relation(relation) {
277        tracing::warn!(target: "parsers",
278            relation,
279            "non-canonical relation accepted; consider using a well-known value"
280        );
281    }
282}
283
284/// Clap `value_parser` for `--relation`: normalizes and validates format.
285///
286/// Accepts any kebab-case or snake_case string. Non-canonical values are
287/// accepted at parse time; the warning is emitted at command execution.
288pub fn parse_relation(s: &str) -> Result<String, String> {
289    let normalized = normalize_relation(s);
290    validate_relation_format(&normalized)?;
291    Ok(normalized)
292}
293
#[cfg(test)]
mod relation_tests {
    //! Unit tests for relation normalization, validation and canonical mapping.

    use super::*;

    #[test]
    fn canonical_relations_all_valid() {
        for r in CANONICAL_RELATIONS.iter().copied() {
            let outcome = validate_relation_format(r);
            assert!(outcome.is_ok(), "canonical relation '{r}' should be valid");
        }
    }

    #[test]
    fn normalize_converts_hyphens_and_uppercase() {
        let cases = [
            ("Depends-On", "depends_on"),
            ("TESTED-BY", "tested_by"),
            ("uses", "uses"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_relation(input), expected);
        }
    }

    #[test]
    fn validate_rejects_empty() {
        let outcome = validate_relation_format("");
        assert!(outcome.is_err());
    }

    #[test]
    fn validate_rejects_digit_start() {
        let outcome = validate_relation_format("123abc");
        assert!(outcome.is_err());
    }

    #[test]
    fn validate_rejects_spaces() {
        let outcome = validate_relation_format("has spaces");
        assert!(outcome.is_err());
    }

    #[test]
    fn validate_accepts_custom_relations() {
        // Well-formed but non-canonical labels still pass format validation.
        for custom in ["implements", "tested_by", "part_of", "blocks"] {
            assert!(validate_relation_format(custom).is_ok());
        }
    }

    #[test]
    fn parse_relation_normalizes_and_validates() {
        assert_eq!(parse_relation("Tested-By").as_deref(), Ok("tested_by"));
        assert_eq!(parse_relation("uses").as_deref(), Ok("uses"));
        assert!(parse_relation("").is_err());
    }

    #[test]
    fn is_canonical_detects_known() {
        for known in ["uses", "applies_to"] {
            assert!(is_canonical_relation(known));
        }
        for unknown in ["implements", "blocks"] {
            assert!(!is_canonical_relation(unknown));
        }
    }

    #[test]
    fn map_to_canonical_relation_passes_through_canonical() {
        let cases = [
            ("uses", "uses"),
            ("Applies-To", "applies_to"),
            ("DEPENDS_ON", "depends_on"),
        ];
        for (input, expected) in cases {
            assert_eq!(map_to_canonical_relation(input), expected);
        }
    }

    #[test]
    fn map_to_canonical_relation_rewrites_known_aliases() {
        // GAP-SG-48: part-of was previously accepted raw with only a WARN.
        let cases = [
            ("part-of", "applies_to"),
            ("part_of", "applies_to"),
            ("implements", "supports"),
            ("blocks", "contradicts"),
            ("adds", "causes"),
            ("creates", "causes"),
            ("tested-by", "related"),
        ];
        for (alias, expected) in cases {
            assert_eq!(map_to_canonical_relation(alias), expected);
        }
    }

    #[test]
    fn map_to_canonical_relation_unknown_folds_to_related() {
        assert_eq!(map_to_canonical_relation("some-weird-relation"), "related");
        // Output is always itself canonical.
        let mapped = map_to_canonical_relation("xyz");
        assert!(is_canonical_relation(&mapped));
    }
}
379
#[cfg(test)]
mod entity_name_tests {
    //! Unit tests for `normalize_entity_name`.

    use super::*;

    #[test]
    fn strips_diacritics_from_accented_name() {
        // Unaccented baseline.
        assert_eq!(normalize_entity_name("Danilo Aguiar"), "danilo-aguiar");
        // Inputs that actually carry diacritics: é → e, Á → A, ç → c.
        assert_eq!(normalize_entity_name("José Álvarez"), "jose-alvarez");
        assert_eq!(normalize_entity_name("François"), "francois");
    }

    #[test]
    fn strips_diacritics_unicode_accents() {
        // ã → a, Ü → U, ë → e
        assert_eq!(normalize_entity_name("São Paulo"), "sao-paulo");
        assert_eq!(normalize_entity_name("Ünit Tëst"), "unit-test");
    }

    #[test]
    fn converts_spaces_to_hyphens() {
        assert_eq!(normalize_entity_name("hello world"), "hello-world");
        assert_eq!(normalize_entity_name("  hello  world  "), "hello-world");
    }

    #[test]
    fn converts_underscores_to_hyphens() {
        assert_eq!(normalize_entity_name("hello_world"), "hello-world");
        assert_eq!(
            normalize_entity_name("CANONICAL_RELATIONS"),
            "canonical-relations"
        );
    }

    #[test]
    fn all_caps_becomes_lowercase_kebab() {
        assert_eq!(
            normalize_entity_name("CANONICAL_RELATIONS"),
            "canonical-relations"
        );
        assert_eq!(normalize_entity_name("MY_ENTITY_NAME"), "my-entity-name");
    }

    #[test]
    fn idempotent_on_already_normalized() {
        let name = "danilo-aguiar";
        assert_eq!(normalize_entity_name(name), name);
        let name2 = "canonical-relations";
        assert_eq!(normalize_entity_name(name2), name2);
    }

    #[test]
    fn collapses_consecutive_hyphens() {
        assert_eq!(normalize_entity_name("foo--bar"), "foo-bar");
        assert_eq!(normalize_entity_name("foo - bar"), "foo-bar");
    }

    #[test]
    fn trims_leading_trailing_hyphens() {
        assert_eq!(normalize_entity_name("-foo-"), "foo");
        assert_eq!(normalize_entity_name("--hello--"), "hello");
    }

    #[test]
    fn empty_or_only_separators_returns_empty() {
        assert_eq!(normalize_entity_name(""), "");
        assert_eq!(normalize_entity_name("---"), "");
    }

    #[test]
    fn normalizes_dots_slashes_and_punctuation() {
        // Every non-alphanumeric ASCII character acts as a separator.
        assert_eq!(normalize_entity_name("lei-14.478/2022"), "lei-14-478-2022");
        assert_eq!(normalize_entity_name("src/main.rs"), "src-main-rs");
        assert_eq!(normalize_entity_name("user@domain.com"), "user-domain-com");
        assert_eq!(normalize_entity_name("v1.0.66"), "v1-0-66");
        assert_eq!(normalize_entity_name("key:value"), "key-value");
    }
}
454}