Skip to main content

sqlite_graphrag/parsers/
mod.rs

//! Input format parsers and validators (timestamps, `k` range, booleans, relation names, entity names).
2
3use chrono::DateTime;
4use unicode_normalization::UnicodeNormalization;
5
6/// Accepts a Unix epoch (integer >= 0) or RFC 3339 timestamp and returns the Unix epoch.
7pub fn parse_expected_updated_at(s: &str) -> Result<i64, String> {
8    if let Ok(secs) = s.parse::<i64>() {
9        if secs >= 0 {
10            return Ok(secs);
11        }
12    }
13    DateTime::parse_from_rfc3339(s)
14        .map(|dt| dt.timestamp())
15        .map_err(|e| {
16            format!(
17                "value must be a Unix epoch (integer >= 0) or RFC 3339 (e.g. 2026-04-19T12:00:00Z): {e}"
18            )
19        })
20}
21
/// Validates `-k`/`--k` for `recall` and `hybrid-search` to the inclusive range `1..=4096`.
///
/// The upper bound matches the `sqlite-vec` knn limit; values above it would surface a leaky
/// engine error such as `k value in knn query too large, provided 10000 and the limit is 4096`.
/// Validating at parse time turns the failure into a clean Clap error before any database work.
///
/// # Errors
///
/// * Returns `k must be between 1 and 4096 (inclusive); got …` when the input is an integer
///   outside the range, including integers too large to fit in `usize`.
/// * Returns `'…' is not a valid non-negative integer` for anything else (empty input,
///   negative numbers, non-digits).
pub fn parse_k_range(s: &str) -> Result<usize, String> {
    let value = match s.parse::<usize>() {
        Ok(v) => v,
        // A digit string that overflows `usize` is still a valid non-negative integer;
        // report it as out of range rather than as malformed input.
        Err(e) if matches!(e.kind(), std::num::IntErrorKind::PosOverflow) => {
            return Err(format!(
                "k must be between 1 and 4096 (inclusive); got {s}"
            ));
        }
        Err(_) => return Err(format!("'{s}' is not a valid non-negative integer")),
    };
    if !(1..=4096).contains(&value) {
        return Err(format!(
            "k must be between 1 and 4096 (inclusive); got {value}"
        ));
    }
    Ok(value)
}
38
/// Lenient boolean parser intended for Clap arguments backed by environment variables.
///
/// The input is lowercased and compared against two fixed word lists:
///
/// * truthy: `1`, `true`, `yes`, `on`
/// * falsy: `0`, `false`, `no`, `off`, and the empty string
///
/// # Errors
///
/// Any other input yields an error message that echoes the original (not lowercased)
/// value and lists the accepted spellings.
pub fn parse_bool_flexible(s: &str) -> Result<bool, String> {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    const FALSY: [&str; 5] = ["0", "false", "no", "off", ""];

    let lowered = s.to_lowercase();
    let word = lowered.as_str();

    if TRUTHY.contains(&word) {
        return Ok(true);
    }
    if FALSY.contains(&word) {
        return Ok(false);
    }
    Err(format!(
        "invalid boolean value '{s}': expected true/false/1/0/yes/no/on/off"
    ))
}
53
/// Unit tests for the timestamp, `k` range, and flexible boolean parsers.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn accepts_unix_epoch() {
        assert_eq!(parse_expected_updated_at("1700000000").unwrap(), 1700000000);
    }

    #[test]
    fn accepts_zero() {
        assert_eq!(parse_expected_updated_at("0").unwrap(), 0);
    }

    #[test]
    fn accepts_rfc_3339_utc() {
        assert_eq!(
            parse_expected_updated_at("2020-01-01T00:00:00Z").unwrap(),
            1577836800
        );
    }

    #[test]
    fn accepts_rfc_3339_with_offset() {
        // 2026-04-19T12:00:00 UTC expressed with an explicit zero offset.
        assert_eq!(
            parse_expected_updated_at("2026-04-19T12:00:00+00:00").unwrap(),
            1776600000
        );
        // A non-zero offset must be converted to UTC: 12:00 at +03:00 is 09:00 UTC.
        assert_eq!(
            parse_expected_updated_at("2026-04-19T12:00:00+03:00").unwrap(),
            1776600000 - 3 * 3600
        );
    }

    #[test]
    fn rejects_invalid_string() {
        assert!(parse_expected_updated_at("bananas").is_err());
    }

    #[test]
    fn rejects_negative() {
        assert!(parse_expected_updated_at("-1").is_err());
    }

    #[test]
    fn error_message_mentions_format() {
        let msg = parse_expected_updated_at("invalid").unwrap_err();
        assert!(msg.contains("RFC 3339") || msg.contains("Unix epoch"));
    }

    #[test]
    fn k_accepts_valid_range_endpoints() {
        assert_eq!(parse_k_range("1").unwrap(), 1);
        assert_eq!(parse_k_range("4096").unwrap(), 4096);
        assert_eq!(parse_k_range("10").unwrap(), 10);
    }

    #[test]
    fn k_rejects_zero() {
        let msg = parse_k_range("0").unwrap_err();
        assert!(msg.contains("between 1 and 4096"));
    }

    #[test]
    fn k_rejects_above_limit() {
        let msg = parse_k_range("10000").unwrap_err();
        assert!(msg.contains("between 1 and 4096"));
    }

    #[test]
    fn k_rejects_non_integer() {
        let msg = parse_k_range("abc").unwrap_err();
        assert!(msg.contains("not a valid"));
    }

    #[test]
    fn k_rejects_negative() {
        // usize parser fails on negatives before range check
        assert!(parse_k_range("-5").is_err());
    }

    #[test]
    fn bool_flexible_truthy() {
        for v in &["1", "true", "True", "TRUE", "yes", "Yes", "on", "ON"] {
            assert!(parse_bool_flexible(v).unwrap(), "should be true: {v}");
        }
    }

    #[test]
    fn bool_flexible_falsy() {
        for v in &["0", "false", "False", "FALSE", "no", "No", "off", "OFF", ""] {
            assert!(!parse_bool_flexible(v).unwrap(), "should be false: {v}");
        }
    }

    #[test]
    fn bool_flexible_rejects_invalid() {
        assert!(parse_bool_flexible("banana").is_err());
        assert!(parse_bool_flexible("2").is_err());
        assert!(parse_bool_flexible("nope").is_err());
    }
}
150
/// The 12 well-known relation types from v1.0.0.
///
/// Relations outside this list are still accepted by the parsers in this module;
/// [`warn_if_non_canonical`] logs a `tracing::warn!` for them.
pub const CANONICAL_RELATIONS: &[&str] = &[
    "applies_to",
    "uses",
    "depends_on",
    "causes",
    "fixes",
    "contradicts",
    "supports",
    "follows",
    "related",
    "mentions",
    "replaces",
    "tracked_in",
];

/// Reports whether `s` is exactly one of the entries in [`CANONICAL_RELATIONS`].
///
/// The comparison is an exact string match: no normalization is applied here.
pub fn is_canonical_relation(s: &str) -> bool {
    CANONICAL_RELATIONS.iter().any(|known| *known == s)
}
173
/// Puts a relation string into its snake_case form.
///
/// The input is lowercased and every `-` is replaced by `_`; no other character is altered.
pub fn normalize_relation(s: &str) -> String {
    s.to_lowercase()
        .chars()
        .map(|ch| if ch == '-' { '_' } else { ch })
        .collect()
}
178
179/// Normalizes an entity name to kebab-case ASCII.
180///
181/// Applies NFKD decomposition, filters to ASCII (transliterating by dropping
182/// diacritical combining marks), lowercases, converts spaces and underscores
183/// to hyphens, collapses consecutive hyphens, and trims leading/trailing hyphens.
184///
185/// # Examples
186///
187/// ```
188/// use sqlite_graphrag::parsers::normalize_entity_name;
189///
190/// assert_eq!(normalize_entity_name("Danilo Aguiar"), "danilo-aguiar");
191/// assert_eq!(normalize_entity_name("CANONICAL_RELATIONS"), "canonical-relations");
192/// assert_eq!(normalize_entity_name("  hello  world  "), "hello-world");
193/// assert_eq!(normalize_entity_name("danilo-aguiar"), "danilo-aguiar"); // idempotent
194/// ```
195pub fn normalize_entity_name(s: &str) -> String {
196    // NFKD: decompose precomposed characters into base + combining marks.
197    // Then keep only ASCII characters, effectively stripping diacritics.
198    let ascii: String = s.nfkd().filter(|c| c.is_ascii()).collect();
199    // Lowercase, then replace spaces and underscores with hyphens.
200    let hyphenated: String = ascii
201        .to_lowercase()
202        .chars()
203        .map(|c| if c.is_ascii_alphanumeric() { c } else { '-' })
204        .collect();
205    // Collapse consecutive hyphens and trim from both ends.
206    let mut result = String::with_capacity(hyphenated.len());
207    let mut prev_was_hyphen = false;
208    for ch in hyphenated.chars() {
209        if ch == '-' {
210            if !prev_was_hyphen {
211                result.push('-');
212            }
213            prev_was_hyphen = true;
214        } else {
215            result.push(ch);
216            prev_was_hyphen = false;
217        }
218    }
219    result.trim_matches('-').to_string()
220}
221
/// Checks that a (normalized) relation matches `^[a-z][a-z0-9_]*$`.
///
/// # Errors
///
/// * `relation must not be empty` for an empty string.
/// * `relation must start with a lowercase letter, got '…'` when the first byte is
///   not an ASCII lowercase letter.
/// * `relation must contain only lowercase letters, digits and underscores, got '…'`
///   when any byte falls outside `[a-z0-9_]`.
pub fn validate_relation_format(s: &str) -> Result<(), String> {
    match s.bytes().next() {
        None => return Err("relation must not be empty".to_string()),
        Some(first) if !first.is_ascii_lowercase() => {
            return Err(format!(
                "relation must start with a lowercase letter, got '{s}'"
            ));
        }
        Some(_) => {}
    }

    let has_illegal_byte = s
        .bytes()
        .any(|b| !matches!(b, b'a'..=b'z' | b'0'..=b'9' | b'_'));
    if has_illegal_byte {
        return Err(format!(
            "relation must contain only lowercase letters, digits and underscores, got '{s}'"
        ));
    }

    Ok(())
}
242
243/// Emits a `tracing::warn!` when the relation is not in [`CANONICAL_RELATIONS`].
244pub fn warn_if_non_canonical(relation: &str) {
245    if !is_canonical_relation(relation) {
246        tracing::warn!(target: "parsers",
247            relation,
248            "non-canonical relation accepted; consider using a well-known value"
249        );
250    }
251}
252
253/// Clap `value_parser` for `--relation`: normalizes and validates format.
254///
255/// Accepts any kebab-case or snake_case string. Non-canonical values are
256/// accepted at parse time; the warning is emitted at command execution.
257pub fn parse_relation(s: &str) -> Result<String, String> {
258    let normalized = normalize_relation(s);
259    validate_relation_format(&normalized)?;
260    Ok(normalized)
261}
262
/// Unit tests for relation normalization, validation, and canonical lookup.
#[cfg(test)]
mod relation_tests {
    use super::*;

    #[test]
    fn canonical_relations_all_valid() {
        for r in CANONICAL_RELATIONS.iter() {
            let outcome = validate_relation_format(r);
            assert!(outcome.is_ok(), "canonical relation '{r}' should be valid");
        }
    }

    #[test]
    fn normalize_converts_hyphens_and_uppercase() {
        let cases = [
            ("Depends-On", "depends_on"),
            ("TESTED-BY", "tested_by"),
            ("uses", "uses"),
        ];
        for &(input, expected) in &cases {
            assert_eq!(normalize_relation(input), expected);
        }
    }

    #[test]
    fn validate_rejects_empty() {
        assert!(validate_relation_format("").is_err());
    }

    #[test]
    fn validate_rejects_digit_start() {
        assert!(validate_relation_format("123abc").is_err());
    }

    #[test]
    fn validate_rejects_spaces() {
        assert!(validate_relation_format("has spaces").is_err());
    }

    #[test]
    fn validate_accepts_custom_relations() {
        for &custom in &["implements", "tested_by", "part_of", "blocks"] {
            assert!(validate_relation_format(custom).is_ok());
        }
    }

    #[test]
    fn parse_relation_normalizes_and_validates() {
        assert_eq!(parse_relation("Tested-By").unwrap(), "tested_by");
        assert_eq!(parse_relation("uses").unwrap(), "uses");
        assert!(parse_relation("").is_err());
    }

    #[test]
    fn is_canonical_detects_known() {
        for &known in &["uses", "applies_to"] {
            assert!(is_canonical_relation(known));
        }
        for &custom in &["implements", "blocks"] {
            assert!(!is_canonical_relation(custom));
        }
    }
}
322
/// Unit tests for `normalize_entity_name`.
#[cfg(test)]
mod entity_name_tests {
    use super::*;

    #[test]
    fn strips_diacritics_from_accented_name() {
        // Plain ASCII baseline: only case and the space change.
        assert_eq!(normalize_entity_name("Danilo Aguiar"), "danilo-aguiar");
        // Accented spelling: é → e, Á → a.
        assert_eq!(normalize_entity_name("José Álvares"), "jose-alvares");
    }

    #[test]
    fn strips_diacritics_unicode_accents() {
        // ã → a
        assert_eq!(normalize_entity_name("São Paulo"), "sao-paulo");
        // Ü → u, ë → e
        assert_eq!(normalize_entity_name("Ünit Tëst"), "unit-test");
        // é → e
        assert_eq!(normalize_entity_name("café"), "cafe");
        // ç → c, ã → a
        assert_eq!(normalize_entity_name("Conceição"), "conceicao");
    }

    #[test]
    fn converts_spaces_to_hyphens() {
        assert_eq!(normalize_entity_name("hello world"), "hello-world");
        assert_eq!(normalize_entity_name("  hello  world  "), "hello-world");
    }

    #[test]
    fn converts_underscores_to_hyphens() {
        assert_eq!(normalize_entity_name("hello_world"), "hello-world");
        assert_eq!(
            normalize_entity_name("CANONICAL_RELATIONS"),
            "canonical-relations"
        );
    }

    #[test]
    fn all_caps_becomes_lowercase_kebab() {
        assert_eq!(
            normalize_entity_name("CANONICAL_RELATIONS"),
            "canonical-relations"
        );
        assert_eq!(normalize_entity_name("MY_ENTITY_NAME"), "my-entity-name");
    }

    #[test]
    fn idempotent_on_already_normalized() {
        let name = "danilo-aguiar";
        assert_eq!(normalize_entity_name(name), name);
        let name2 = "canonical-relations";
        assert_eq!(normalize_entity_name(name2), name2);
    }

    #[test]
    fn collapses_consecutive_hyphens() {
        assert_eq!(normalize_entity_name("foo--bar"), "foo-bar");
        assert_eq!(normalize_entity_name("foo - bar"), "foo-bar");
    }

    #[test]
    fn trims_leading_trailing_hyphens() {
        assert_eq!(normalize_entity_name("-foo-"), "foo");
        assert_eq!(normalize_entity_name("--hello--"), "hello");
    }

    #[test]
    fn empty_or_only_separators_returns_empty() {
        assert_eq!(normalize_entity_name(""), "");
        assert_eq!(normalize_entity_name("---"), "");
    }

    #[test]
    fn normalizes_dots_slashes_and_punctuation() {
        assert_eq!(normalize_entity_name("lei-14.478/2022"), "lei-14-478-2022");
        assert_eq!(normalize_entity_name("src/main.rs"), "src-main-rs");
        assert_eq!(normalize_entity_name("user@domain.com"), "user-domain-com");
        assert_eq!(normalize_entity_name("v1.0.66"), "v1-0-66");
        assert_eq!(normalize_entity_name("key:value"), "key-value");
    }
}