Skip to main content

sqlite_graphrag/parsers/
mod.rs

1//! Input format parsers (timestamp, range validators).
2
3use chrono::DateTime;
4use unicode_normalization::UnicodeNormalization;
5
6/// Accepts a Unix epoch (integer >= 0) or RFC 3339 timestamp and returns the Unix epoch.
7pub fn parse_expected_updated_at(s: &str) -> Result<i64, String> {
8    if let Ok(secs) = s.parse::<i64>() {
9        if secs >= 0 {
10            return Ok(secs);
11        }
12    }
13    DateTime::parse_from_rfc3339(s)
14        .map(|dt| dt.timestamp())
15        .map_err(|e| {
16            format!(
17                "value must be a Unix epoch (integer >= 0) or RFC 3339 (e.g. 2026-04-19T12:00:00Z): {e}"
18            )
19        })
20}
21
/// Parses the `-k`/`--k` argument of `recall` and `hybrid-search`, accepting only `1..=4096`.
///
/// 4096 is the `sqlite-vec` knn limit. Without this check a larger value would surface a
/// leaky engine error such as `k value in knn query too large, provided 10000 and the limit
/// is 4096`; rejecting it here yields a clean Clap error before any database work.
///
/// # Errors
///
/// Returns a message when `s` does not parse as a `usize`, or when the parsed value is
/// outside the inclusive range `1..=4096`.
pub fn parse_k_range(s: &str) -> Result<usize, String> {
    match s.parse::<usize>() {
        Ok(k) if (1..=4096).contains(&k) => Ok(k),
        Ok(k) => Err(format!(
            "k must be between 1 and 4096 (inclusive); got {k}"
        )),
        Err(_) => Err(format!("'{s}' is not a valid non-negative integer")),
    }
}
38
/// Lenient boolean parser intended for Clap env var integration.
///
/// The input is lowercased and then compared against the usual shell conventions:
///
/// - truthy: `1`, `true`, `yes`, `on`
/// - falsy: `0`, `false`, `no`, `off`, and the empty string
///
/// # Errors
///
/// Any other input is rejected with a message that echoes the original value.
pub fn parse_bool_flexible(s: &str) -> Result<bool, String> {
    const TRUTHY: [&str; 4] = ["1", "true", "yes", "on"];
    const FALSY: [&str; 5] = ["0", "false", "no", "off", ""];

    let lowered = s.to_lowercase();
    let token = lowered.as_str();

    if TRUTHY.contains(&token) {
        Ok(true)
    } else if FALSY.contains(&token) {
        Ok(false)
    } else {
        Err(format!(
            "invalid boolean value '{s}': expected true/false/1/0/yes/no/on/off"
        ))
    }
}
53
#[cfg(test)]
mod tests {
    use super::*;

    // ---- parse_expected_updated_at ----

    #[test]
    fn accepts_unix_epoch() {
        assert_eq!(parse_expected_updated_at("1700000000"), Ok(1_700_000_000));
    }

    #[test]
    fn accepts_zero() {
        assert_eq!(parse_expected_updated_at("0"), Ok(0));
    }

    #[test]
    fn accepts_rfc_3339_utc() {
        // 2020-01-01T00:00:00Z is 1_577_836_800 seconds after the epoch.
        let parsed = parse_expected_updated_at("2020-01-01T00:00:00Z");
        assert_eq!(parsed, Ok(1_577_836_800));
    }

    #[test]
    fn accepts_rfc_3339_with_offset() {
        let parsed = parse_expected_updated_at("2026-04-19T12:00:00+00:00");
        assert!(parsed.is_ok());
    }

    #[test]
    fn rejects_invalid_string() {
        let parsed = parse_expected_updated_at("bananas");
        assert!(parsed.is_err());
    }

    #[test]
    fn rejects_negative() {
        let parsed = parse_expected_updated_at("-1");
        assert!(parsed.is_err());
    }

    #[test]
    fn error_message_mentions_format() {
        let message = parse_expected_updated_at("invalid").unwrap_err();
        let mentions_format = ["RFC 3339", "Unix epoch"]
            .iter()
            .any(|needle| message.contains(*needle));
        assert!(mentions_format);
    }

    // ---- parse_k_range ----

    #[test]
    fn k_accepts_valid_range_endpoints() {
        for (input, expected) in [("1", 1usize), ("4096", 4096), ("10", 10)] {
            assert_eq!(parse_k_range(input), Ok(expected));
        }
    }

    #[test]
    fn k_rejects_zero() {
        let message = parse_k_range("0").unwrap_err();
        assert!(message.contains("between 1 and 4096"));
    }

    #[test]
    fn k_rejects_above_limit() {
        let message = parse_k_range("10000").unwrap_err();
        assert!(message.contains("between 1 and 4096"));
    }

    #[test]
    fn k_rejects_non_integer() {
        let message = parse_k_range("abc").unwrap_err();
        assert!(message.contains("not a valid"));
    }

    #[test]
    fn k_rejects_negative() {
        // A leading minus sign already fails the `usize` parse, before the range check.
        let parsed = parse_k_range("-5");
        assert!(parsed.is_err());
    }

    // ---- parse_bool_flexible ----

    #[test]
    fn bool_flexible_truthy() {
        let truthy = ["1", "true", "True", "TRUE", "yes", "Yes", "on", "ON"];
        for v in truthy {
            assert_eq!(parse_bool_flexible(v), Ok(true), "should be true: {v}");
        }
    }

    #[test]
    fn bool_flexible_falsy() {
        let falsy = ["0", "false", "False", "FALSE", "no", "No", "off", "OFF", ""];
        for v in falsy {
            assert_eq!(parse_bool_flexible(v), Ok(false), "should be false: {v}");
        }
    }

    #[test]
    fn bool_flexible_rejects_invalid() {
        for v in ["banana", "2", "nope"] {
            assert!(parse_bool_flexible(v).is_err());
        }
    }
}
150
/// Well-known relation types: the 12 relations defined as of v1.0.0.
///
/// Entries are stored in their normalized form (lowercase, underscores).
/// A relation outside this list is still accepted; it only triggers a
/// `tracing::warn!` (see [`warn_if_non_canonical`]).
pub const CANONICAL_RELATIONS: &[&str] = &[
    "applies_to",
    "uses",
    "depends_on",
    "causes",
    "fixes",
    "contradicts",
    "supports",
    "follows",
    "related",
    "mentions",
    "replaces",
    "tracked_in",
];
168
169/// Returns `true` when the relation is one of the 12 canonical types.
170pub fn is_canonical_relation(s: &str) -> bool {
171    CANONICAL_RELATIONS.contains(&s)
172}
173
/// Brings a relation string into its normalized form.
///
/// The input is lowercased and every hyphen is turned into an underscore,
/// so `Depends-On` becomes `depends_on`. Nothing else is altered.
pub fn normalize_relation(s: &str) -> String {
    let lowered = s.to_lowercase();
    lowered
        .chars()
        .map(|ch| if ch == '-' { '_' } else { ch })
        .collect()
}
178
179/// Normalizes an entity name to kebab-case ASCII.
180///
181/// Applies NFKD decomposition, filters to ASCII (transliterating by dropping
182/// diacritical combining marks), lowercases, converts spaces and underscores
183/// to hyphens, collapses consecutive hyphens, and trims leading/trailing hyphens.
184///
185/// # Examples
186///
187/// ```
188/// use sqlite_graphrag::parsers::normalize_entity_name;
189///
190/// assert_eq!(normalize_entity_name("Danilo Aguiar"), "danilo-aguiar");
191/// assert_eq!(normalize_entity_name("CANONICAL_RELATIONS"), "canonical-relations");
192/// assert_eq!(normalize_entity_name("  hello  world  "), "hello-world");
193/// assert_eq!(normalize_entity_name("danilo-aguiar"), "danilo-aguiar"); // idempotent
194/// ```
195pub fn normalize_entity_name(s: &str) -> String {
196    // NFKD: decompose precomposed characters into base + combining marks.
197    // Then keep only ASCII characters, effectively stripping diacritics.
198    let ascii: String = s.nfkd().filter(|c| c.is_ascii()).collect();
199    // Lowercase, then replace spaces and underscores with hyphens.
200    let hyphenated = ascii.to_lowercase().replace([' ', '_'], "-");
201    // Collapse consecutive hyphens and trim from both ends.
202    let mut result = String::with_capacity(hyphenated.len());
203    let mut prev_was_hyphen = false;
204    for ch in hyphenated.chars() {
205        if ch == '-' {
206            if !prev_was_hyphen {
207                result.push('-');
208            }
209            prev_was_hyphen = true;
210        } else {
211            result.push(ch);
212            prev_was_hyphen = false;
213        }
214    }
215    result.trim_matches('-').to_string()
216}
217
/// Checks that an already-normalized relation matches `^[a-z][a-z0-9_]*$`.
///
/// # Errors
///
/// Returns a message when the string is empty, when its first byte is not an ASCII
/// lowercase letter, or when any byte is something other than an ASCII lowercase
/// letter, an ASCII digit or an underscore. The checks run in that order, so only
/// the first violation is reported.
pub fn validate_relation_format(s: &str) -> Result<(), String> {
    let is_allowed = |b: u8| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'_';

    match s.as_bytes().first() {
        None => Err("relation must not be empty".to_string()),
        Some(first) if !first.is_ascii_lowercase() => Err(format!(
            "relation must start with a lowercase letter, got '{s}'"
        )),
        Some(_) if s.bytes().any(|b| !is_allowed(b)) => Err(format!(
            "relation must contain only lowercase letters, digits and underscores, got '{s}'"
        )),
        Some(_) => Ok(()),
    }
}
238
239/// Emits a `tracing::warn!` when the relation is not in [`CANONICAL_RELATIONS`].
240pub fn warn_if_non_canonical(relation: &str) {
241    if !is_canonical_relation(relation) {
242        tracing::warn!(
243            relation,
244            "non-canonical relation accepted; consider using a well-known value"
245        );
246    }
247}
248
249/// Clap `value_parser` for `--relation`: normalizes and validates format.
250///
251/// Accepts any kebab-case or snake_case string. Non-canonical values are
252/// accepted at parse time; the warning is emitted at command execution.
253pub fn parse_relation(s: &str) -> Result<String, String> {
254    let normalized = normalize_relation(s);
255    validate_relation_format(&normalized)?;
256    Ok(normalized)
257}
258
#[cfg(test)]
mod relation_tests {
    use super::*;

    #[test]
    fn canonical_relations_all_valid() {
        for relation in CANONICAL_RELATIONS.iter().copied() {
            assert!(
                validate_relation_format(relation).is_ok(),
                "canonical relation '{relation}' should be valid"
            );
        }
    }

    #[test]
    fn normalize_converts_hyphens_and_uppercase() {
        let cases = [
            ("Depends-On", "depends_on"),
            ("TESTED-BY", "tested_by"),
            ("uses", "uses"),
        ];
        for (raw, expected) in cases {
            assert_eq!(normalize_relation(raw), expected);
        }
    }

    #[test]
    fn validate_rejects_empty() {
        let outcome = validate_relation_format("");
        assert!(outcome.is_err());
    }

    #[test]
    fn validate_rejects_digit_start() {
        let outcome = validate_relation_format("123abc");
        assert!(outcome.is_err());
    }

    #[test]
    fn validate_rejects_spaces() {
        let outcome = validate_relation_format("has spaces");
        assert!(outcome.is_err());
    }

    #[test]
    fn validate_accepts_custom_relations() {
        for relation in ["implements", "tested_by", "part_of", "blocks"] {
            assert!(validate_relation_format(relation).is_ok());
        }
    }

    #[test]
    fn parse_relation_normalizes_and_validates() {
        for (raw, expected) in [("Tested-By", "tested_by"), ("uses", "uses")] {
            assert_eq!(parse_relation(raw).unwrap(), expected);
        }
        assert!(parse_relation("").is_err());
    }

    #[test]
    fn is_canonical_detects_known() {
        for known in ["uses", "applies_to"] {
            assert!(is_canonical_relation(known));
        }
        for custom in ["implements", "blocks"] {
            assert!(!is_canonical_relation(custom));
        }
    }
}
318
#[cfg(test)]
mod entity_name_tests {
    use super::*;

    #[test]
    fn strips_diacritics_from_accented_name() {
        // An unaccented name only goes through lowercasing and separator handling.
        assert_eq!(normalize_entity_name("Danilo Aguiar"), "danilo-aguiar");
        // An accented name additionally loses its combining marks.
        assert_eq!(normalize_entity_name("João Aguiar"), "joao-aguiar");
    }

    #[test]
    fn strips_diacritics_unicode_accents() {
        // ã → a
        assert_eq!(normalize_entity_name("São Paulo"), "sao-paulo");
        // Ü → u, ë → e
        assert_eq!(normalize_entity_name("Ünit Tëst"), "unit-test");
        // é → e, ç → c
        assert_eq!(normalize_entity_name("Café Façade"), "cafe-facade");
    }

    #[test]
    fn converts_spaces_to_hyphens() {
        assert_eq!(normalize_entity_name("hello world"), "hello-world");
        assert_eq!(normalize_entity_name("  hello  world  "), "hello-world");
    }

    #[test]
    fn converts_underscores_to_hyphens() {
        assert_eq!(normalize_entity_name("hello_world"), "hello-world");
        assert_eq!(
            normalize_entity_name("CANONICAL_RELATIONS"),
            "canonical-relations"
        );
    }

    #[test]
    fn all_caps_becomes_lowercase_kebab() {
        assert_eq!(
            normalize_entity_name("CANONICAL_RELATIONS"),
            "canonical-relations"
        );
        assert_eq!(normalize_entity_name("MY_ENTITY_NAME"), "my-entity-name");
    }

    #[test]
    fn idempotent_on_already_normalized() {
        let name = "danilo-aguiar";
        assert_eq!(normalize_entity_name(name), name);
        let name2 = "canonical-relations";
        assert_eq!(normalize_entity_name(name2), name2);
    }

    #[test]
    fn collapses_consecutive_hyphens() {
        assert_eq!(normalize_entity_name("foo--bar"), "foo-bar");
        // A hyphen surrounded by spaces is a single separator run.
        assert_eq!(normalize_entity_name("foo - bar"), "foo-bar");
    }

    #[test]
    fn trims_leading_trailing_hyphens() {
        assert_eq!(normalize_entity_name("-foo-"), "foo");
        assert_eq!(normalize_entity_name("--hello--"), "hello");
    }

    #[test]
    fn empty_or_only_separators_returns_empty() {
        assert_eq!(normalize_entity_name(""), "");
        assert_eq!(normalize_entity_name("---"), "");
    }
}
384}