kglite 0.10.22

Pure-Rust knowledge graph engine — Cypher pipeline, snapshot/working CoW transactions, columnar/mmap/disk storage backends, optional dataset loaders (SEC EDGAR, Sodir, Wikidata). PyO3 wrappers live in the sibling kglite-py crate (the Python wheel); embeddable directly from any Rust binary without PyO3 in the dep tree.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
//! N-Triples parser — line-level parsing and literal type inference.
//!
//! Triple AST (`Subject` / `Predicate` / `Object` / `EdgeBuffer`),
//! XSD literal → `Value` coercion, Wikidata Q-code helpers, and
//! language-filter utilities.

use crate::datatypes::values::Value;
use crate::graph::schema::InternedKey;
use crate::graph::storage::mapped::mmap_vec::MmapOrVec;
use std::collections::{HashMap, HashSet};

// ─── Parsed triple ──────────────────────────────────────────────────────────

/// The subject of a triple (only Wikidata Q-entities are interesting).
/// Borrows from the input line for zero-copy parsing.
pub(super) enum Subject<'a> {
    /// Subject IRI in the Wikidata entity namespace whose local name starts
    /// with `Q`. Holds the Q-code slice borrowed from the line, e.g. "Q42".
    Entity(&'a str),
    /// Any other subject IRI.
    Other,
}

/// The predicate of a triple. Borrows P-code from input line.
pub(super) enum Predicate<'a> {
    /// IRI in the Wikidata direct-property namespace whose local name starts
    /// with `P`. Holds the P-code slice, e.g. "P31".
    WikidataDirect(&'a str),
    /// `rdfs:label`.
    Label,
    /// `schema:description`.
    Description,
    /// `skos:altLabel`.
    AltLabel,
    /// `rdf:type`.
    Type,
    /// Any other predicate IRI.
    Other,
}

/// The object of a triple.
/// Entity and typed/lang metadata borrow from input line.
/// Literal values are owned (may contain escape sequences).
pub(super) enum Object<'a> {
    /// Wikidata entity IRI; holds the borrowed Q-code slice.
    Entity(&'a str),
    /// Plain literal; the value is owned because escapes have been processed.
    Literal(String),
    /// Language-tagged literal: (owned value, borrowed language tag).
    LangLiteral(String, &'a str),
    /// Datatyped literal: (owned value, borrowed datatype IRI).
    TypedLiteral(String, &'a str),
    /// Anything else (non-entity IRIs, unparseable objects).
    Other,
}

// ─── Line batch (cache-friendly transport from reader → loader) ────────────

/// A batch of N-triples lines packed into one contiguous buffer plus an
/// offset table. Replaces `Vec<String>` for the reader-thread → loader
/// channel: one allocation per batch instead of 200k separately-heap-
/// allocated `String` objects, and the loader iterates via byte-offset
/// math instead of pointer-chasing scattered heap addresses.
///
/// Each entry occupies `data[offsets[i]..offsets[i + 1]]` (or
/// `data[offsets.last()..data.len()]` for the final line). Lines retain
/// their trailing `\n`; `parse_line` already trims it.
///
/// Offsets are stored as `u32`, so a single batch can address at most
/// `u32::MAX` bytes of line data before the next line starts.
pub(super) struct LineBuffer {
    /// Bytes of every pushed line, concatenated in push order.
    pub(super) data: Vec<u8>,
    /// Start offset into `data` of each pushed line.
    pub(super) offsets: Vec<u32>,
}

impl LineBuffer {
    /// Creates an empty buffer with room for `line_cap` lines and
    /// `byte_cap` bytes of line data.
    pub(super) fn with_capacity(line_cap: usize, byte_cap: usize) -> Self {
        Self {
            data: Vec::with_capacity(byte_cap),
            offsets: Vec::with_capacity(line_cap),
        }
    }

    /// Returns `true` when no line has been pushed.
    pub(super) fn is_empty(&self) -> bool {
        self.offsets.is_empty()
    }

    /// Appends `line` (copied verbatim) as a new entry.
    ///
    /// # Panics
    ///
    /// Panics if the buffer already holds more than `u32::MAX` bytes, since
    /// the new line's start offset would not fit the offset table. The
    /// previous unchecked `as u32` cast wrapped silently and made `line`
    /// return the wrong bytes.
    pub(super) fn push_line(&mut self, line: &[u8]) {
        let start = self.data.len();
        assert!(
            start <= u32::MAX as usize,
            "LineBuffer holds more than u32::MAX bytes; flush the batch sooner"
        );
        self.data.extend_from_slice(line);
        // Lossless: range-checked by the assert above.
        self.offsets.push(start as u32);
    }

    /// Byte slice of line `i`. Caller must ensure `i < self.offsets.len()`.
    #[inline]
    pub(super) fn line(&self, i: usize) -> &[u8] {
        let start = self.offsets[i] as usize;
        // The last line has no successor offset; it runs to the end of `data`.
        let end = if i + 1 < self.offsets.len() {
            self.offsets[i + 1] as usize
        } else {
            self.data.len()
        };
        &self.data[start..end]
    }
}

// ─── Edge buffer ────────────────────────────────────────────────────────────

/// Edge buffer: String-based (default mode) or compact u32-based (mapped mode).
///
/// Both variants hold one entry per edge; `len` reports the entry count of
/// whichever variant is active.
pub(super) enum EdgeBuffer {
    /// Default mode: (source_qcode, target_qcode, predicate_label) — ~80 bytes each
    Strings(Vec<(String, String, String)>),
    /// Mapped mode: (source_qnum, target_qnum, interned_predicate) — 16 bytes each.
    /// File-backed (MmapOrVec) in disk mode to avoid holding ~14 GB in RAM.
    Compact(MmapOrVec<(u32, u32, InternedKey)>),
}

impl EdgeBuffer {
    pub(super) fn len(&self) -> usize {
        match self {
            Self::Strings(v) => v.len(),
            Self::Compact(v) => v.len(),
        }
    }
}

/// Parse "Q42" → 42. Returns None for non-Q-code or overflow.
///
/// The text after the `Q` must be a plain decimal number that fits in a
/// `u32`. A sign is rejected: "Q+42" is not a Q-code and must not alias Q42.
pub(super) fn parse_qcode_number(qcode: &str) -> Option<u32> {
    let digits = qcode.strip_prefix('Q')?;
    // `u32::from_str` tolerates a leading `+`, which a Q-code never has.
    // Every other non-digit character (including `-`) already fails the parse.
    if digits.starts_with('+') {
        return None;
    }
    digits.parse::<u32>().ok()
}

// ─── Accumulated entity ─────────────────────────────────────────────────────

/// Collects all triples for one subject entity before flushing to graph.
pub(super) struct EntityAccumulator {
    /// Q-code of the subject entity.
    pub(super) id: String,
    /// Label text, if one has been recorded for this entity.
    pub(super) label: Option<String>,
    /// Description text, if one has been recorded for this entity.
    pub(super) description: Option<String>,
    /// P31 target, e.g. "Q5".
    pub(super) type_qcode: Option<String>,
    /// Property values keyed by a string key (presumably the predicate
    /// code or label; the key scheme is chosen by the loader, not here).
    pub(super) properties: HashMap<String, Value>,
    /// Outgoing edges as (predicate, target Q-code).
    pub(super) outgoing_edges: Vec<(String, String)>,
}

impl EntityAccumulator {
    /// Starts an empty accumulator for the entity with Q-code `id`.
    ///
    /// The property map and edge list are pre-sized (32 and 8 entries) for a
    /// typical Wikidata entity, which has 10-30 properties and a handful of
    /// outgoing edges. This avoids the `RawVecInner::finish_grow` reallocs
    /// the loader profile showed at ~2% of total CPU.
    pub(super) fn new(id: String) -> Self {
        let properties = HashMap::with_capacity(32);
        let outgoing_edges = Vec::with_capacity(8);
        Self {
            id,
            label: None,
            description: None,
            type_qcode: None,
            properties,
            outgoing_edges,
        }
    }
}

// ─── Line parser ────────────────────────────────────────────────────────────

// IRI constants matched against the text between `<` and `>`.
// The first two are namespace prefixes (the Q-/P-code follows them); the
// rest are compared against the full predicate IRI.

/// Namespace prefix of Wikidata entity IRIs; the remainder is the Q-code.
const WD_ENTITY: &str = "http://www.wikidata.org/entity/";
/// Namespace prefix of Wikidata direct-property IRIs; the remainder is the P-code.
const WD_PROP_DIRECT: &str = "http://www.wikidata.org/prop/direct/";
/// Full IRI of `rdfs:label`.
const RDFS_LABEL: &str = "http://www.w3.org/2000/01/rdf-schema#label";
/// Full IRI of `schema:description`.
const SCHEMA_DESC: &str = "http://schema.org/description";
/// Full IRI of `skos:altLabel`.
const SKOS_ALT: &str = "http://www.w3.org/2004/02/skos/core#altLabel";
/// Full IRI of `rdf:type`.
const RDF_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";

/// Parse a single N-Triples line into (subject, predicate, object).
///
/// Subject and predicate codes, and the object's Q-code / language tag /
/// datatype IRI, are slices borrowed from `line`. Literal object values are
/// unescaped into an owned `String` by `parse_object`, so a literal object
/// costs one allocation.
///
/// Returns `None` when the line is blank, does not start with `<` (this
/// covers `#` comments), lacks the `"> "` separator after the subject or
/// predicate, has a predicate that does not start with `<`, does not end
/// with `.`, or has an empty object section.
///
/// Hot-path implementation: scans at the byte level using `memchr` to
/// locate URI boundaries (`>`), avoiding `str::find("> ")`'s
/// `TwoWaySearcher` setup which showed up at ~10% of loader CPU under
/// `samply`. N-triples URIs are guaranteed not to contain `>`, so a
/// single `memchr(b'>')` is sufficient — no two-byte needle search.
pub(super) fn parse_line(line: &str) -> Option<(Subject<'_>, Predicate<'_>, Object<'_>)> {
    // Manual byte-level trim of ASCII whitespace on both ends (the author
    // chose this over `str::trim()` for hot-path cost).
    let bytes = line.as_bytes();
    let mut start = 0;
    while start < bytes.len() && bytes[start].is_ascii_whitespace() {
        start += 1;
    }
    let mut end = bytes.len();
    while end > start && bytes[end - 1].is_ascii_whitespace() {
        end -= 1;
    }
    if start >= end {
        return None;
    }
    // From here on `bytes` is the trimmed line and all offsets are relative to it.
    let bytes = &bytes[start..end];

    // Skip comments + non-URI lines fast. (The `#` test is subsumed by the
    // `!= b'<'` test; it is kept as written.)
    if bytes[0] == b'#' || bytes[0] != b'<' {
        return None;
    }

    // Subject URI: find closing `>` at any offset >= 1. URIs in N-triples
    // cannot contain `>`, so a single memchr is correct without the
    // two-byte "> " search the previous version used.
    let subj_end = 1 + memchr::memchr(b'>', &bytes[1..])?;
    if bytes.get(subj_end + 1) != Some(&b' ') {
        return None;
    }
    // SAFETY: `bytes` is a sub-slice of a valid `&str`. Index 1 follows the
    // ASCII `<` at index 0 and `subj_end` is the index of an ASCII `>`.
    // ASCII bytes never occur inside a multi-byte UTF-8 sequence, so both
    // ends are char boundaries and the range is valid UTF-8.
    let subj_uri = unsafe { std::str::from_utf8_unchecked(&bytes[1..subj_end]) };
    // Only entity IRIs whose local name starts with `Q` are kept.
    let subject = if let Some(qcode) = subj_uri.strip_prefix(WD_ENTITY) {
        if qcode.starts_with('Q') {
            Subject::Entity(qcode)
        } else {
            Subject::Other
        }
    } else {
        Subject::Other
    };

    // Predicate URI starts at subj_end + 2 ("> ").
    let pred_start = subj_end + 2;
    if bytes.get(pred_start) != Some(&b'<') {
        return None;
    }
    let pred_end_rel = memchr::memchr(b'>', &bytes[pred_start + 1..])?;
    let pred_end = pred_start + 1 + pred_end_rel;
    if bytes.get(pred_end + 1) != Some(&b' ') {
        return None;
    }
    // SAFETY: same argument as for the subject. The range starts right
    // after the ASCII `<` at `pred_start` and ends at the ASCII `>` at
    // `pred_end`, so it lies on char boundaries of valid UTF-8.
    let pred_uri = unsafe { std::str::from_utf8_unchecked(&bytes[pred_start + 1..pred_end]) };
    // Direct-property prefix first, then the four exact-match predicates.
    let predicate = if let Some(pcode) = pred_uri.strip_prefix(WD_PROP_DIRECT) {
        if pcode.starts_with('P') {
            Predicate::WikidataDirect(pcode)
        } else {
            Predicate::Other
        }
    } else if pred_uri == RDFS_LABEL {
        Predicate::Label
    } else if pred_uri == SCHEMA_DESC {
        Predicate::Description
    } else if pred_uri == SKOS_ALT {
        Predicate::AltLabel
    } else if pred_uri == RDF_TYPE {
        Predicate::Type
    } else {
        Predicate::Other
    };

    // Object section — strip trailing whitespace + final `.`.
    // (`bytes` is already trimmed, so this first loop does not move
    // `obj_end`; the second loop drops whitespace between object and `.`.)
    let mut obj_end = bytes.len();
    while obj_end > 0 && bytes[obj_end - 1].is_ascii_whitespace() {
        obj_end -= 1;
    }
    if obj_end == 0 || bytes[obj_end - 1] != b'.' {
        return None;
    }
    obj_end -= 1;
    while obj_end > 0 && bytes[obj_end - 1].is_ascii_whitespace() {
        obj_end -= 1;
    }
    // Object starts after the "> " that closes the predicate.
    let obj_start = pred_end + 2;
    if obj_start >= obj_end {
        return None;
    }
    // The object portion may include UTF-8 inside a quoted literal, so
    // we use checked from_utf8 here (parse_object expects &str).
    let obj_str = std::str::from_utf8(&bytes[obj_start..obj_end]).ok()?;
    let object = parse_object(obj_str);

    Some((subject, predicate, object))
}

/// Classify the object portion of an N-Triples line.
///
/// * `<…>`: `Object::Entity` with the borrowed Q-code when the IRI is in the
///   Wikidata entity namespace and its local name starts with `Q`;
///   otherwise `Object::Other`.
/// * `"…"`: `Object::Literal`, `Object::LangLiteral` (suffix `@lang`) or
///   `Object::TypedLiteral` (suffix `^^<type>`). The value is owned; the
///   language tag and datatype IRI borrow from `s`. Any other suffix yields a
///   plain `Object::Literal`.
/// * Anything else, or a quoted string that cannot be extracted:
///   `Object::Other`.
pub(super) fn parse_object<'a>(s: &'a str) -> Object<'a> {
    // IRI object.
    if s.starts_with('<') {
        let uri = s.trim_start_matches('<').trim_end_matches('>');
        return match uri.strip_prefix(WD_ENTITY) {
            Some(qcode) if qcode.starts_with('Q') => Object::Entity(qcode), // borrowed
            _ => Object::Other,
        };
    }

    // Neither IRI nor literal (e.g. a blank node).
    if !s.starts_with('"') {
        return Object::Other;
    }

    // Literal object: unescape the value, then inspect what follows the quote.
    let (value, suffix) = match extract_quoted_string(s) {
        Some(parts) => parts,
        None => return Object::Other,
    };

    if let Some(lang) = suffix.strip_prefix('@') {
        Object::LangLiteral(value, lang)
    } else if let Some(type_part) = suffix.strip_prefix("^^<") {
        Object::TypedLiteral(value, type_part.trim_end_matches('>'))
    } else {
        // Empty or unrecognised suffix.
        Object::Literal(value)
    }
}

/// Extract the string content from a quoted N-Triples literal,
/// returning (unescaped_value, suffix_after_closing_quote).
///
/// `s` must start with `"`. The suffix is whatever follows the closing
/// quote (empty, `@lang`, `^^<type>`, …) and borrows from `s`.
///
/// Returns `None` when `s` does not start with `"` or when no unescaped
/// closing quote is found. Previously an unterminated literal returned the
/// whole input as a bogus "suffix".
///
/// Escapes handled: `\n`, `\t`, `\r`, `\"`, `\\`, `\uXXXX` and
/// `\UXXXXXXXX`. A `\u`/`\U` escape whose digits do not form a valid
/// Unicode scalar value is dropped; any other escape is copied through
/// verbatim, backslash included.
pub(super) fn extract_quoted_string(s: &str) -> Option<(String, &str)> {
    let s = s.strip_prefix('"')?;
    let mut value = String::new();
    let mut chars = s.char_indices();

    // `while let` rather than `for`: the escape arms pull extra characters
    // from the same iterator.
    while let Some((idx, ch)) = chars.next() {
        match ch {
            // Closing quote: everything after it is the suffix.
            '"' => return Some((value, &s[idx + 1..])),
            '\\' => {
                let next_ch = match chars.next() {
                    Some((_, c)) => c,
                    // Dangling backslash at end of input: unterminated.
                    None => break,
                };
                match next_ch {
                    'n' => value.push('\n'),
                    't' => value.push('\t'),
                    'r' => value.push('\r'),
                    '"' => value.push('"'),
                    '\\' => value.push('\\'),
                    'u' | 'U' => {
                        // \uXXXX takes 4 hex digits, \UXXXXXXXX takes 8.
                        let digits = if next_ch == 'u' { 4 } else { 8 };
                        let hex: String = chars.by_ref().take(digits).map(|(_, c)| c).collect();
                        if let Some(c) = u32::from_str_radix(&hex, 16)
                            .ok()
                            .and_then(char::from_u32)
                        {
                            value.push(c);
                        }
                    }
                    other => {
                        value.push('\\');
                        value.push(other);
                    }
                }
            }
            _ => value.push(ch),
        }
    }

    // Ran out of input without seeing the closing quote.
    None
}

// ─── Typed literal conversion ───────────────────────────────────────────────

// XSD datatype IRIs recognised by `typed_literal_to_value`.

/// `xsd:integer`.
pub(super) const XSD_INTEGER: &str = "http://www.w3.org/2001/XMLSchema#integer";
/// `xsd:decimal`.
pub(super) const XSD_DECIMAL: &str = "http://www.w3.org/2001/XMLSchema#decimal";
/// `xsd:double`.
pub(super) const XSD_DOUBLE: &str = "http://www.w3.org/2001/XMLSchema#double";
/// `xsd:float`.
pub(super) const XSD_FLOAT: &str = "http://www.w3.org/2001/XMLSchema#float";
/// `xsd:dateTime` — note the constant is named `XSD_DATE` but the IRI is
/// `dateTime`, not `xsd:date`.
pub(super) const XSD_DATE: &str = "http://www.w3.org/2001/XMLSchema#dateTime";
/// `xsd:boolean`.
pub(super) const XSD_BOOLEAN: &str = "http://www.w3.org/2001/XMLSchema#boolean";

/// Convert the lexical form of a typed literal into a `Value`, using its
/// datatype IRI.
///
/// * `xsd:integer` → `Value::Int64`
/// * `xsd:decimal` → `Value::Int64` when the text (after stripping leading
///   `+`) is a whole number that fits, otherwise `Value::Float64`
/// * `xsd:double` / `xsd:float` → `Value::Float64`
/// * `xsd:boolean` → `Value::Boolean` for `true`/`1`/`false`/`0`
/// * `xsd:dateTime` and every other datatype → `Value::String`
///
/// Whenever the text does not parse as its declared type, the original
/// `text` is returned unchanged as `Value::String`. That fallback is built
/// lazily, so the successful-parse path no longer allocates a throwaway
/// `String`.
pub(super) fn typed_literal_to_value(text: &str, type_uri: &str) -> Value {
    // Shared fallback: keep the lexical form verbatim.
    let as_string = || Value::String(text.to_string());

    match type_uri {
        XSD_INTEGER => text
            .parse::<i64>()
            .map(Value::Int64)
            .unwrap_or_else(|_| as_string()),
        XSD_DECIMAL => {
            // Wikidata decimals often have leading "+"
            let cleaned = text.trim_start_matches('+');
            if let Ok(i) = cleaned.parse::<i64>() {
                Value::Int64(i)
            } else if let Ok(f) = cleaned.parse::<f64>() {
                Value::Float64(f)
            } else {
                as_string()
            }
        }
        XSD_DOUBLE | XSD_FLOAT => text
            .parse::<f64>()
            .map(Value::Float64)
            .unwrap_or_else(|_| as_string()),
        XSD_BOOLEAN => match text {
            "true" | "1" => Value::Boolean(true),
            "false" | "0" => Value::Boolean(false),
            _ => as_string(),
        },
        // Date-times are not parsed; they are kept in their lexical form.
        XSD_DATE => as_string(),
        // GeoSPARQL WKT literals, etc. — keep as string
        _ => as_string(),
    }
}

// ─── Language filtering ─────────────────────────────────────────────────────

/// Returns `true` when `lang` passes the language filter.
///
/// A `None` filter accepts every language; a `Some` filter accepts only
/// tags that are members of the set (exact string match).
pub(super) fn language_matches(lang: &str, filter: &Option<HashSet<String>>) -> bool {
    filter
        .as_ref()
        .map_or(true, |allowed| allowed.contains(lang))
}

/// Extract text from a literal object, respecting language filter.
pub(super) fn extract_lang_text(
    object: &Object<'_>,
    languages: &Option<HashSet<String>>,
) -> Option<String> {
    match object {
        Object::LangLiteral(text, lang) => {
            if language_matches(lang, languages) {
                Some(text.clone())
            } else {
                None
            }
        }
        Object::Literal(text) => Some(text.clone()),
        _ => None,
    }
}