syntext 1.1.1

Hybrid code search index for agent workflows
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
//! Sparse n-gram tokenizer.
//!
//! Extracts variable-length grams from byte sequences using two tiers of
//! boundary detection:
//!
//! 1. **Forced boundaries**: characters that always delimit tokens in source
//!    code (whitespace, punctuation, operators, underscore). These are
//!    context-independent: the same byte always produces a boundary regardless
//!    of surrounding content.
//! 2. **Weight-based boundaries**: within alphanumeric spans, a pre-trained
//!    byte-pair frequency table provides additional subdivision at rare bigrams.
//!
//! # Why forced boundaries exist
//!
//! The original weight-only approach was context-sensitive: boundaries depended
//! on surrounding bytes, so a query's edge grams could differ from the same
//! bytes' grams in a document. Common separators like `space->letter` had
//! weights below `BOUNDARY_THRESHOLD`, causing false negatives. Forced
//! boundaries eliminate this for all token-aligned queries.
//!
//! # Index time: `build_all`
//!
//! Emits hashes of all consecutive-boundary spans with length >= `MIN_GRAM_LEN`.
//! Spans shorter than `MIN_GRAM_LEN` are omitted (no gram can cover them; the
//! verifier handles matches that fall entirely in short spans).
//!
//! # Query time: `build_covering`
//!
//! Greedy left-to-right: emits one gram per consecutive-boundary span that is
//! >= `MIN_GRAM_LEN`. Returns `None` if no such spans exist (full scan required).

/// Weight table for bigram frequencies.
pub mod weights;

/// Query-time covering set extraction (build_covering, build_covering_inner).
mod covering;
pub use covering::{build_covering, build_covering_inner};

use weights::BIGRAM_WEIGHTS;
use xxhash_rust::xxh64::xxh64;

/// Bigram weight threshold. Bigrams with weight >= this are gram boundaries.
///
/// Calibrated against the trained weight table so that underscore-separated
/// code identifiers (snake_case) produce natural gram splits, while common
/// code keywords ("function", "return", "import") remain as single grams.
/// At 28000: `'_'→'q'` (30797) is a boundary, but `'q'→'u'` (17728) is not,
/// so "parse_query" → ["parse_", "query"] rather than one long gram.
///
/// Note: `'_'` is also a forced boundary (see `is_forced_boundary`), so at
/// runtime this threshold only decides splits *within* alphanumeric spans.
///
/// Tune this value if gram quality is poor (too many short grams → lower it;
/// too many long grams → raise it).
pub const BOUNDARY_THRESHOLD: u16 = 28000;

/// Minimum gram length in bytes. Shorter spans are not indexed; the query
/// verifier covers matches that fall entirely inside such short spans.
pub const MIN_GRAM_LEN: usize = 3;

/// Maximum gram length in bytes. Spans longer than this are skipped entirely
/// (no gram is emitted — see `append_grams_for_boundaries`): very long tokens
/// are not selective and would waste posting-list space. The verifier handles
/// any matches that fall inside skipped spans.
pub const MAX_GRAM_LEN: usize = 128;

// ---------------------------------------------------------------------------
// Thread-local buffer shrink policy
// ---------------------------------------------------------------------------

/// Minimum capacity retained by boundary-position buffers (`Vec<usize>`).
/// Small enough that idle rayon workers don't hold noticeable memory, large
/// enough to avoid reallocating for typical source files.
const BOUNDARY_BUF_MIN_CAPACITY: usize = 256;

/// Minimum capacity retained by the lowercase copy buffer (`Vec<u8>`).
/// Larger than `BOUNDARY_BUF_MIN_CAPACITY` because this buffer stores one
/// byte per input byte (vs. one `usize` per boundary position).
const LOWER_BUF_MIN_CAPACITY: usize = 4096;

/// Shrink trigger multiplier. When a buffer's capacity exceeds the larger of
/// its minimum and `needed * SHRINK_TRIGGER_FACTOR`, it is shrunk back to
/// `max(minimum, needed)`. The 4x factor prevents shrink/grow churn when file
/// sizes vary by less than 4:1 within a batch.
const SHRINK_TRIGGER_FACTOR: usize = 4;

// ---------------------------------------------------------------------------
// T014: Gram hash
// ---------------------------------------------------------------------------

/// Hash a gram (variable-length byte slice) to a `u64` dictionary key.
///
/// Wraps xxhash64 with a fixed seed of 0. Index-time and query-time hashing
/// must use the identical seed, or dictionary lookups would never match.
#[inline]
pub fn gram_hash(gram: &[u8]) -> u64 {
    // Fixed seed shared by the index and query paths.
    const GRAM_HASH_SEED: u64 = 0;
    xxh64(gram, GRAM_HASH_SEED)
}

// ---------------------------------------------------------------------------
// Internal boundary detection (two-tier: forced + weight-based)
// ---------------------------------------------------------------------------

/// Characters that always create gram boundaries regardless of bigram weight.
///
/// These are the natural token delimiters common to mainstream source
/// languages. Forced boundaries are context-independent — a delimiter byte
/// splits no matter what surrounds it — which keeps query grams aligned with
/// document grams.
#[inline]
fn is_forced_boundary(byte: u8) -> bool {
    // Every ASCII control byte delimits. This covers \t, \n, \r, vertical
    // tab (0x0B), form feed (0x0C), 0x00-0x08, 0x0E-0x1F, and DEL (0x7F).
    if byte.is_ascii_control() {
        return true;
    }
    // Printable delimiters: space, brackets/grouping, statement punctuation,
    // operators, string/char delimiters, sigils, and underscore (critical
    // for snake_case identifiers).
    const PRINTABLE_DELIMS: &[u8] = b" (){}[]<>.,:;=+-*/%!&|^~\"'`@#$?_";
    PRINTABLE_DELIMS.contains(&byte)
}

/// Returns the sorted list of boundary positions in `bytes`.
///
/// Position 0 and `bytes.len()` are always included. An interior position
/// `i` is a boundary when any tier fires (tiers short-circuit in order):
/// 1. Forced: `bytes[i]` or `bytes[i - 1]` is a delimiter byte.
/// 2. Camel-case: ASCII lowercase at `i - 1` followed by ASCII uppercase at `i`.
/// 3. Weight-based: the case-folded bigram's weight reaches `BOUNDARY_THRESHOLD`.
fn boundary_positions(bytes: &[u8]) -> Vec<usize> {
    let n = bytes.len();
    let mut positions = Vec::with_capacity(n / 4);
    positions.push(0);

    for (i, pair) in bytes.windows(2).enumerate() {
        let (prev, cur) = (pair[0], pair[1]);
        // Tiers 1-3; `||` short-circuits so the weight lookup only happens
        // for interior positions that neither earlier tier already split.
        let split = is_forced_boundary(cur)
            || is_forced_boundary(prev)
            || (prev.is_ascii_lowercase() && cur.is_ascii_uppercase())
            || BIGRAM_WEIGHTS[(prev.to_ascii_lowercase() as usize) << 8
                | cur.to_ascii_lowercase() as usize]
                >= BOUNDARY_THRESHOLD;
        if split {
            // `pair` starts at byte i, so the boundary sits at i + 1.
            positions.push(i + 1);
        }
    }

    if n > 0 {
        positions.push(n);
    }
    // Defensive: collapse any accidental duplicates at the same position.
    positions.dedup();
    positions
}

/// Variant of `boundary_positions` for input that is already lowercase:
/// skips the per-byte `to_ascii_lowercase()` calls, and the CamelCase tier
/// is unnecessary because lowercase input contains no uppercase bytes.
#[cfg(test)]
fn boundary_positions_lower(bytes: &[u8]) -> Vec<usize> {
    let mut positions = Vec::with_capacity(bytes.len() / 4);
    positions.push(0);

    for (i, pair) in bytes.windows(2).enumerate() {
        let (prev, cur) = (pair[0], pair[1]);
        if is_forced_boundary(cur)
            || is_forced_boundary(prev)
            || BIGRAM_WEIGHTS[(prev as usize) << 8 | cur as usize] >= BOUNDARY_THRESHOLD
        {
            positions.push(i + 1);
        }
    }

    if !bytes.is_empty() {
        positions.push(bytes.len());
    }
    positions.dedup();
    positions
}

/// Thread-local buffered variant of `boundary_positions_lower` using a
/// callback pattern so the boundary list never has to be cloned out of the
/// buffer.
///
/// `f` receives a `&[usize]` slice that is valid only while `f` runs.
/// Compared to returning an owned Vec, this removes one allocation per call
/// on the `build_all` hot path during index construction.
fn with_boundary_positions_lower<F, R>(bytes: &[u8], f: F) -> R
where
    F: FnOnce(&[usize]) -> R,
{
    thread_local! {
        static BUF: std::cell::RefCell<Vec<usize>> = std::cell::RefCell::new(Vec::with_capacity(256));
    }
    BUF.with(|cell| {
        let mut positions = cell.borrow_mut();
        positions.clear();

        // Release memory when the retained capacity dwarfs this input's
        // needs. Bounds per-thread memory in rayon workers that handled big
        // files early in a batch and small files afterward.
        let needed = bytes.len() / 4 + 16;
        let trigger = BOUNDARY_BUF_MIN_CAPACITY.max(needed * SHRINK_TRIGGER_FACTOR);
        if positions.capacity() > trigger {
            positions.shrink_to(BOUNDARY_BUF_MIN_CAPACITY.max(needed));
        }

        positions.push(0);
        for (i, pair) in bytes.windows(2).enumerate() {
            let (prev, cur) = (pair[0], pair[1]);
            // Two tiers only: input is already lowercase, so no CamelCase.
            if is_forced_boundary(cur)
                || is_forced_boundary(prev)
                || BIGRAM_WEIGHTS[(prev as usize) << 8 | cur as usize] >= BOUNDARY_THRESHOLD
            {
                positions.push(i + 1);
            }
        }
        if !bytes.is_empty() {
            positions.push(bytes.len());
        }
        positions.dedup();
        f(&positions)
    })
}

// ---------------------------------------------------------------------------
// T012: build_all -- index-time gram extraction
// ---------------------------------------------------------------------------

thread_local! {
    // Reusable buffer holding the lowercase copy of `input` in `build_all`.
    // Eliminates one Vec<u8> allocation per file on the index-build hot path.
    // The shrink policy (LOWER_BUF_MIN_CAPACITY / SHRINK_TRIGGER_FACTOR) is
    // applied inside `build_all` before each refill, so idle rayon workers
    // don't pin large buffers after processing a big file.
    static LOWER_BUF: std::cell::RefCell<Vec<u8>> =
        std::cell::RefCell::new(Vec::with_capacity(4096));
}

/// Extract all sparse n-grams from `input` for index construction.
///
/// Lowercases the input into a reusable thread-local buffer, computes
/// boundary positions on the lowercased bytes, and hashes every
/// consecutive-boundary span whose length lies in
/// `[MIN_GRAM_LEN, MAX_GRAM_LEN]`. CamelCase transitions in the *original*
/// bytes then contribute any extra sub-spans the lowercase pass could not
/// see, without disturbing the token-aligned grams already emitted.
///
/// Returns an unordered list of hashes. Duplicates are possible; callers
/// should deduplicate (e.g. into a `HashSet`) before writing to the segment
/// dictionary.
///
/// # Example
///
/// ```
/// let grams = syntext::tokenizer::build_all(b"parse_query");
/// // Forced boundary at '_' splits into "parse" and "query".
/// assert!(!grams.is_empty());
/// ```
pub fn build_all(input: &[u8]) -> Vec<u64> {
    if input.is_empty() {
        return Vec::new();
    }

    LOWER_BUF.with(|cell| {
        {
            // Scope the mutable borrow: refill the reusable lowercase buffer,
            // shrinking first if its capacity far exceeds this input (same
            // policy shape as with_boundary_positions_lower, larger floor).
            let mut scratch = cell.borrow_mut();
            scratch.clear();
            let trigger = LOWER_BUF_MIN_CAPACITY.max(input.len() * SHRINK_TRIGGER_FACTOR);
            if scratch.capacity() > trigger {
                scratch.shrink_to(LOWER_BUF_MIN_CAPACITY.max(input.len()));
            }
            scratch.extend(input.iter().map(u8::to_ascii_lowercase));
        } // mutable borrow ends before the shared borrow below

        let lowered = cell.borrow();
        with_boundary_positions_lower(&lowered, |base| {
            let mut hashes = Vec::new();
            append_grams_for_boundaries(&mut hashes, &lowered, base);

            // Add only the spans unlocked by lowercase->uppercase transitions
            // in the original bytes; token-aligned grams above are untouched.
            let camel = camel_case_boundaries(input);
            if !camel.is_empty() {
                let merged = merge_boundaries(base, &camel);
                append_new_grams_for_boundaries(&mut hashes, &lowered, base, &merged);
            }

            hashes
        })
    })
}

/// Hash every consecutive-boundary span of `lower` whose length lies in
/// `[MIN_GRAM_LEN, MAX_GRAM_LEN]` and push the hashes onto `hashes`.
///
/// Shorter spans emit nothing (queries falling wholly inside them fall back
/// to a full scan); longer spans are also skipped, since very long tokens
/// are unselective and would waste posting-list space. The verifier handles
/// matches in both kinds of skipped span.
fn append_grams_for_boundaries(hashes: &mut Vec<u64>, lower: &[u8], boundaries: &[usize]) {
    hashes.extend(
        boundaries
            .windows(2)
            .filter(|w| (MIN_GRAM_LEN..=MAX_GRAM_LEN).contains(&(w[1] - w[0])))
            .map(|w| gram_hash(&lower[w[0]..w[1]])),
    );
}

/// Positions `i` where an ASCII lowercase byte at `i - 1` is followed by an
/// ASCII uppercase byte at `i` — i.e. CamelCase word transitions.
fn camel_case_boundaries(bytes: &[u8]) -> Vec<usize> {
    bytes
        .windows(2)
        .enumerate()
        .filter(|(_, pair)| pair[0].is_ascii_lowercase() && pair[1].is_ascii_uppercase())
        .map(|(i, _)| i + 1)
        .collect()
}

/// Merge two ascending boundary lists into one ascending, deduplicated list.
///
/// Classic two-pointer merge over peekable iterators: the smaller head
/// advances at each step (both advance on a tie so the value appears once),
/// and consecutive duplicates are suppressed against the last pushed value.
fn merge_boundaries(base: &[usize], extra: &[usize]) -> Vec<usize> {
    let mut merged = Vec::with_capacity(base.len() + extra.len());
    let mut lhs = base.iter().copied().peekable();
    let mut rhs = extra.iter().copied().peekable();

    loop {
        let next = match (lhs.peek().copied(), rhs.peek().copied()) {
            (Some(a), Some(b)) if a <= b => {
                lhs.next();
                if a == b {
                    rhs.next(); // tie: consume both sides
                }
                a
            }
            (Some(_), Some(b)) => {
                rhs.next();
                b
            }
            (Some(a), None) => {
                lhs.next();
                a
            }
            (None, Some(b)) => {
                rhs.next();
                b
            }
            (None, None) => break,
        };

        // Defensive dedup: skip a value equal to the one just emitted.
        if merged.last() != Some(&next) {
            merged.push(next);
        }
    }

    merged
}

/// Hash the merged-boundary spans that are *not* already base-boundary spans
/// and push them onto `hashes`.
///
/// Both boundary lists are ascending, so their `(start, end)` windows sort
/// ascending too; a single peekable cursor over the base windows suffices to
/// detect which merged windows were already emitted by the base pass. Only
/// new spans with length in `[MIN_GRAM_LEN, MAX_GRAM_LEN]` produce grams.
fn append_new_grams_for_boundaries(
    hashes: &mut Vec<u64>,
    lower: &[u8],
    base_boundaries: &[usize],
    merged_boundaries: &[usize],
) {
    let mut base_cursor = base_boundaries
        .windows(2)
        .map(|w| (w[0], w[1]))
        .peekable();

    for w in merged_boundaries.windows(2) {
        let pair = (w[0], w[1]);
        // Advance the base cursor past windows that sort before `pair`.
        while base_cursor.peek().map_or(false, |&b| b < pair) {
            base_cursor.next();
        }
        // Identical to a base window: already emitted, skip.
        if base_cursor.peek() == Some(&pair) {
            continue;
        }

        let len = pair.1 - pair.0;
        if (MIN_GRAM_LEN..=MAX_GRAM_LEN).contains(&len) {
            hashes.push(gram_hash(&lower[pair.0..pair.1]));
        }
    }
}

// ---------------------------------------------------------------------------
// Tests (T015 live in tests/unit/tokenizer.rs)
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests;