Skip to main content

gobby_code/search/fts/
content.rs

1use postgres::Client;
2use postgres::Row;
3
4use crate::config::{Context, ProjectIndexScope};
5use crate::models::ContentSearchHit;
6use crate::visibility::TOMBSTONE_LANGUAGE;
7
8use super::common::{
9    PgParam, bm25_score_expr, param_refs, push_param, push_path_filter, sanitize_pg_search_query,
10    trusted_row_id,
11};
12
13fn content_bm25_order_by_sql(tiebreakers: &[&str]) -> String {
14    let row_id = trusted_row_id("c.id");
15    let mut order_by = format!("{} DESC", bm25_score_expr(&row_id));
16    for tiebreaker in tiebreakers {
17        order_by.push_str(", ");
18        order_by.push_str(tiebreaker);
19    }
20    order_by
21}
22
23/// Full-text search across file content chunks.
24pub fn search_content(
25    conn: &mut Client,
26    query: &str,
27    project_id: &str,
28    language: Option<&str>,
29    paths: &[String],
30    limit: usize,
31) -> Vec<ContentSearchHit> {
32    if query.trim().is_empty() || limit == 0 {
33        return Vec::new();
34    }
35
36    let bm25_query = sanitize_pg_search_query(query);
37    if bm25_query.is_empty() {
38        eprintln!(
39            "gcode: content BM25 search skipped because query contains no pg_search terms; use `gcode grep` for exact text"
40        );
41        return Vec::new();
42    }
43
44    let mut params = Vec::new();
45    let query_placeholder = push_param(&mut params, bm25_query);
46    let project_placeholder = push_param(&mut params, project_id.to_string());
47    let mut conditions = vec![
48        format!("c.content @@@ {query_placeholder}"),
49        format!("c.project_id = {project_placeholder}"),
50    ];
51    if let Some(lang) = language {
52        let placeholder = push_param(&mut params, lang.to_string());
53        conditions.push(format!("c.language = {placeholder}"));
54    }
55    push_path_filter(&mut conditions, &mut params, "c", paths);
56    let limit_placeholder = push_param(&mut params, limit as i64);
57    let order_by = content_bm25_order_by_sql(&["c.id ASC"]);
58    let refs = param_refs(&params);
59    let sql = format!(
60        "SELECT c.file_path,
61                c.line_start::BIGINT AS line_start,
62                c.line_end::BIGINT AS line_end,
63                c.language,
64                c.content
65         FROM code_content_chunks c
66         JOIN code_indexed_files cf
67           ON cf.project_id = c.project_id AND cf.file_path = c.file_path
68         WHERE {}
69         ORDER BY {order_by}
70         LIMIT {limit_placeholder}",
71        conditions.join(" AND ")
72    );
73
74    match conn.query(&sql, &refs) {
75        Ok(rows) => content_hits_from_rows(&rows, query),
76        Err(error) => {
77            eprintln!("gcode: content BM25 search failed; pg_search is required: {error}");
78            Vec::new()
79        }
80    }
81}
82
83pub fn search_content_visible(
84    conn: &mut Client,
85    query: &str,
86    ctx: &Context,
87    language: Option<&str>,
88    paths: &[String],
89    limit: usize,
90) -> Vec<ContentSearchHit> {
91    if query.trim().is_empty() || limit == 0 {
92        return Vec::new();
93    }
94
95    let bm25_query = sanitize_pg_search_query(query);
96    if bm25_query.is_empty() {
97        eprintln!(
98            "gcode: visible content BM25 search skipped because query contains no pg_search terms; use `gcode grep` for exact text"
99        );
100        return Vec::new();
101    }
102
103    let mut params = Vec::new();
104    let visible_files_sql = visible_files_sql(ctx, &mut params);
105    let query_placeholder = push_param(&mut params, bm25_query);
106    let mut conditions = vec![format!("c.content @@@ {query_placeholder}")];
107    if let Some(lang) = language {
108        let placeholder = push_param(&mut params, lang.to_string());
109        conditions.push(format!("c.language = {placeholder}"));
110    }
111    push_path_filter(&mut conditions, &mut params, "c", paths);
112    let limit_placeholder = push_param(&mut params, limit as i64);
113    let order_by = content_bm25_order_by_sql(&["c.project_id ASC", "c.id ASC"]);
114    let refs = param_refs(&params);
115    let sql = format!(
116        "WITH visible_files AS ({visible_files_sql})
117         SELECT c.file_path,
118                c.line_start::BIGINT AS line_start,
119                c.line_end::BIGINT AS line_end,
120                c.language,
121                c.content
122         FROM code_content_chunks c
123         JOIN visible_files vf
124           ON vf.project_id = c.project_id AND vf.file_path = c.file_path
125         WHERE {}
126         ORDER BY {order_by}
127         LIMIT {limit_placeholder}",
128        conditions.join(" AND ")
129    );
130
131    match conn.query(&sql, &refs) {
132        Ok(rows) => content_hits_from_rows(&rows, query),
133        Err(error) => {
134            eprintln!("gcode: visible content BM25 search failed; pg_search is required: {error}");
135            Vec::new()
136        }
137    }
138}
139
140fn visible_files_sql(ctx: &Context, params: &mut Vec<PgParam>) -> String {
141    match &ctx.index_scope {
142        ProjectIndexScope::Single => {
143            let project_placeholder = push_param(params, ctx.project_id.clone());
144            let tombstone_placeholder = push_param(params, TOMBSTONE_LANGUAGE.to_string());
145            format!(
146                "SELECT file_path, project_id
147                 FROM code_indexed_files
148                 WHERE project_id = {project_placeholder}
149                   AND language != {tombstone_placeholder}"
150            )
151        }
152        ProjectIndexScope::Overlay {
153            overlay_project_id,
154            parent_project_id,
155            ..
156        } => {
157            let overlay_placeholder = push_param(params, overlay_project_id.clone());
158            let parent_placeholder = push_param(params, parent_project_id.clone());
159            let tombstone_placeholder = push_param(params, TOMBSTONE_LANGUAGE.to_string());
160            format!(
161                "SELECT file_path, project_id
162                 FROM code_indexed_files
163                 WHERE project_id = {overlay_placeholder}
164                   AND language != {tombstone_placeholder}
165                 UNION ALL
166                 SELECT pf.file_path, pf.project_id
167                 FROM code_indexed_files pf
168                 WHERE pf.project_id = {parent_placeholder}
169                   AND pf.language != {tombstone_placeholder}
170                   AND NOT EXISTS (
171                       SELECT 1 FROM code_indexed_files of
172                       WHERE of.project_id = {overlay_placeholder}
173                         AND of.file_path = pf.file_path
174                   )"
175            )
176        }
177    }
178}
179
180fn content_hits_from_rows(rows: &[Row], query: &str) -> Vec<ContentSearchHit> {
181    let tokens = snippet_tokens(query);
182    rows.iter()
183        .filter_map(|row| {
184            let content: String = row.try_get("content").ok()?;
185            let line_start = usize::try_from(row.try_get::<_, i64>("line_start").ok()?).ok()?;
186            let line_end = usize::try_from(row.try_get::<_, i64>("line_end").ok()?).ok()?;
187            Some(ContentSearchHit {
188                file_path: row.try_get("file_path").ok()?,
189                line_start,
190                line_end,
191                snippet: make_snippet_with_tokens(&content, &tokens),
192                language: row.try_get("language").ok()?,
193            })
194        })
195        .collect()
196}
197
/// Test-only convenience wrapper: tokenizes `query` and builds the snippet for `content`.
#[cfg(test)]
pub(super) fn make_snippet(content: &str, query: &str) -> String {
    make_snippet_with_tokens(content, &snippet_tokens(query))
}
203
/// Splits `query` on whitespace and lowercases each piece for snippet matching.
///
/// Tokens are lowercased one `char` at a time, the same way
/// `lowercase_with_original_char_map` lowercases the searched content. `str::to_lowercase` is
/// deliberately avoided: it applies the context-sensitive final-sigma rule (`Σ` becomes `ς` at
/// the end of a word), so a query such as "ΟΔΟΣ" would never match identical content that was
/// lowercased per char (`Σ` becomes `σ`).
///
/// `split_whitespace` never yields empty pieces, so no extra filtering is needed.
fn snippet_tokens(query: &str) -> Vec<String> {
    query
        .split_whitespace()
        .map(|token| {
            token
                .chars()
                .flat_map(char::to_lowercase)
                .collect::<String>()
        })
        .collect()
}
211
212fn make_snippet_with_tokens(content: &str, tokens: &[String]) -> String {
213    let (lower_content, lower_byte_to_original_char) = lowercase_with_original_char_map(content);
214    let match_at = tokens
215        .iter()
216        .filter_map(|token| {
217            lower_content
218                .find(token)
219                .and_then(|byte_index| lower_byte_to_original_char.get(byte_index).copied())
220        })
221        .min();
222    let match_at = match_at.unwrap_or(0);
223    let start = match_at.saturating_sub(60);
224    let content_len = content.chars().count();
225    let end = match_at.saturating_add(120).min(content_len);
226    content.chars().skip(start).take(end - start).collect()
227}
228
/// Lowercases `content` char by char.
///
/// Returns the lowercased text together with a table holding one entry per byte of that text:
/// entry `i` is the index (in chars) of the character of `content` that produced byte `i`.
fn lowercase_with_original_char_map(content: &str) -> (String, Vec<usize>) {
    // Lowercasing can expand a char into more bytes than it occupied, so over-reserve up front.
    let capacity = content.len().saturating_mul(2);
    let mut lowered = String::with_capacity(capacity);
    let mut origin_of_byte: Vec<usize> = Vec::with_capacity(capacity);
    content
        .chars()
        .enumerate()
        .for_each(|(char_index, source_char)| {
            for folded in source_char.to_lowercase() {
                lowered.push(folded);
                // Both buffers had equal length before the push, so growing the table to the
                // new text length adds exactly one entry per UTF-8 byte just appended.
                origin_of_byte.resize(lowered.len(), char_index);
            }
        });
    (lowered, origin_of_byte)
}
245
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `sql` scores with `pdb.score(c.id)` and never mentions `pg_search.score`.
    fn assert_uses_pdb_score(sql: &str) {
        let uses_pdb = sql.contains("pdb.score(c.id)");
        let uses_legacy = sql.contains("pg_search.score");
        assert!(uses_pdb);
        assert!(!uses_legacy);
    }

    /// Single-project ordering: score first, then chunk id.
    #[test]
    fn content_bm25_order_by_uses_pdb_score() {
        let order_by = content_bm25_order_by_sql(&["c.id ASC"]);

        assert_uses_pdb_score(&order_by);
        assert_eq!(order_by, "pdb.score(c.id) DESC, c.id ASC");
    }

    /// Visible-scope ordering: score first, then project id, then chunk id.
    #[test]
    fn visible_content_bm25_order_by_uses_pdb_score() {
        let order_by = content_bm25_order_by_sql(&["c.project_id ASC", "c.id ASC"]);

        assert_uses_pdb_score(&order_by);
        assert_eq!(
            order_by,
            "pdb.score(c.id) DESC, c.project_id ASC, c.id ASC"
        );
    }
}