// cairn_text/lib.rs

1//! Text indexing (tantivy) + autocomplete + geo-bias + fuzzy.
2//!
3//! Phase 2.5 scope:
4//! - Single doc per Place, multi-value name field carries every localized name.
5//! - `name_prefix` carries prefix-ngram terms for autocomplete.
6//! - Stored fields hydrate hits: place_id, level, kind, lon, lat, admin_path.
7//! - Search supports a layer filter (kind allowlist), fuzzy edit distance
8//!   (forward mode), and a focus point that re-ranks top candidates by
9//!   distance.
10
11use cairn_place::{Coord, Place, PlaceKind};
12use serde::{Deserialize, Serialize};
13use std::path::Path;
14use tantivy::collector::TopDocs;
15use tantivy::query::{BooleanQuery, FuzzyTermQuery, Occur, Query, QueryParser, TermQuery};
16use tantivy::schema::{
17    Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED,
18    STRING, TEXT,
19};
20use tantivy::tokenizer::{LowerCaser, NgramTokenizer, RemoveLongFilter, TextAnalyzer};
21use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy, TantivyDocument, Term};
22use thiserror::Error;
23use tracing::debug;
24
/// Name under which the prefix n-gram analyzer is registered on an index and
/// by which the `name_prefix` field refers to it.
const PREFIX_TOKENIZER: &str = "cairn_prefix";
/// Minimum gram length handed to `NgramTokenizer::prefix_only`.
const PREFIX_MIN: usize = 1;
/// Maximum gram length handed to `NgramTokenizer::prefix_only`.
const PREFIX_MAX: usize = 25;
/// Memory budget, in bytes, passed to `Index::writer` when building (64 MiB).
const WRITER_HEAP: usize = 64 << 20;
/// With a focus point, this many times `limit` candidates are fetched before
/// the distance re-rank.
const RERANK_MULTIPLIER: usize = 5;
/// Upper bound applied to the caller-supplied fuzzy edit distance.
const MAX_FUZZY_DISTANCE: u8 = 2;
31
32#[derive(Debug, Error)]
33pub enum TextError {
34    #[error("tantivy: {0}")]
35    Tantivy(#[from] tantivy::TantivyError),
36    #[error("query: {0}")]
37    Query(#[from] tantivy::query::QueryParserError),
38    #[error("io: {0}")]
39    Io(#[from] std::io::Error),
40}
41
42#[derive(Clone, Debug, Serialize)]
43pub struct Hit {
44    pub place_id: u64,
45    pub name: String,
46    pub kind: String,
47    pub level: u64,
48    pub lon: f64,
49    pub lat: f64,
50    pub score: f32,
51    pub admin_path: Vec<u64>,
52    #[serde(skip_serializing_if = "Option::is_none")]
53    pub distance_km: Option<f64>,
54}
55
56#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
57pub enum SearchMode {
58    #[default]
59    Search,
60    Autocomplete,
61}
62
63#[derive(Clone, Debug)]
64pub struct SearchOptions {
65    pub mode: SearchMode,
66    pub limit: usize,
67    /// Maximum edit distance for fuzzy matching. 0 disables. Capped at 2.
68    /// Only honored when `mode == Search`.
69    pub fuzzy: u8,
70    /// Restrict results to these `kind` values (empty = no filter).
71    pub layers: Vec<String>,
72    /// Focus point used to re-rank top candidates by distance.
73    pub focus: Option<Coord>,
74    /// Weight for the distance penalty in the geo-bias re-rank step.
75    /// final_score = bm25 / (1 + focus_weight * km).
76    pub focus_weight: f64,
77}
78
79impl Default for SearchOptions {
80    fn default() -> Self {
81        Self {
82            mode: SearchMode::Search,
83            limit: 10,
84            fuzzy: 0,
85            layers: Vec::new(),
86            focus: None,
87            focus_weight: 0.5,
88        }
89    }
90}
91
92struct TextSchema {
93    schema: Schema,
94    name: Field,
95    name_prefix: Field,
96    place_id: Field,
97    level: Field,
98    kind: Field,
99    lon: Field,
100    lat: Field,
101    admin_path: Field,
102}
103
104impl TextSchema {
105    fn build() -> Self {
106        let mut sb = Schema::builder();
107        let prefix_indexing = TextFieldIndexing::default()
108            .set_tokenizer(PREFIX_TOKENIZER)
109            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
110        let prefix_options = TextOptions::default().set_indexing_options(prefix_indexing);
111
112        let name = sb.add_text_field("name", TEXT | STORED);
113        let name_prefix = sb.add_text_field("name_prefix", prefix_options);
114        let place_id = sb.add_u64_field("place_id", FAST | STORED | INDEXED);
115        let level = sb.add_u64_field("level", FAST | STORED | INDEXED);
116        let kind = sb.add_text_field("kind", STRING | STORED);
117        let lon = sb.add_f64_field("lon", STORED);
118        let lat = sb.add_f64_field("lat", STORED);
119        let admin_path = sb.add_u64_field("admin_path", STORED);
120        let schema = sb.build();
121        Self {
122            schema,
123            name,
124            name_prefix,
125            place_id,
126            level,
127            kind,
128            lon,
129            lat,
130            admin_path,
131        }
132    }
133}
134
135fn register_prefix_tokenizer(index: &Index) -> Result<(), TextError> {
136    let tokenizer = TextAnalyzer::builder(
137        NgramTokenizer::prefix_only(PREFIX_MIN, PREFIX_MAX)
138            .map_err(|e| tantivy::TantivyError::SystemError(format!("{e:?}")))?,
139    )
140    .filter(LowerCaser)
141    .filter(RemoveLongFilter::limit(64))
142    .build();
143    index.tokenizers().register(PREFIX_TOKENIZER, tokenizer);
144    Ok(())
145}
146
147pub fn kind_str(kind: PlaceKind) -> &'static str {
148    match kind {
149        PlaceKind::Country => "country",
150        PlaceKind::Region => "region",
151        PlaceKind::County => "county",
152        PlaceKind::City => "city",
153        PlaceKind::District => "district",
154        PlaceKind::Neighborhood => "neighborhood",
155        PlaceKind::Street => "street",
156        PlaceKind::Address => "address",
157        PlaceKind::Poi => "poi",
158        PlaceKind::Postcode => "postcode",
159    }
160}
161
162/// Build a fresh tantivy index from a stream of [`Place`] values.
163pub fn build_index<I>(dir: &Path, places: I) -> Result<usize, TextError>
164where
165    I: IntoIterator<Item = Place>,
166{
167    if dir.exists() {
168        std::fs::remove_dir_all(dir)?;
169    }
170    std::fs::create_dir_all(dir)?;
171
172    let schema = TextSchema::build();
173    let index = Index::create_in_dir(dir, schema.schema.clone())?;
174    register_prefix_tokenizer(&index)?;
175    let mut writer: IndexWriter = index.writer(WRITER_HEAP)?;
176
177    let mut doc_count = 0usize;
178    for place in places {
179        if place.names.is_empty() {
180            continue;
181        }
182        let mut doc = TantivyDocument::default();
183        for n in &place.names {
184            doc.add_text(schema.name, &n.value);
185            doc.add_text(schema.name_prefix, &n.value);
186        }
187        doc.add_u64(schema.place_id, place.id.0);
188        doc.add_u64(schema.level, place.id.level() as u64);
189        doc.add_text(schema.kind, kind_str(place.kind));
190        doc.add_f64(schema.lon, place.centroid.lon);
191        doc.add_f64(schema.lat, place.centroid.lat);
192        for ancestor in &place.admin_path {
193            doc.add_u64(schema.admin_path, ancestor.0);
194        }
195        writer.add_document(doc)?;
196        doc_count += 1;
197    }
198    writer.commit()?;
199    debug!(docs = doc_count, "tantivy index committed");
200    Ok(doc_count)
201}
202
203pub struct TextIndex {
204    index: Index,
205    reader: IndexReader,
206    schema: TextSchema,
207}
208
209impl TextIndex {
210    pub fn open(dir: &Path) -> Result<Self, TextError> {
211        let index = Index::open_in_dir(dir)?;
212        register_prefix_tokenizer(&index)?;
213        let reader = index
214            .reader_builder()
215            .reload_policy(ReloadPolicy::Manual)
216            .try_into()?;
217        let schema = TextSchema::build();
218        Ok(Self {
219            index,
220            reader,
221            schema,
222        })
223    }
224
225    pub fn search(&self, query: &str, opts: &SearchOptions) -> Result<Vec<Hit>, TextError> {
226        let trimmed = query.trim();
227        if trimmed.is_empty() {
228            return Ok(Vec::new());
229        }
230
231        let text_q = self.build_text_query(trimmed, opts)?;
232        let combined = self.apply_layer_filter(text_q, &opts.layers);
233
234        let candidate_limit = if opts.focus.is_some() {
235            opts.limit
236                .saturating_mul(RERANK_MULTIPLIER)
237                .clamp(opts.limit, 200)
238        } else {
239            opts.limit
240        };
241
242        let searcher = self.reader.searcher();
243        let raw = searcher.search(&combined, &TopDocs::with_limit(candidate_limit))?;
244
245        let mut hits: Vec<Hit> = Vec::with_capacity(raw.len());
246        for (score, addr) in raw {
247            let doc: TantivyDocument = searcher.doc(addr)?;
248            hits.push(self.hit_from_doc(score, &doc));
249        }
250
251        if let Some(focus) = opts.focus {
252            apply_geo_bias(&mut hits, focus, opts.focus_weight);
253        }
254        hits.truncate(opts.limit);
255        Ok(hits)
256    }
257
258    fn build_text_query(
259        &self,
260        query: &str,
261        opts: &SearchOptions,
262    ) -> Result<Box<dyn Query>, TextError> {
263        let field = match opts.mode {
264            SearchMode::Search => self.schema.name,
265            SearchMode::Autocomplete => self.schema.name_prefix,
266        };
267
268        let fuzzy = opts.fuzzy.min(MAX_FUZZY_DISTANCE);
269        if fuzzy == 0 || matches!(opts.mode, SearchMode::Autocomplete) {
270            // Default to the QueryParser path so users keep tantivy's
271            // boolean / phrase syntax. Autocomplete also stays exact-prefix
272            // because mixing fuzzy + ngram explodes the term space.
273            let parser = QueryParser::for_index(&self.index, vec![field]);
274            return Ok(parser.parse_query(query)?);
275        }
276
277        // Forward search with fuzzy distance: union FuzzyTermQuery per token.
278        let lowered = query.to_lowercase();
279        let tokens: Vec<&str> = lowered.split_whitespace().collect();
280        if tokens.is_empty() {
281            let parser = QueryParser::for_index(&self.index, vec![field]);
282            return Ok(parser.parse_query(query)?);
283        }
284        let mut clauses: Vec<(Occur, Box<dyn Query>)> = Vec::with_capacity(tokens.len());
285        for tok in tokens {
286            let term = Term::from_field_text(field, tok);
287            let q = FuzzyTermQuery::new(term, fuzzy, true);
288            clauses.push((Occur::Should, Box::new(q) as Box<dyn Query>));
289        }
290        Ok(Box::new(BooleanQuery::new(clauses)))
291    }
292
293    fn apply_layer_filter(&self, text_q: Box<dyn Query>, layers: &[String]) -> Box<dyn Query> {
294        if layers.is_empty() {
295            return text_q;
296        }
297        let mut layer_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::with_capacity(layers.len());
298        for layer in layers {
299            let term = Term::from_field_text(self.schema.kind, layer);
300            let q = TermQuery::new(term, IndexRecordOption::Basic);
301            layer_clauses.push((Occur::Should, Box::new(q) as Box<dyn Query>));
302        }
303        let layer_q: Box<dyn Query> = Box::new(BooleanQuery::new(layer_clauses));
304        Box::new(BooleanQuery::new(vec![
305            (Occur::Must, text_q),
306            (Occur::Must, layer_q),
307        ]))
308    }
309
310    fn hit_from_doc(&self, score: f32, doc: &TantivyDocument) -> Hit {
311        let admin_path: Vec<u64> = doc
312            .get_all(self.schema.admin_path)
313            .filter_map(|v| v.as_u64())
314            .collect();
315        Hit {
316            place_id: doc
317                .get_first(self.schema.place_id)
318                .and_then(|v| v.as_u64())
319                .unwrap_or(0),
320            name: doc
321                .get_first(self.schema.name)
322                .and_then(|v| v.as_str())
323                .unwrap_or("")
324                .to_string(),
325            kind: doc
326                .get_first(self.schema.kind)
327                .and_then(|v| v.as_str())
328                .unwrap_or("")
329                .to_string(),
330            level: doc
331                .get_first(self.schema.level)
332                .and_then(|v| v.as_u64())
333                .unwrap_or(0),
334            lon: doc
335                .get_first(self.schema.lon)
336                .and_then(|v| v.as_f64())
337                .unwrap_or(0.0),
338            lat: doc
339                .get_first(self.schema.lat)
340                .and_then(|v| v.as_f64())
341                .unwrap_or(0.0),
342            score,
343            admin_path,
344            distance_km: None,
345        }
346    }
347}
348
349fn apply_geo_bias(hits: &mut [Hit], focus: Coord, weight: f64) {
350    let weight = weight.max(0.0);
351    for h in hits.iter_mut() {
352        let km = haversine_km(focus.lat, focus.lon, h.lat, h.lon);
353        h.distance_km = Some(km);
354        let blended = (h.score as f64) / (1.0 + weight * km);
355        h.score = blended as f32;
356    }
357    hits.sort_by(|a, b| {
358        b.score
359            .partial_cmp(&a.score)
360            .unwrap_or(std::cmp::Ordering::Equal)
361    });
362}
363
/// Great-circle distance in kilometres between two points given in degrees,
/// using the haversine formula on a sphere of radius 6371.0088 km.
///
/// NaN inputs produce NaN. For every finite input the result is finite: the
/// haversine term is clamped to `[0, 1]` before `asin`, because rounding can
/// push it marginally above 1 for (near-)antipodal points, which would
/// otherwise yield NaN.
fn haversine_km(lat1: f64, lon1: f64, lat2: f64, lon2: f64) -> f64 {
    const EARTH_KM: f64 = 6371.0088;
    let phi1 = lat1.to_radians();
    let phi2 = lat2.to_radians();
    let half_dphi = (lat2 - lat1).to_radians() / 2.0;
    let half_dlam = (lon2 - lon1).to_radians() / 2.0;
    let a = half_dphi.sin().powi(2) + phi1.cos() * phi2.cos() * half_dlam.sin().powi(2);
    2.0 * EARTH_KM * a.clamp(0.0, 1.0).sqrt().asin()
}
374
#[cfg(test)]
mod tests {
    use super::*;
    use cairn_place::{Coord, LocalizedName, PlaceId};

    /// Scratch directory for one test, deleted again when the guard drops.
    ///
    /// Declare it before the `TextIndex` that reads from it: locals drop in
    /// reverse order, so the index is released before the directory goes.
    struct TempIndexDir(std::path::PathBuf);

    impl TempIndexDir {
        fn path(&self) -> &std::path::Path {
            &self.0
        }
    }

    impl Drop for TempIndexDir {
        fn drop(&mut self) {
            // Best effort: a failed cleanup must not mask the test outcome.
            let _ = std::fs::remove_dir_all(&self.0);
        }
    }

    fn vaduz() -> Place {
        Place {
            id: PlaceId::new(1, 49509, 1).unwrap(),
            kind: PlaceKind::City,
            names: vec![
                LocalizedName {
                    lang: "default".into(),
                    value: "Vaduz".into(),
                },
                LocalizedName {
                    lang: "de".into(),
                    value: "Vaduz".into(),
                },
            ],
            centroid: Coord {
                lon: 9.5209,
                lat: 47.1410,
            },
            admin_path: vec![PlaceId::new(0, 49509, 1).unwrap()],
            tags: vec![],
        }
    }

    fn schaan() -> Place {
        Place {
            id: PlaceId::new(1, 49509, 2).unwrap(),
            kind: PlaceKind::City,
            names: vec![LocalizedName {
                lang: "default".into(),
                value: "Schaan".into(),
            }],
            centroid: Coord {
                lon: 9.5095,
                lat: 47.1650,
            },
            admin_path: vec![PlaceId::new(0, 49509, 1).unwrap()],
            tags: vec![],
        }
    }

    fn liechtenstein_country() -> Place {
        Place {
            id: PlaceId::new(0, 49509, 1).unwrap(),
            kind: PlaceKind::Country,
            names: vec![LocalizedName {
                lang: "default".into(),
                value: "Liechtenstein".into(),
            }],
            centroid: Coord {
                lon: 9.5594,
                lat: 47.1114,
            },
            admin_path: vec![],
            tags: vec![],
        }
    }

    /// Creates a unique scratch directory (pid + timestamp + counter).
    fn tempdir_for_test() -> TempIndexDir {
        use std::sync::atomic::{AtomicUsize, Ordering};
        static COUNTER: AtomicUsize = AtomicUsize::new(0);
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let n = COUNTER.fetch_add(1, Ordering::Relaxed);
        let d = std::env::temp_dir().join(format!(
            "cairn-text-test-{}-{}-{}",
            std::process::id(),
            nanos,
            n
        ));
        std::fs::create_dir_all(&d).unwrap();
        TempIndexDir(d)
    }

    /// Minimal hit for exercising the re-rank step without an index.
    fn hit_at(name: &str, lon: f64, lat: f64, score: f32) -> Hit {
        Hit {
            place_id: 0,
            name: name.to_string(),
            kind: "city".to_string(),
            level: 1,
            lon,
            lat,
            score,
            admin_path: Vec::new(),
            distance_km: None,
        }
    }

    #[test]
    fn build_and_search() {
        let dir = tempdir_for_test();
        let docs = build_index(dir.path(), vec![vaduz(), schaan()]).unwrap();
        assert_eq!(docs, 2);

        let idx = TextIndex::open(dir.path()).unwrap();
        let hits = idx.search("vaduz", &SearchOptions::default()).unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].name, "Vaduz");
        assert_eq!(hits[0].admin_path.len(), 1);
    }

    #[test]
    fn autocomplete_prefix() {
        let dir = tempdir_for_test();
        build_index(dir.path(), vec![vaduz(), schaan()]).unwrap();
        let idx = TextIndex::open(dir.path()).unwrap();
        let opts = SearchOptions {
            mode: SearchMode::Autocomplete,
            ..Default::default()
        };

        let hits = idx.search("Vad", &opts).unwrap();
        assert!(hits.iter().any(|h| h.name == "Vaduz"));
    }

    #[test]
    fn fuzzy_recovers_typo() {
        let dir = tempdir_for_test();
        build_index(dir.path(), vec![vaduz(), schaan()]).unwrap();
        let idx = TextIndex::open(dir.path()).unwrap();
        let opts = SearchOptions {
            fuzzy: 2,
            ..Default::default()
        };
        let hits = idx.search("vaaduz", &opts).unwrap();
        assert!(hits.iter().any(|h| h.name == "Vaduz"));
    }

    #[test]
    fn layer_filter_excludes_other_kinds() {
        let dir = tempdir_for_test();
        build_index(dir.path(), vec![vaduz(), liechtenstein_country()]).unwrap();
        let idx = TextIndex::open(dir.path()).unwrap();

        let only_country = SearchOptions {
            layers: vec!["country".into()],
            ..Default::default()
        };
        let hits = idx.search("liechtenstein", &only_country).unwrap();
        assert!(!hits.is_empty());
        assert!(hits.iter().all(|h| h.kind == "country"));

        let only_city = SearchOptions {
            layers: vec!["city".into()],
            ..Default::default()
        };
        let hits = idx.search("liechtenstein", &only_city).unwrap();
        assert!(hits.is_empty(), "country must not leak into city layer");
    }

    #[test]
    fn focus_reranks_nearer_first() {
        let dir = tempdir_for_test();
        build_index(dir.path(), vec![vaduz(), schaan()]).unwrap();
        let idx = TextIndex::open(dir.path()).unwrap();

        // Only Schaan starts with "S", so this checks that the focus path
        // runs end to end and fills in `distance_km`; the ordering itself is
        // covered by `geo_bias_prefers_nearer_hit_on_equal_score`.
        let opts = SearchOptions {
            mode: SearchMode::Autocomplete,
            focus: Some(Coord {
                lon: 9.5095,
                lat: 47.1650,
            }),
            focus_weight: 5.0,
            limit: 5,
            ..Default::default()
        };
        let hits = idx.search("S", &opts).unwrap();
        assert!(hits.iter().all(|h| h.distance_km.is_some()));
        assert!(
            hits.first().map(|h| h.name == "Schaan").unwrap_or(false),
            "expected Schaan first, got {:?}",
            hits
        );
    }

    #[test]
    fn geo_bias_prefers_nearer_hit_on_equal_score() {
        // Equal text scores, Vaduz listed first: only distance can reorder.
        let mut hits = vec![
            hit_at("Vaduz", 9.5209, 47.1410, 1.0),
            hit_at("Schaan", 9.5095, 47.1650, 1.0),
        ];
        let focus = Coord {
            lon: 9.5095,
            lat: 47.1650,
        };
        apply_geo_bias(&mut hits, focus, 5.0);

        assert_eq!(hits[0].name, "Schaan");
        assert_eq!(hits[0].distance_km, Some(0.0));
        assert_eq!(hits[1].name, "Vaduz");
        assert!(hits[1].distance_km.unwrap() > 1.0);
        assert!(hits[0].score > hits[1].score);
    }
}