1use cairn_place::{Coord, Place, PlaceKind};
12use serde::{Deserialize, Serialize};
13use std::path::Path;
14use tantivy::collector::TopDocs;
15use tantivy::query::{BooleanQuery, FuzzyTermQuery, Occur, Query, QueryParser, TermQuery};
16use tantivy::schema::{
17 Field, IndexRecordOption, Schema, TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED,
18 STRING, TEXT,
19};
20use tantivy::tokenizer::{LowerCaser, NgramTokenizer, RemoveLongFilter, TextAnalyzer};
21use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy, TantivyDocument, Term};
22use thiserror::Error;
23use tracing::debug;
24
/// Name under which the prefix (edge n-gram) analyzer is registered with tantivy
/// and referenced by the `name_prefix` field.
const PREFIX_TOKENIZER: &str = "cairn_prefix";
/// Shortest prefix n-gram produced by the prefix analyzer.
const PREFIX_MIN: usize = 1;
/// Longest prefix n-gram produced by the prefix analyzer.
const PREFIX_MAX: usize = 25;
/// Memory budget, in bytes, handed to the tantivy index writer (64 MiB).
const WRITER_HEAP: usize = 64 << 20;
/// How many times `limit` candidates are fetched when results are re-ranked
/// around a focus point.
const RERANK_MULTIPLIER: usize = 5;
/// Upper bound applied to the caller-supplied fuzzy edit distance.
const MAX_FUZZY_DISTANCE: u8 = 2;
31
32#[derive(Debug, Error)]
33pub enum TextError {
34 #[error("tantivy: {0}")]
35 Tantivy(#[from] tantivy::TantivyError),
36 #[error("query: {0}")]
37 Query(#[from] tantivy::query::QueryParserError),
38 #[error("io: {0}")]
39 Io(#[from] std::io::Error),
40}
41
42#[derive(Clone, Debug, Serialize)]
43pub struct Hit {
44 pub place_id: u64,
45 pub name: String,
46 pub kind: String,
47 pub level: u64,
48 pub lon: f64,
49 pub lat: f64,
50 pub score: f32,
51 pub admin_path: Vec<u64>,
52 #[serde(skip_serializing_if = "Option::is_none")]
53 pub distance_km: Option<f64>,
54}
55
56#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
57pub enum SearchMode {
58 #[default]
59 Search,
60 Autocomplete,
61}
62
63#[derive(Clone, Debug)]
64pub struct SearchOptions {
65 pub mode: SearchMode,
66 pub limit: usize,
67 pub fuzzy: u8,
70 pub layers: Vec<String>,
72 pub focus: Option<Coord>,
74 pub focus_weight: f64,
77}
78
79impl Default for SearchOptions {
80 fn default() -> Self {
81 Self {
82 mode: SearchMode::Search,
83 limit: 10,
84 fuzzy: 0,
85 layers: Vec::new(),
86 focus: None,
87 focus_weight: 0.5,
88 }
89 }
90}
91
92struct TextSchema {
93 schema: Schema,
94 name: Field,
95 name_prefix: Field,
96 place_id: Field,
97 level: Field,
98 kind: Field,
99 lon: Field,
100 lat: Field,
101 admin_path: Field,
102}
103
104impl TextSchema {
105 fn build() -> Self {
106 let mut sb = Schema::builder();
107 let prefix_indexing = TextFieldIndexing::default()
108 .set_tokenizer(PREFIX_TOKENIZER)
109 .set_index_option(IndexRecordOption::WithFreqsAndPositions);
110 let prefix_options = TextOptions::default().set_indexing_options(prefix_indexing);
111
112 let name = sb.add_text_field("name", TEXT | STORED);
113 let name_prefix = sb.add_text_field("name_prefix", prefix_options);
114 let place_id = sb.add_u64_field("place_id", FAST | STORED | INDEXED);
115 let level = sb.add_u64_field("level", FAST | STORED | INDEXED);
116 let kind = sb.add_text_field("kind", STRING | STORED);
117 let lon = sb.add_f64_field("lon", STORED);
118 let lat = sb.add_f64_field("lat", STORED);
119 let admin_path = sb.add_u64_field("admin_path", STORED);
120 let schema = sb.build();
121 Self {
122 schema,
123 name,
124 name_prefix,
125 place_id,
126 level,
127 kind,
128 lon,
129 lat,
130 admin_path,
131 }
132 }
133}
134
135fn register_prefix_tokenizer(index: &Index) -> Result<(), TextError> {
136 let tokenizer = TextAnalyzer::builder(
137 NgramTokenizer::prefix_only(PREFIX_MIN, PREFIX_MAX)
138 .map_err(|e| tantivy::TantivyError::SystemError(format!("{e:?}")))?,
139 )
140 .filter(LowerCaser)
141 .filter(RemoveLongFilter::limit(64))
142 .build();
143 index.tokenizers().register(PREFIX_TOKENIZER, tokenizer);
144 Ok(())
145}
146
147pub fn kind_str(kind: PlaceKind) -> &'static str {
148 match kind {
149 PlaceKind::Country => "country",
150 PlaceKind::Region => "region",
151 PlaceKind::County => "county",
152 PlaceKind::City => "city",
153 PlaceKind::District => "district",
154 PlaceKind::Neighborhood => "neighborhood",
155 PlaceKind::Street => "street",
156 PlaceKind::Address => "address",
157 PlaceKind::Poi => "poi",
158 PlaceKind::Postcode => "postcode",
159 }
160}
161
162pub fn build_index<I>(dir: &Path, places: I) -> Result<usize, TextError>
164where
165 I: IntoIterator<Item = Place>,
166{
167 if dir.exists() {
168 std::fs::remove_dir_all(dir)?;
169 }
170 std::fs::create_dir_all(dir)?;
171
172 let schema = TextSchema::build();
173 let index = Index::create_in_dir(dir, schema.schema.clone())?;
174 register_prefix_tokenizer(&index)?;
175 let mut writer: IndexWriter = index.writer(WRITER_HEAP)?;
176
177 let mut doc_count = 0usize;
178 for place in places {
179 if place.names.is_empty() {
180 continue;
181 }
182 let mut doc = TantivyDocument::default();
183 for n in &place.names {
184 doc.add_text(schema.name, &n.value);
185 doc.add_text(schema.name_prefix, &n.value);
186 }
187 doc.add_u64(schema.place_id, place.id.0);
188 doc.add_u64(schema.level, place.id.level() as u64);
189 doc.add_text(schema.kind, kind_str(place.kind));
190 doc.add_f64(schema.lon, place.centroid.lon);
191 doc.add_f64(schema.lat, place.centroid.lat);
192 for ancestor in &place.admin_path {
193 doc.add_u64(schema.admin_path, ancestor.0);
194 }
195 writer.add_document(doc)?;
196 doc_count += 1;
197 }
198 writer.commit()?;
199 debug!(docs = doc_count, "tantivy index committed");
200 Ok(doc_count)
201}
202
203pub struct TextIndex {
204 index: Index,
205 reader: IndexReader,
206 schema: TextSchema,
207}
208
209impl TextIndex {
210 pub fn open(dir: &Path) -> Result<Self, TextError> {
211 let index = Index::open_in_dir(dir)?;
212 register_prefix_tokenizer(&index)?;
213 let reader = index
214 .reader_builder()
215 .reload_policy(ReloadPolicy::Manual)
216 .try_into()?;
217 let schema = TextSchema::build();
218 Ok(Self {
219 index,
220 reader,
221 schema,
222 })
223 }
224
225 pub fn search(&self, query: &str, opts: &SearchOptions) -> Result<Vec<Hit>, TextError> {
226 let trimmed = query.trim();
227 if trimmed.is_empty() {
228 return Ok(Vec::new());
229 }
230
231 let text_q = self.build_text_query(trimmed, opts)?;
232 let combined = self.apply_layer_filter(text_q, &opts.layers);
233
234 let candidate_limit = if opts.focus.is_some() {
235 opts.limit
236 .saturating_mul(RERANK_MULTIPLIER)
237 .clamp(opts.limit, 200)
238 } else {
239 opts.limit
240 };
241
242 let searcher = self.reader.searcher();
243 let raw = searcher.search(&combined, &TopDocs::with_limit(candidate_limit))?;
244
245 let mut hits: Vec<Hit> = Vec::with_capacity(raw.len());
246 for (score, addr) in raw {
247 let doc: TantivyDocument = searcher.doc(addr)?;
248 hits.push(self.hit_from_doc(score, &doc));
249 }
250
251 if let Some(focus) = opts.focus {
252 apply_geo_bias(&mut hits, focus, opts.focus_weight);
253 }
254 hits.truncate(opts.limit);
255 Ok(hits)
256 }
257
258 fn build_text_query(
259 &self,
260 query: &str,
261 opts: &SearchOptions,
262 ) -> Result<Box<dyn Query>, TextError> {
263 let field = match opts.mode {
264 SearchMode::Search => self.schema.name,
265 SearchMode::Autocomplete => self.schema.name_prefix,
266 };
267
268 let fuzzy = opts.fuzzy.min(MAX_FUZZY_DISTANCE);
269 if fuzzy == 0 || matches!(opts.mode, SearchMode::Autocomplete) {
270 let parser = QueryParser::for_index(&self.index, vec![field]);
274 return Ok(parser.parse_query(query)?);
275 }
276
277 let lowered = query.to_lowercase();
279 let tokens: Vec<&str> = lowered.split_whitespace().collect();
280 if tokens.is_empty() {
281 let parser = QueryParser::for_index(&self.index, vec![field]);
282 return Ok(parser.parse_query(query)?);
283 }
284 let mut clauses: Vec<(Occur, Box<dyn Query>)> = Vec::with_capacity(tokens.len());
285 for tok in tokens {
286 let term = Term::from_field_text(field, tok);
287 let q = FuzzyTermQuery::new(term, fuzzy, true);
288 clauses.push((Occur::Should, Box::new(q) as Box<dyn Query>));
289 }
290 Ok(Box::new(BooleanQuery::new(clauses)))
291 }
292
293 fn apply_layer_filter(&self, text_q: Box<dyn Query>, layers: &[String]) -> Box<dyn Query> {
294 if layers.is_empty() {
295 return text_q;
296 }
297 let mut layer_clauses: Vec<(Occur, Box<dyn Query>)> = Vec::with_capacity(layers.len());
298 for layer in layers {
299 let term = Term::from_field_text(self.schema.kind, layer);
300 let q = TermQuery::new(term, IndexRecordOption::Basic);
301 layer_clauses.push((Occur::Should, Box::new(q) as Box<dyn Query>));
302 }
303 let layer_q: Box<dyn Query> = Box::new(BooleanQuery::new(layer_clauses));
304 Box::new(BooleanQuery::new(vec![
305 (Occur::Must, text_q),
306 (Occur::Must, layer_q),
307 ]))
308 }
309
310 fn hit_from_doc(&self, score: f32, doc: &TantivyDocument) -> Hit {
311 let admin_path: Vec<u64> = doc
312 .get_all(self.schema.admin_path)
313 .filter_map(|v| v.as_u64())
314 .collect();
315 Hit {
316 place_id: doc
317 .get_first(self.schema.place_id)
318 .and_then(|v| v.as_u64())
319 .unwrap_or(0),
320 name: doc
321 .get_first(self.schema.name)
322 .and_then(|v| v.as_str())
323 .unwrap_or("")
324 .to_string(),
325 kind: doc
326 .get_first(self.schema.kind)
327 .and_then(|v| v.as_str())
328 .unwrap_or("")
329 .to_string(),
330 level: doc
331 .get_first(self.schema.level)
332 .and_then(|v| v.as_u64())
333 .unwrap_or(0),
334 lon: doc
335 .get_first(self.schema.lon)
336 .and_then(|v| v.as_f64())
337 .unwrap_or(0.0),
338 lat: doc
339 .get_first(self.schema.lat)
340 .and_then(|v| v.as_f64())
341 .unwrap_or(0.0),
342 score,
343 admin_path,
344 distance_km: None,
345 }
346 }
347}
348
349fn apply_geo_bias(hits: &mut [Hit], focus: Coord, weight: f64) {
350 let weight = weight.max(0.0);
351 for h in hits.iter_mut() {
352 let km = haversine_km(focus.lat, focus.lon, h.lat, h.lon);
353 h.distance_km = Some(km);
354 let blended = (h.score as f64) / (1.0 + weight * km);
355 h.score = blended as f32;
356 }
357 hits.sort_by(|a, b| {
358 b.score
359 .partial_cmp(&a.score)
360 .unwrap_or(std::cmp::Ordering::Equal)
361 });
362}
363
/// Great-circle distance in kilometres between two points given in decimal
/// degrees, computed with the haversine formula on a sphere of mean Earth
/// radius.
fn haversine_km(lat1: f64, lon1: f64, lat2: f64, lon2: f64) -> f64 {
    const EARTH_KM: f64 = 6371.0088;

    let phi1 = lat1.to_radians();
    let phi2 = lat2.to_radians();
    let dphi = (lat2 - lat1).to_radians();
    let dlam = (lon2 - lon1).to_radians();

    let a = (dphi / 2.0).sin().powi(2) + phi1.cos() * phi2.cos() * (dlam / 2.0).sin().powi(2);

    // Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    // points, which would make `asin` return NaN; clamp it back first.
    2.0 * EARTH_KM * a.clamp(0.0, 1.0).sqrt().asin()
}
374
#[cfg(test)]
mod tests {
    use super::*;
    use cairn_place::{Coord, LocalizedName, PlaceId};

    /// Scratch directory that is deleted again when the guard is dropped, so
    /// test runs do not accumulate index directories in the temp folder.
    struct ScratchDir(std::path::PathBuf);

    impl ScratchDir {
        /// Path of the scratch directory.
        fn path(&self) -> &std::path::Path {
            &self.0
        }
    }

    impl Drop for ScratchDir {
        fn drop(&mut self) {
            // Best effort: a failed cleanup must not mask the test outcome.
            let _ = std::fs::remove_dir_all(&self.0);
        }
    }

    /// City fixture with two names, both spelled "Vaduz".
    fn vaduz() -> Place {
        Place {
            id: PlaceId::new(1, 49509, 1).unwrap(),
            kind: PlaceKind::City,
            names: vec![
                LocalizedName {
                    lang: "default".into(),
                    value: "Vaduz".into(),
                },
                LocalizedName {
                    lang: "de".into(),
                    value: "Vaduz".into(),
                },
            ],
            centroid: Coord {
                lon: 9.5209,
                lat: 47.1410,
            },
            admin_path: vec![PlaceId::new(0, 49509, 1).unwrap()],
            tags: vec![],
        }
    }

    /// City fixture a few kilometres north of Vaduz.
    fn schaan() -> Place {
        Place {
            id: PlaceId::new(1, 49509, 2).unwrap(),
            kind: PlaceKind::City,
            names: vec![LocalizedName {
                lang: "default".into(),
                value: "Schaan".into(),
            }],
            centroid: Coord {
                lon: 9.5095,
                lat: 47.1650,
            },
            admin_path: vec![PlaceId::new(0, 49509, 1).unwrap()],
            tags: vec![],
        }
    }

    /// Country fixture with an empty admin path.
    fn liechtenstein_country() -> Place {
        Place {
            id: PlaceId::new(0, 49509, 1).unwrap(),
            kind: PlaceKind::Country,
            names: vec![LocalizedName {
                lang: "default".into(),
                value: "Liechtenstein".into(),
            }],
            centroid: Coord {
                lon: 9.5594,
                lat: 47.1114,
            },
            admin_path: vec![],
            tags: vec![],
        }
    }

    /// Creates a uniquely named scratch directory (pid + timestamp + counter)
    /// that is removed when the returned guard goes out of scope.
    fn tempdir_for_test() -> ScratchDir {
        use std::sync::atomic::{AtomicUsize, Ordering};
        static COUNTER: AtomicUsize = AtomicUsize::new(0);
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        let n = COUNTER.fetch_add(1, Ordering::Relaxed);
        let d = std::env::temp_dir().join(format!(
            "cairn-text-test-{}-{}-{}",
            std::process::id(),
            nanos,
            n
        ));
        std::fs::create_dir_all(&d).unwrap();
        ScratchDir(d)
    }

    #[test]
    fn build_and_search() {
        let dir = tempdir_for_test();
        let docs = build_index(dir.path(), vec![vaduz(), schaan()]).unwrap();
        assert_eq!(docs, 2);

        let idx = TextIndex::open(dir.path()).unwrap();
        let hits = idx.search("vaduz", &SearchOptions::default()).unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].name, "Vaduz");
        assert_eq!(hits[0].admin_path.len(), 1);
    }

    #[test]
    fn autocomplete_prefix() {
        let dir = tempdir_for_test();
        build_index(dir.path(), vec![vaduz(), schaan()]).unwrap();
        let idx = TextIndex::open(dir.path()).unwrap();
        let opts = SearchOptions {
            mode: SearchMode::Autocomplete,
            ..Default::default()
        };

        let hits = idx.search("Vad", &opts).unwrap();
        assert!(hits.iter().any(|h| h.name == "Vaduz"));
    }

    #[test]
    fn fuzzy_recovers_typo() {
        let dir = tempdir_for_test();
        build_index(dir.path(), vec![vaduz(), schaan()]).unwrap();
        let idx = TextIndex::open(dir.path()).unwrap();
        let opts = SearchOptions {
            fuzzy: 2,
            ..Default::default()
        };
        let hits = idx.search("vaaduz", &opts).unwrap();
        assert!(hits.iter().any(|h| h.name == "Vaduz"));
    }

    #[test]
    fn layer_filter_excludes_other_kinds() {
        let dir = tempdir_for_test();
        build_index(dir.path(), vec![vaduz(), liechtenstein_country()]).unwrap();
        let idx = TextIndex::open(dir.path()).unwrap();

        let only_country = SearchOptions {
            layers: vec!["country".into()],
            ..Default::default()
        };
        let hits = idx.search("liechtenstein", &only_country).unwrap();
        assert!(!hits.is_empty());
        assert!(hits.iter().all(|h| h.kind == "country"));

        let only_city = SearchOptions {
            layers: vec!["city".into()],
            ..Default::default()
        };
        let hits = idx.search("liechtenstein", &only_city).unwrap();
        assert!(hits.is_empty(), "country must not leak into city layer");
    }

    #[test]
    fn focus_reranks_nearer_first() {
        let dir = tempdir_for_test();
        build_index(dir.path(), vec![vaduz(), schaan()]).unwrap();
        let idx = TextIndex::open(dir.path()).unwrap();

        // Focus sits exactly on Schaan's centroid.
        let opts = SearchOptions {
            mode: SearchMode::Autocomplete,
            focus: Some(Coord {
                lon: 9.5095,
                lat: 47.1650,
            }),
            focus_weight: 5.0,
            limit: 5,
            ..Default::default()
        };
        let hits = idx.search("S", &opts).unwrap();
        assert!(hits.iter().all(|h| h.distance_km.is_some()));
        assert!(
            hits.first().map(|h| h.name == "Schaan").unwrap_or(false),
            "expected Schaan first, got {:?}",
            hits
        );
    }
}