// llm_wiki/ops/suggest.rs
1use std::collections::{HashMap, HashSet};
2
3use anyhow::Result;
4use serde::{Deserialize, Serialize};
5use tantivy::schema::Value;
6
7use crate::engine::EngineState;
8use crate::graph::{GraphFilter, get_cached_community_map, get_or_build_graph};
9use crate::search;
10use crate::slug::{Slug, WikiUri};
11
/// A page suggested as a related link for a given slug.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Suggestion {
    /// Slug of the suggested page.
    pub slug: String,
    /// `wiki://` URI of the suggested page.
    pub uri: String,
    /// Display title of the suggested page.
    pub title: String,
    /// Frontmatter type of the suggested page (serde serializes the raw
    /// identifier `r#type` as `"type"`).
    pub r#type: String,
    /// Relevance score, higher is more relevant; rounded to two decimals
    /// by `suggest`.
    pub score: f32,
    /// Human-readable reason for the suggestion (e.g. "shares tags: …",
    /// "2 hops via …", "similar content", "same knowledge cluster").
    pub reason: String,
    /// Frontmatter field the link would be placed under, as chosen by
    /// `suggest_field`; `"[[wikilink]]"` means a plain body link.
    pub field: String,
}
30
31/// Return a ranked list of related-page suggestions for a given slug or URI.
32pub fn suggest(
33    engine: &EngineState,
34    slug_or_uri: &str,
35    wiki_flag: Option<&str>,
36    limit: Option<usize>,
37) -> Result<Vec<Suggestion>> {
38    let (wiki_name, slug) = if slug_or_uri.starts_with("wiki://") {
39        let (entry, slug) = WikiUri::resolve(slug_or_uri, wiki_flag, &engine.config)?;
40        (entry.name, slug)
41    } else {
42        let wiki_name = engine.resolve_wiki_name(wiki_flag).to_string();
43        let slug = Slug::try_from(slug_or_uri)?;
44        (wiki_name, slug)
45    };
46
47    let space = engine.space(&wiki_name)?;
48    let resolved = space.resolved_config(&engine.config);
49    let limit = limit.unwrap_or(resolved.suggest.default_limit as usize);
50    let min_score = resolved.suggest.min_score;
51
52    let searcher = space.index_manager.searcher()?;
53    let is = &space.index_schema;
54
55    // Read the input page to get its tags, type, and existing links
56    let input_doc = find_doc_by_slug(&searcher, is, slug.as_str())?;
57    let input_tags: HashSet<String> = input_doc.tags.iter().cloned().collect();
58    let input_type = input_doc.page_type.clone();
59    let existing_links: HashSet<String> = input_doc.links.iter().cloned().collect();
60
61    let mut candidates: HashMap<String, CandidateScore> = HashMap::new();
62
63    // Strategy 1: Tag overlap
64    for tag in &input_tags {
65        let results = search::search(
66            tag,
67            &search::SearchOptions {
68                no_excerpt: true,
69                top_k: 20,
70                ..Default::default()
71            },
72            &searcher,
73            &wiki_name,
74            is,
75        )?;
76        for r in &results.results {
77            if r.slug == slug.as_str() || existing_links.contains(&r.slug) {
78                continue;
79            }
80            let doc = find_doc_by_slug(&searcher, is, &r.slug)?;
81            let shared: usize = doc.tags.iter().filter(|t| input_tags.contains(*t)).count();
82            if shared == 0 {
83                continue;
84            }
85            let total = doc.tags.len().max(1);
86            let score = shared as f32 / total as f32;
87            let shared_tags: Vec<&str> = doc
88                .tags
89                .iter()
90                .filter(|t| input_tags.contains(*t))
91                .map(|s| s.as_str())
92                .collect();
93            let reason = format!("shares tags: {}", shared_tags.join(", "));
94            candidates
95                .entry(r.slug.clone())
96                .and_modify(|c| {
97                    if score > c.score {
98                        c.score = score;
99                        c.reason = reason.clone();
100                    }
101                })
102                .or_insert(CandidateScore {
103                    slug: r.slug.clone(),
104                    title: r.title.clone(),
105                    page_type: doc.page_type.clone(),
106                    score,
107                    reason,
108                });
109        }
110    }
111
112    // Strategy 2: Graph neighborhood (2 hops)
113    let wiki_graph = get_or_build_graph(
114        is,
115        &space.type_registry,
116        &space.index_manager,
117        &space.graph_cache,
118        &searcher,
119        &GraphFilter::default(),
120    )?;
121    let slug_to_idx: HashMap<&str, _> = wiki_graph
122        .node_indices()
123        .map(|idx| (wiki_graph[idx].slug.as_str(), idx))
124        .collect();
125
126    if let Some(&root_idx) = slug_to_idx.get(slug.as_str()) {
127        // Collect 1-hop and 2-hop neighbors
128        let mut hop1: HashSet<petgraph::graph::NodeIndex> = HashSet::new();
129        for neighbor in wiki_graph.neighbors_undirected(root_idx) {
130            hop1.insert(neighbor);
131        }
132        for &n1 in &hop1 {
133            for n2 in wiki_graph.neighbors_undirected(n1) {
134                if n2 == root_idx || hop1.contains(&n2) {
135                    continue;
136                }
137                let node = &wiki_graph[n2];
138                if existing_links.contains(&node.slug) {
139                    continue;
140                }
141                let via = &wiki_graph[n1].slug;
142                let score = 0.5; // 2 hops
143                let reason = format!("2 hops via {via}");
144                candidates
145                    .entry(node.slug.clone())
146                    .and_modify(|c| {
147                        if score > c.score {
148                            c.score = score;
149                            c.reason = reason.clone();
150                        }
151                    })
152                    .or_insert(CandidateScore {
153                        slug: node.slug.clone(),
154                        title: node.title.clone(),
155                        page_type: node.r#type.clone(),
156                        score,
157                        reason,
158                    });
159            }
160        }
161    }
162
163    // Strategy 3: BM25 similarity (title + summary as query)
164    let query = format!("{} {}", input_doc.title, input_doc.summary);
165    if !query.trim().is_empty() {
166        let results = search::search(
167            &query,
168            &search::SearchOptions {
169                no_excerpt: true,
170                top_k: 10,
171                ..Default::default()
172            },
173            &searcher,
174            &wiki_name,
175            is,
176        )?;
177        let max_score = results
178            .results
179            .first()
180            .map(|r| r.score)
181            .unwrap_or(1.0)
182            .max(0.001);
183        for r in &results.results {
184            if r.slug == slug.as_str() || existing_links.contains(&r.slug) {
185                continue;
186            }
187            let score = r.score / max_score * 0.7; // normalize and weight
188            let reason = "similar content".to_string();
189            candidates
190                .entry(r.slug.clone())
191                .and_modify(|c| {
192                    if score > c.score {
193                        c.score = score;
194                        c.reason = reason.clone();
195                    }
196                })
197                .or_insert_with(|| {
198                    let doc = find_doc_by_slug(&searcher, is, &r.slug).unwrap_or_default();
199                    CandidateScore {
200                        slug: r.slug.clone(),
201                        title: r.title.clone(),
202                        page_type: doc.page_type,
203                        score,
204                        reason,
205                    }
206                });
207        }
208    }
209
210    // Strategy 4: Community peers (same Louvain community, not already linked)
211    if let Some(community_map) = get_cached_community_map(
212        &space.index_schema,
213        &space.type_registry,
214        &space.index_manager,
215        &space.graph_cache,
216        &space.community_cache,
217        &searcher,
218        resolved.graph.min_nodes_for_communities,
219    )? && let Some(&my_community) = community_map.get(slug.as_str())
220    {
221        let mut peers: Vec<&str> = community_map
222            .keys()
223            .filter(|s| {
224                let ns: &str = s;
225                community_map.get(ns).copied() == Some(my_community)
226                    && ns != slug.as_str()
227                    && !existing_links.contains(ns)
228                    && !candidates.contains_key(ns)
229            })
230            .map(|s| s.as_str())
231            .collect();
232        peers.sort_unstable();
233        for (added, node_slug) in peers.into_iter().enumerate() {
234            if added >= resolved.graph.community_suggestions_limit {
235                break;
236            }
237            let doc = find_doc_by_slug(&searcher, is, node_slug)?;
238            candidates.insert(
239                node_slug.to_string(),
240                CandidateScore {
241                    slug: node_slug.to_string(),
242                    title: doc.title.clone(),
243                    page_type: doc.page_type.clone(),
244                    score: 0.4,
245                    reason: "same knowledge cluster".to_string(),
246                },
247            );
248        }
249    }
250
251    // Rank, filter, cap
252    let mut ranked: Vec<CandidateScore> = candidates.into_values().collect();
253    ranked.sort_by(|a, b| {
254        b.score
255            .partial_cmp(&a.score)
256            .unwrap_or(std::cmp::Ordering::Equal)
257    });
258    ranked.retain(|c| c.score >= min_score);
259    ranked.truncate(limit);
260
261    // Build suggestions with edge field
262    let suggestions = ranked
263        .into_iter()
264        .map(|c| {
265            let field = suggest_field(&input_type, &c.page_type, &space.type_registry);
266            Suggestion {
267                uri: format!("wiki://{wiki_name}/{}", c.slug),
268                slug: c.slug,
269                title: c.title,
270                r#type: c.page_type,
271                score: (c.score * 100.0).round() / 100.0,
272                reason: c.reason,
273                field,
274            }
275        })
276        .collect();
277
278    Ok(suggestions)
279}
280
281// ── Helpers ───────────────────────────────────────────────────────────────────
282
/// Indexed fields of one page, as extracted by `find_doc_by_slug`.
/// The `Default` impl (all fields empty) doubles as the "page not found" value.
#[derive(Default)]
struct DocInfo {
    // Display title; "" when the field is absent.
    title: String,
    // Frontmatter summary; "" when the schema has no summary field.
    summary: String,
    // Frontmatter type; "" when absent.
    page_type: String,
    // Frontmatter tags.
    tags: Vec<String>,
    // Outgoing references gathered from the sources, concepts, body_links,
    // and document_refs index fields.
    links: Vec<String>,
}
291
/// Accumulator for one candidate page while the strategies in `suggest` run.
struct CandidateScore {
    slug: String,
    title: String,
    page_type: String,
    // Highest score seen so far across strategies (higher = more relevant).
    score: f32,
    // Human-readable reason belonging to the highest-scoring hit.
    reason: String,
}
299
300fn find_doc_by_slug(
301    searcher: &tantivy::Searcher,
302    is: &crate::index_schema::IndexSchema,
303    slug: &str,
304) -> Result<DocInfo> {
305    let f_slug = is.field("slug");
306    let f_title = is.field("title");
307    let f_type = is.field("type");
308
309    let query = tantivy::query::TermQuery::new(
310        tantivy::Term::from_field_text(f_slug, slug),
311        tantivy::schema::IndexRecordOption::Basic,
312    );
313    let results = searcher.search(
314        &query,
315        &tantivy::collector::TopDocs::with_limit(1).order_by_score(),
316    )?;
317
318    if let Some((_score, addr)) = results.first() {
319        let doc: tantivy::TantivyDocument = searcher.doc(*addr)?;
320        let title = doc
321            .get_first(f_title)
322            .and_then(|v| v.as_str())
323            .unwrap_or("")
324            .to_string();
325        let page_type = doc
326            .get_first(f_type)
327            .and_then(|v| v.as_str())
328            .unwrap_or("")
329            .to_string();
330        let summary = is
331            .try_field("summary")
332            .and_then(|f| doc.get_first(f))
333            .and_then(|v| v.as_str())
334            .unwrap_or("")
335            .to_string();
336        let tags: Vec<String> = is
337            .try_field("tags")
338            .map(|f| {
339                doc.get_all(f)
340                    .filter_map(|v| v.as_str().map(|s| s.to_string()))
341                    .collect()
342            })
343            .unwrap_or_default();
344
345        // Collect existing links from sources, concepts, body_links
346        let mut links = Vec::new();
347        for field_name in &["sources", "concepts", "body_links", "document_refs"] {
348            if let Some(f) = is.try_field(field_name) {
349                for val in doc.get_all(f) {
350                    if let Some(s) = val.as_str() {
351                        links.push(s.to_string());
352                    }
353                }
354            }
355        }
356
357        Ok(DocInfo {
358            title,
359            summary,
360            page_type,
361            tags,
362            links,
363        })
364    } else {
365        Ok(DocInfo::default())
366    }
367}
368
369fn suggest_field(
370    page_type: &str,
371    candidate_type: &str,
372    registry: &crate::type_registry::SpaceTypeRegistry,
373) -> String {
374    let source_types = [
375        "paper",
376        "article",
377        "documentation",
378        "clipping",
379        "transcript",
380        "note",
381        "data",
382        "book-chapter",
383        "thread",
384    ];
385    let is_source = |t: &str| source_types.contains(&t);
386
387    for edge in registry.edges(page_type) {
388        let targets = &edge.target_types;
389        if targets.iter().any(|t| t == candidate_type) {
390            return edge.field.clone();
391        }
392        if is_source(candidate_type) && targets.iter().any(|t| is_source(t)) {
393            return edge.field.clone();
394        }
395    }
396
397    "[[wikilink]]".to_string()
398}