Skip to main content

rumdl_lib/lsp/
index_worker.rs

1//! Background worker for workspace index management
2//!
3//! This module provides a background task that manages the workspace index
4//! for cross-file analysis. It handles debouncing rapid file updates and
5//! efficiently updates the index without blocking the main LSP server.
6
7use std::collections::HashMap;
8use std::path::{Path, PathBuf};
9use std::sync::Arc;
10use std::time::{Duration, Instant};
11
12use tokio::sync::{RwLock, mpsc};
13use tower_lsp::Client;
14use tower_lsp::lsp_types::*;
15
16use crate::config::MarkdownFlavor;
17use crate::lint_context::LintContext;
18use crate::lsp::types::{IndexState, IndexUpdate};
19use crate::utils::anchor_styles::AnchorStyle;
20use crate::workspace_index::{FileIndex, HeadingIndex, WorkspaceIndex, extract_cross_file_links};
21
/// Supported markdown file extensions (lowercase, without the leading dot)
const MARKDOWN_EXTENSIONS: &[&str] = &["md", "markdown", "mdx", "mkd", "mkdn", "mdown", "mdwn", "qmd", "rmd"];

/// Check if a file extension is a markdown extension
///
/// The comparison is ASCII case-insensitive (`MD`, `Md` and `md` all match)
/// and does not allocate. Extensions that are not valid UTF-8 never match.
/// Only ASCII case folding is applied, so look-alike non-ASCII characters
/// (for example the Kelvin sign, which Unicode lowercases to `k`) do not
/// count as their ASCII counterparts.
#[inline]
fn is_markdown_extension(ext: &std::ffi::OsStr) -> bool {
    ext.to_str()
        .is_some_and(|s| MARKDOWN_EXTENSIONS.iter().any(|known| known.eq_ignore_ascii_case(s)))
}
31
32/// Background worker for managing the workspace index
33///
34/// Receives updates via a channel and maintains the workspace index
35/// with debouncing to avoid excessive re-indexing during rapid edits.
36pub struct IndexWorker {
37    /// Receiver for index update messages
38    rx: mpsc::Receiver<IndexUpdate>,
39    /// The workspace index being maintained
40    workspace_index: Arc<RwLock<WorkspaceIndex>>,
41    /// Current state of the index (building/ready/error)
42    index_state: Arc<RwLock<IndexState>>,
43    /// LSP client for progress reporting
44    client: Client,
45    /// Workspace root folders
46    workspace_roots: Arc<RwLock<Vec<PathBuf>>>,
47    /// Debouncing: path -> (content, last_update_time)
48    pending: HashMap<PathBuf, (String, Instant)>,
49    /// Debounce duration
50    debounce_duration: Duration,
51    /// Sender to request re-linting of files (back to server)
52    relint_tx: mpsc::Sender<PathBuf>,
53}
54
55impl IndexWorker {
56    /// Create a new index worker
57    pub fn new(
58        rx: mpsc::Receiver<IndexUpdate>,
59        workspace_index: Arc<RwLock<WorkspaceIndex>>,
60        index_state: Arc<RwLock<IndexState>>,
61        client: Client,
62        workspace_roots: Arc<RwLock<Vec<PathBuf>>>,
63        relint_tx: mpsc::Sender<PathBuf>,
64    ) -> Self {
65        Self {
66            rx,
67            workspace_index,
68            index_state,
69            client,
70            workspace_roots,
71            pending: HashMap::new(),
72            debounce_duration: Duration::from_millis(100),
73            relint_tx,
74        }
75    }
76
77    /// Run the index worker event loop
78    pub async fn run(mut self) {
79        let mut debounce_interval = tokio::time::interval(Duration::from_millis(50));
80
81        loop {
82            tokio::select! {
83                // Receive updates from main server
84                msg = self.rx.recv() => {
85                    match msg {
86                        Some(IndexUpdate::FileChanged { path, content }) => {
87                            self.pending.insert(path, (content, Instant::now()));
88                        }
89                        Some(IndexUpdate::FileDeleted { path }) => {
90                            self.handle_file_deleted(&path).await;
91                        }
92                        Some(IndexUpdate::FullRescan) => {
93                            self.full_rescan().await;
94                        }
95                        Some(IndexUpdate::Shutdown) | None => {
96                            log::info!("Index worker shutting down");
97                            break;
98                        }
99                    }
100                }
101
102                // Process debounced updates periodically
103                _ = debounce_interval.tick() => {
104                    self.process_pending_updates().await;
105                }
106            }
107        }
108    }
109
110    /// Process pending updates that have been debounced long enough
111    async fn process_pending_updates(&mut self) {
112        let now = Instant::now();
113        let ready: Vec<_> = self
114            .pending
115            .iter()
116            .filter(|(_, (_, time))| now.duration_since(*time) >= self.debounce_duration)
117            .map(|(path, _)| path.clone())
118            .collect();
119
120        for path in ready {
121            if let Some((content, _)) = self.pending.remove(&path) {
122                self.update_single_file(&path, &content).await;
123            }
124        }
125    }
126
127    /// Update a single file in the index
128    async fn update_single_file(&self, path: &Path, content: &str) {
129        // Build FileIndex using LintContext
130        let Ok(file_index) = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| Self::build_file_index(content)))
131        else {
132            log::error!("Panic while indexing {}: skipping", path.display());
133            return;
134        };
135
136        // Get old dependents before updating
137        let old_dependents = {
138            let index = self.workspace_index.read().await;
139            index.get_dependents(path)
140        };
141
142        // Update the index
143        {
144            let mut index = self.workspace_index.write().await;
145            index.update_file(path, file_index);
146        }
147
148        // Get new dependents after updating
149        let new_dependents = {
150            let index = self.workspace_index.read().await;
151            index.get_dependents(path)
152        };
153
154        // Request re-lint of affected files (union of old and new dependents)
155        let mut affected: std::collections::HashSet<PathBuf> = old_dependents.into_iter().collect();
156        affected.extend(new_dependents);
157
158        for dep_path in affected {
159            if self.relint_tx.send(dep_path.clone()).await.is_err() {
160                log::warn!("Failed to send re-lint request for {}", dep_path.display());
161            }
162        }
163    }
164
165    /// Build a FileIndex from content
166    pub(super) fn build_file_index(content: &str) -> FileIndex {
167        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
168        let mut file_index = FileIndex::new();
169
170        // Extract headings from the content
171        for (line_num, line_info) in ctx.lines.iter().enumerate() {
172            if let Some(heading) = &line_info.heading {
173                let auto_anchor = AnchorStyle::GitHub.generate_fragment(&heading.text);
174                let is_setext = matches!(
175                    heading.style,
176                    crate::lint_context::types::HeadingStyle::Setext1
177                        | crate::lint_context::types::HeadingStyle::Setext2
178                );
179
180                file_index.add_heading(HeadingIndex {
181                    text: heading.text.clone(),
182                    auto_anchor,
183                    custom_anchor: heading.custom_id.clone(),
184                    line: line_num + 1, // 1-indexed
185                    is_setext,
186                });
187            }
188        }
189
190        // Extract cross-file links using the shared utility
191        // This ensures consistent position tracking with MD057
192        let links = extract_cross_file_links(&ctx);
193        for link in links.relative {
194            file_index.add_cross_file_link(link);
195        }
196        for link in links.root_relative {
197            file_index.add_root_relative_link(link);
198        }
199
200        file_index
201    }
202
203    /// Handle a file deletion
204    async fn handle_file_deleted(&self, path: &Path) {
205        // Remove pending update for this file
206        // (self.pending is not accessible here directly, but FileDeleted is handled immediately)
207
208        // Get dependents before removing
209        let dependents = {
210            let index = self.workspace_index.read().await;
211            index.get_dependents(path)
212        };
213
214        // Remove from index
215        {
216            let mut index = self.workspace_index.write().await;
217            index.remove_file(path);
218        }
219
220        // Request re-lint of dependent files (they now have broken links)
221        for dep_path in dependents {
222            if self.relint_tx.send(dep_path.clone()).await.is_err() {
223                log::warn!("Failed to send re-lint request for {}", dep_path.display());
224            }
225        }
226    }
227
228    /// Perform a full rescan of the workspace
229    async fn full_rescan(&mut self) {
230        // Clear pending updates
231        self.pending.clear();
232
233        // Find all markdown files in workspace roots
234        let roots = self.workspace_roots.read().await.clone();
235        let files = scan_markdown_files(&roots).await;
236        let total = files.len();
237
238        if total == 0 {
239            *self.index_state.write().await = IndexState::Ready;
240            return;
241        }
242
243        // Set initial building state
244        *self.index_state.write().await = IndexState::Building {
245            progress: 0.0,
246            files_indexed: 0,
247            total_files: total,
248        };
249
250        // Report progress start
251        self.report_progress_begin(total).await;
252
253        // Index each file
254        for (i, path) in files.iter().enumerate() {
255            if let Ok(content) = tokio::fs::read_to_string(path).await {
256                let file_index = Self::build_file_index(&content);
257
258                let mut index = self.workspace_index.write().await;
259                index.update_file(path, file_index);
260            }
261
262            // Report progress every 10 files or at end
263            if i % 10 == 0 || i == total - 1 {
264                let progress = ((i + 1) as f32 / total as f32) * 100.0;
265                *self.index_state.write().await = IndexState::Building {
266                    progress,
267                    files_indexed: i + 1,
268                    total_files: total,
269                };
270                self.report_progress_update(i + 1, total).await;
271            }
272        }
273
274        // Mark as ready
275        *self.index_state.write().await = IndexState::Ready;
276        self.report_progress_done().await;
277
278        log::info!("Workspace indexing complete: {total} files indexed");
279    }
280
281    /// Report progress begin via LSP
282    async fn report_progress_begin(&self, total: usize) {
283        let token = NumberOrString::String("rumdl-index".to_string());
284
285        // Request progress token creation
286        if self
287            .client
288            .send_request::<request::WorkDoneProgressCreate>(WorkDoneProgressCreateParams { token: token.clone() })
289            .await
290            .is_err()
291        {
292            log::debug!("Client does not support work done progress");
293            return;
294        }
295
296        // Send begin notification
297        self.client
298            .send_notification::<notification::Progress>(ProgressParams {
299                token,
300                value: ProgressParamsValue::WorkDone(WorkDoneProgress::Begin(WorkDoneProgressBegin {
301                    title: "Indexing workspace".to_string(),
302                    cancellable: Some(false),
303                    message: Some(format!("Scanning {total} markdown files...")),
304                    percentage: Some(0),
305                })),
306            })
307            .await;
308    }
309
310    /// Report progress update via LSP
311    async fn report_progress_update(&self, indexed: usize, total: usize) {
312        let token = NumberOrString::String("rumdl-index".to_string());
313        let percentage = ((indexed as f32 / total as f32) * 100.0) as u32;
314
315        self.client
316            .send_notification::<notification::Progress>(ProgressParams {
317                token,
318                value: ProgressParamsValue::WorkDone(WorkDoneProgress::Report(WorkDoneProgressReport {
319                    cancellable: Some(false),
320                    message: Some(format!("Indexed {indexed}/{total} files")),
321                    percentage: Some(percentage),
322                })),
323            })
324            .await;
325    }
326
327    /// Report progress done via LSP
328    async fn report_progress_done(&self) {
329        let token = NumberOrString::String("rumdl-index".to_string());
330
331        self.client
332            .send_notification::<notification::Progress>(ProgressParams {
333                token,
334                value: ProgressParamsValue::WorkDone(WorkDoneProgress::End(WorkDoneProgressEnd {
335                    message: Some("Indexing complete".to_string()),
336                })),
337            })
338            .await;
339    }
340}
341
342/// Scan workspace roots for markdown files
343///
344/// Honors `.gitignore`, `.ignore`, git global excludes, and skips hidden
345/// entries so generated/ignored files don't pollute the index. Runs the
346/// (synchronous) filesystem walk on a blocking thread.
347async fn scan_markdown_files(roots: &[PathBuf]) -> Vec<PathBuf> {
348    let roots = roots.to_vec();
349    tokio::task::spawn_blocking(move || collect_markdown_files(&roots))
350        .await
351        .unwrap_or_else(|e| {
352            log::warn!("Workspace scan task failed: {e}");
353            Vec::new()
354        })
355}
356
357/// Build a [`WalkBuilder`](ignore::WalkBuilder) rooted at `root` with the ignore
358/// options used for workspace indexing.
359///
360/// Centralizing the configuration keeps the full scan ([`collect_markdown_files`])
361/// and the single-path check ([`path_is_ignored_for_index`]) in exact agreement
362/// about which files belong in the index: same hidden/gitignore/global/exclude
363/// handling, same `node_modules`/`target` exclusion.
364fn index_walk_builder(root: &Path) -> ignore::WalkBuilder {
365    let mut builder = ignore::WalkBuilder::new(root);
366    builder
367        .hidden(true)
368        .git_ignore(true)
369        .git_global(true)
370        .git_exclude(true)
371        .parents(true)
372        // Honor .gitignore / .ignore even outside a git repository.
373        .require_git(false)
374        // Always skip dependency/build output directories even when not gitignored.
375        .filter_entry(|entry| {
376            let name = entry.file_name().to_str().unwrap_or("");
377            name != "node_modules" && name != "target"
378        });
379    builder
380}
381
382/// Collect markdown files from the given roots, respecting ignore files.
383fn collect_markdown_files(roots: &[PathBuf]) -> Vec<PathBuf> {
384    let mut files = Vec::new();
385
386    for root in roots {
387        for result in index_walk_builder(root).build() {
388            match result {
389                Ok(entry) => {
390                    let path = entry.path();
391                    if entry.file_type().is_some_and(|t| t.is_file())
392                        && let Some(ext) = path.extension()
393                        && is_markdown_extension(ext)
394                    {
395                        files.push(path.to_path_buf());
396                    }
397                }
398                Err(e) => log::warn!("Error scanning {}: {}", root.display(), e),
399            }
400        }
401    }
402
403    files
404}
405
406/// Whether `path` should be excluded from the workspace index based on the same
407/// ignore rules used by the full scan ([`collect_markdown_files`]).
408///
409/// Used to keep filesystem-watch events (`did_change_watched_files`) from
410/// reintroducing generated/ignored files that the full scan skips. Files the
411/// user explicitly opens or edits bypass this check, since the active document
412/// must stay indexed for in-file anchor completion.
413///
414/// Determines ignore status by walking from the containing workspace root down
415/// the chain of directories leading to `path`, using the shared
416/// [`index_walk_builder`] configuration. Descent is pruned to that single chain,
417/// so the walk applies the same ignore rules the full scan would (including an
418/// ignored ancestor directory or a hidden entry) without traversing the tree. If
419/// the walk does not yield `path`, the file must not enter the index.
420///
421/// `node_modules`/`target` are also checked directly so the predicate works even
422/// for paths that do not exist on disk. The file must exist for the walk to
423/// observe it, which holds for the create/change watch events that use this.
424pub(super) fn path_is_ignored_for_index(roots: &[PathBuf], path: &Path) -> bool {
425    // Use the deepest workspace root that contains the file so nested roots
426    // resolve their own ignore files. Paths outside every root aren't filtered.
427    let Some(root) = roots
428        .iter()
429        .filter(|r| path.starts_with(r))
430        .max_by_key(|r| r.components().count())
431    else {
432        return false;
433    };
434
435    // Check `node_modules`/`target` only below the workspace root, so a workspace
436    // located under a directory of that name is not wholly excluded.
437    if let Ok(rel) = path.strip_prefix(root)
438        && rel
439            .components()
440            .any(|c| matches!(c, std::path::Component::Normal(name) if name == "node_modules" || name == "target"))
441    {
442        return true;
443    }
444
445    let target = path.to_path_buf();
446    let mut builder = index_walk_builder(root);
447    // Only descend into directories that lead to `target`; everything else is
448    // pruned. `target.starts_with(entry)` holds for `target` and its ancestors.
449    builder.filter_entry(move |entry| target.starts_with(entry.path()));
450    for entry in builder.build().flatten() {
451        if entry.path() == path {
452            return false;
453        }
454    }
455    true
456}
457
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Final path component of every entry in `paths`, sorted.
    fn sorted_file_names(paths: &[PathBuf]) -> Vec<String> {
        let mut names: Vec<String> = paths
            .iter()
            .map(|p| p.file_name().unwrap().to_str().unwrap().to_string())
            .collect();
        names.sort();
        names
    }

    /// Headings (with and without a custom id) and a cross-file link with a
    /// fragment are all captured.
    #[test]
    fn test_build_file_index() {
        let content = r#"
# Main Heading

Some text.

## Sub Heading {#sub}

More text with [link](./other.md#section).
"#;

        let index = IndexWorker::build_file_index(content);

        assert_eq!(index.headings.len(), 2);
        let main = &index.headings[0];
        let sub = &index.headings[1];

        assert_eq!(main.text, "Main Heading");
        assert!(main.custom_anchor.is_none());

        // HeadingInfo.text has the custom ID stripped; the custom_id is stored separately
        assert_eq!(sub.text, "Sub Heading");
        assert_eq!(sub.custom_anchor.as_deref(), Some("sub"));

        assert_eq!(index.cross_file_links.len(), 1);
        let link = &index.cross_file_links[0];
        assert_eq!(link.target_path, "./other.md");
        assert_eq!(link.fragment, "section");
    }

    /// Column positions point at the link target (fix for issue #234).
    #[test]
    fn test_build_file_index_column_positions() {
        let index = IndexWorker::build_file_index("See [link](./file.md) here.\n");

        assert_eq!(index.cross_file_links.len(), 1);
        let link = &index.cross_file_links[0];
        assert_eq!(link.target_path, "./file.md");
        assert_eq!(link.line, 1);
        // "See [link](" = 11 chars, so column 12 is where "./file.md" starts
        assert_eq!(link.column, 12);
    }

    /// Two links on one line each get their own column.
    #[test]
    fn test_build_file_index_multiple_links() {
        let index = IndexWorker::build_file_index("First [a](./a.md) and [b](./b.md#section) links.\n");

        assert_eq!(index.cross_file_links.len(), 2);
        let first = &index.cross_file_links[0];
        let second = &index.cross_file_links[1];

        // First link: "First [a](" = 10 chars, column 11
        assert_eq!(first.target_path, "./a.md");
        assert_eq!(first.column, 11);

        // Second link: "First [a](./a.md) and [b](" = 26 chars, column 27
        assert_eq!(second.target_path, "./b.md");
        assert_eq!(second.fragment, "section");
        assert_eq!(second.column, 27);
    }

    /// Gitignored files/directories and `node_modules` are left out of the scan.
    #[test]
    fn test_collect_markdown_files_respects_gitignore() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();

        // A tracked markdown file and a build-output one that .gitignore excludes.
        fs::write(root.join("README.md"), "# Readme\n").unwrap();
        fs::write(root.join(".gitignore"), "build/\nignored.md\n").unwrap();
        fs::write(root.join("ignored.md"), "# Ignored\n").unwrap();
        let build_dir = root.join("build");
        fs::create_dir(&build_dir).unwrap();
        fs::write(build_dir.join("generated.md"), "# Generated\n").unwrap();

        // Dependency/output dirs are skipped even when not gitignored.
        let deps_dir = root.join("node_modules");
        fs::create_dir(&deps_dir).unwrap();
        fs::write(deps_dir.join("dep.md"), "# Dep\n").unwrap();

        let found = collect_markdown_files(&[root.to_path_buf()]);

        assert_eq!(sorted_file_names(&found), ["README.md"]);
    }

    /// Markdown files in subdirectories are found; other file types are not.
    #[test]
    fn test_collect_markdown_files_finds_nested_markdown() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path();

        fs::write(root.join("top.md"), "# Top\n").unwrap();
        let docs = root.join("docs");
        fs::create_dir(&docs).unwrap();
        fs::write(docs.join("guide.markdown"), "# Guide\n").unwrap();
        fs::write(docs.join("notes.txt"), "not markdown\n").unwrap();

        let found = collect_markdown_files(&[root.to_path_buf()]);

        assert_eq!(sorted_file_names(&found), ["guide.markdown", "top.md"]);
    }

    /// The single-path check agrees with the full scan's ignore rules.
    #[test]
    fn test_path_is_ignored_for_index() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path().to_path_buf();
        fs::write(root.join(".gitignore"), "build/\ndraft.md\n").unwrap();

        // The check walks the file's directory, so the files must exist (as they
        // do for create/change watch events).
        for name in ["README.md", "draft.md", ".hidden.md"] {
            fs::write(root.join(name), "").unwrap();
        }
        fs::create_dir(root.join("docs")).unwrap();
        fs::write(root.join("docs").join("guide.md"), "").unwrap();
        fs::create_dir(root.join("build")).unwrap();
        fs::write(root.join("build").join("out.md"), "").unwrap();

        let roots = vec![root.clone()];
        let ignored = |rel: &str| path_is_ignored_for_index(&roots, &root.join(rel));

        // Tracked files are not ignored.
        assert!(!ignored("README.md"));
        assert!(!ignored("docs/guide.md"));

        // Gitignored file and file inside a gitignored directory.
        assert!(ignored("draft.md"));
        assert!(ignored("build/out.md"));

        // Hidden files are excluded, matching the full scan's hidden(true).
        assert!(ignored(".hidden.md"));

        // Dependency/output dirs are always skipped, even without a gitignore rule
        // and without the file existing.
        assert!(ignored("node_modules/dep.md"));
        assert!(ignored("target/doc.md"));

        // Paths outside every workspace root are not filtered.
        let outside = dir.path().parent().unwrap().join("elsewhere.md");
        assert!(!path_is_ignored_for_index(&roots, &outside));
    }

    /// A `.gitignore` in a subdirectory applies to files in that subdirectory.
    #[test]
    fn test_path_is_ignored_for_index_honors_nested_gitignore() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path().to_path_buf();
        let docs = root.join("docs");
        fs::create_dir(&docs).unwrap();
        fs::write(docs.join(".gitignore"), "generated.md\n").unwrap();
        fs::write(docs.join("generated.md"), "").unwrap();
        fs::write(docs.join("manual.md"), "").unwrap();

        let roots = vec![root.clone()];

        assert!(path_is_ignored_for_index(&roots, &root.join("docs/generated.md")));
        assert!(!path_is_ignored_for_index(&roots, &root.join("docs/manual.md")));
    }

    /// A workspace whose own path contains a `target` component must not have
    /// all of its files treated as ignored.
    #[test]
    fn test_path_is_ignored_for_index_workspace_under_target_dir() {
        let dir = tempfile::tempdir().unwrap();
        let root = dir.path().join("target").join("my-docs");
        fs::create_dir_all(&root).unwrap();
        fs::write(root.join("README.md"), "").unwrap();
        fs::create_dir(root.join("target")).unwrap();
        fs::write(root.join("target").join("out.md"), "").unwrap();

        let roots = vec![root.clone()];

        // Files directly under the workspace are indexed despite the `target`
        // ancestor in the absolute path.
        assert!(!path_is_ignored_for_index(&roots, &root.join("README.md")));
        // A `target` directory *inside* the workspace is still excluded.
        assert!(path_is_ignored_for_index(&roots, &root.join("target/out.md")));
    }
}
650}