1pub mod chunk;
2pub mod embed;
3pub mod extract;
4pub mod search;
5pub mod store;
6pub mod watch;
7
8use std::path::{Path, PathBuf};
9use std::sync::atomic::{AtomicUsize, Ordering};
10
11use anyhow::{Context, Result};
12use rayon::prelude::*;
13use sha2::{Digest, Sha256};
14
15use chunk::{Chunk, ChunkStrategy};
16use extract::ExtractedDoc;
17use store::DocStore;
18
19pub mod links;
20
/// Handle to the document index of a single project tree.
///
/// Created with `open`, which only prepares paths; the backing store is
/// opened separately by `init`.
pub struct DocIndex {
    /// Directory whose documents are indexed.
    root: PathBuf,
    /// Location of the database: `<root>/.infigraph/docs.kuzu`.
    db_path: PathBuf,
    /// Open store; `None` until `init` succeeds, and reset to `None` by `clean`.
    store: Option<DocStore>,
}
26
/// Summary of one [`DocIndex::index`] run.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct DocIndexResult {
    /// Number of document files discovered under the root.
    pub total_files: usize,
    /// Number of files that were extracted and written during this run
    /// (files whose stored hash already matched are not counted).
    pub indexed_files: usize,
    /// Total number of chunks produced from the files indexed in this run.
    pub total_chunks: usize,
}
32
33impl DocIndex {
34 pub fn open(root: &Path) -> Result<Self> {
35 let tg_dir = root.join(".infigraph");
36 std::fs::create_dir_all(&tg_dir)?;
37 let db_path = tg_dir.join("docs.kuzu");
38 Ok(Self {
39 root: root.to_path_buf(),
40 db_path,
41 store: None,
42 })
43 }
44
45 pub fn init(&mut self) -> Result<()> {
46 let store = DocStore::open(&self.db_path)?;
47 self.store = Some(store);
48 Ok(())
49 }
50
51 pub fn store(&self) -> Option<&DocStore> {
52 self.store.as_ref()
53 }
54
55 pub fn root(&self) -> &Path {
56 &self.root
57 }
58
59 pub fn clean(&mut self) -> Result<()> {
60 self.store = None;
61 let tg_dir = self.root.join(".infigraph");
62 if self.db_path.is_dir() {
63 let _ = std::fs::remove_dir_all(&self.db_path);
64 } else {
65 let _ = std::fs::remove_file(&self.db_path);
66 }
67 let _ = std::fs::remove_file(self.db_path.with_extension("wal"));
68 let _ = std::fs::remove_file(self.db_path.with_extension("lock"));
69 let _ = std::fs::remove_file(tg_dir.join("docs_embeddings.bin"));
70 let _ = std::fs::remove_file(tg_dir.join("docs_hnsw_index.usearch"));
71 let _ = std::fs::remove_file(tg_dir.join("docs_hnsw_index.meta"));
72 infigraph_core::embed::invalidate_embeddings_cache();
73 infigraph_core::embed::invalidate_hnsw_cache();
74 Ok(())
75 }
76
77 pub fn reindex(&mut self) -> Result<DocIndexResult> {
78 self.clean()?;
79 self.init()?;
80 self.index()
81 }
82
83 pub fn index(&self) -> Result<DocIndexResult> {
84 let store = self.store.as_ref().context("call init() first")?;
85
86 let files = self.collect_doc_files()?;
87 let total = files.len();
88
89 if total == 0 {
90 return Ok(DocIndexResult {
91 total_files: 0,
92 indexed_files: 0,
93 total_chunks: 0,
94 });
95 }
96
97 let existing_hashes = store.get_doc_hashes().unwrap_or_default();
98
99 let done = AtomicUsize::new(0);
100
101 let results: Vec<(ExtractedDoc, Vec<Chunk>)> = files
102 .par_iter()
103 .filter_map(|path| {
104 let rel = path
105 .strip_prefix(&self.root)
106 .ok()?
107 .to_string_lossy()
108 .replace('\\', "/");
109 let bytes = std::fs::read(path).ok()?;
110 let hash = {
111 let mut h = Sha256::new();
112 h.update(&bytes);
113 format!("{:x}", h.finalize())
114 };
115
116 let n = done.fetch_add(1, Ordering::Relaxed) + 1;
117 let pct = n * 100 / total;
118 let prev_pct = (n - 1) * 100 / total;
119 if (pct / 25) > (prev_pct / 25) || n == total {
120 eprintln!("Doc indexing: {}/{} ({}%)", n, total, pct);
121 }
122
123 if existing_hashes.get(&rel).map(|s| s.as_str()) == Some(hash.as_str()) {
124 return None;
125 }
126
127 let ext = path.extension()?.to_string_lossy().to_lowercase();
128 let doc = extract::extract_document(path, &bytes, &ext).ok()?;
129 let strategy = ChunkStrategy::for_extension(&ext);
130 let chunks = chunk::chunk_document(&doc, &rel, &hash, strategy);
131 Some((
132 ExtractedDoc {
133 file: rel,
134 content_hash: hash,
135 ..doc
136 },
137 chunks,
138 ))
139 })
140 .collect();
141
142 let indexed = results.len();
143 let total_chunks: usize = results.iter().map(|(_, c)| c.len()).sum();
144
145 if !results.is_empty() {
146 let docs: Vec<&ExtractedDoc> = results.iter().map(|(d, _)| d).collect();
147 let chunks: Vec<&Chunk> = results.iter().flat_map(|(_, c)| c.iter()).collect();
148 store.upsert_all_parquet(&docs, &chunks)?;
149 }
150
151 if total_chunks > 0 {
152 let all_chunks: Vec<&Chunk> = results.iter().flat_map(|(_, c)| c.iter()).collect();
153 let changed_files: Vec<&str> = results.iter().map(|(d, _)| d.file.as_str()).collect();
154 embed::update_doc_embeddings(store, &self.root, &all_chunks, &changed_files)?;
155 }
156
157 if !results.is_empty() {
159 let all_doc_ids: std::collections::HashSet<String> = {
160 let existing = store.get_doc_hashes().unwrap_or_default();
161 existing.keys().cloned().collect()
162 };
163 for (doc, _) in &results {
164 links::extract_and_link_doc(store, doc, &all_doc_ids);
165 }
166 }
167
168 Ok(DocIndexResult {
169 total_files: total,
170 indexed_files: indexed,
171 total_chunks,
172 })
173 }
174
175 fn collect_doc_files(&self) -> Result<Vec<PathBuf>> {
176 let mut files = Vec::new();
177 self.walk_doc_dir(&self.root, &mut files)?;
178 Ok(files)
179 }
180
181 fn walk_doc_dir(&self, dir: &Path, files: &mut Vec<PathBuf>) -> Result<()> {
182 let ignore_dirs = [
183 ".infigraph",
184 ".git",
185 "node_modules",
186 "__pycache__",
187 ".venv",
188 "venv",
189 "target",
190 "build",
191 "dist",
192 ".tox",
193 ];
194
195 for entry in std::fs::read_dir(dir)? {
196 let entry = entry?;
197 let path = entry.path();
198 let name = entry.file_name();
199 let name_str = name.to_string_lossy();
200
201 if path.is_dir() {
202 if !ignore_dirs.contains(&name_str.as_ref()) && !name_str.starts_with('.') {
203 self.walk_doc_dir(&path, files)?;
204 }
205 } else if path.is_file() && is_document_file(&path) {
206 files.push(path);
207 }
208 }
209 Ok(())
210 }
211}
212
/// Tells whether `path` names a file type the document indexer handles.
///
/// The decision is made purely on the extension, compared case-insensitively
/// (the extension is lower-cased before the lookup). A path without an
/// extension is never a document.
pub fn is_document_file(path: &Path) -> bool {
    // Lower-case extensions recognised as documents.
    const DOC_EXTENSIONS: &[&str] = &[
        "md", "markdown", "txt", "rst", "adoc", "org", "pdf", "docx", "pptx", "xlsx", "rtf",
        "html", "htm", "epub", "xml", "xsl", "xsd", "svg", "plist",
    ];

    path.extension().map_or(false, |raw| {
        let lowered = raw.to_string_lossy().to_lowercase();
        DOC_EXTENSIONS.contains(&lowered.as_str())
    })
}