1use std::{collections::HashMap, env::temp_dir, io::BufReader, path::Path};
2
3use anyhow::{anyhow, Context, Result};
4use crates_io_api::AsyncClient;
5use guppy::{graph::PackageGraph, MetadataCommand};
6use rayon::iter::{IntoParallelRefIterator as _, ParallelIterator as _};
7use ruggle_engine::{
8 build_parent_index,
9 query::parse::parse_query,
10 search::{Hit, Scope},
11 types::{self, Crate, CrateMetadata},
12 Index, Parent,
13};
14use ruggle_util::shake;
15
16use serde::Deserialize as _;
17use std::io::Read;
18use tokio::{fs::OpenOptions, process::Command};
19use tokio::{
20 fs::{self},
21 io::copy,
22};
23use tracing::{debug, error, info, warn};
24
25pub fn perform_search(
26 index: &Index,
27 scopes: &Scopes,
28 query_str: &str,
29 scope_str: &str,
30 limit: Option<usize>,
31 threshold: Option<f32>,
32) -> anyhow::Result<Vec<Hit>> {
33 tracing::info!(
34 "performing search for query `{}` in scope `{}`",
35 query_str,
36 scope_str
37 );
38
39 tracing::debug!("available scopes: {:?}", scopes.sets.keys());
40 tracing::debug!("available crates: {:?}", scopes.krates.keys());
41 let scope = match scope_str.split(':').collect::<Vec<_>>().as_slice() {
42 ["set", set] => scopes
43 .sets
44 .get(*set)
45 .context(format!("set `{}` not found", set))?,
46 ["crate", krate, version] => scopes
47 .krates
48 .get(&CrateMetadata {
49 name: krate.to_string(),
50 version: version.to_string(),
51 })
52 .context(format!("krate `{}:{}` not found", krate, version))?,
53 ["crate", krate] => scopes
54 .krates
55 .get(&CrateMetadata::new(krate.to_string()))
56 .context(format!("krate `{}` not found", krate))?,
57
58 _ => Err(anyhow!("parsing scope `{}` failed", scope_str))?,
59 };
60 debug!(?scope);
61
62 let query = parse_query(query_str)
63 .ok()
64 .context(format!("parsing query `{}` failed", query_str))?
65 .1;
66 debug!(?query);
67
68 let limit = limit.unwrap_or(30);
69 let threshold = threshold.unwrap_or(0.4);
70
71 let hits = index
72 .search(&query, scope.clone(), threshold)
73 .with_context(|| format!("search with query `{:?}` failed", query))?;
74 let hits = hits
75 .into_iter()
76 .inspect(|hit| debug!(?hit.name, link = ?hit.link, similarities = ?hit.similarities(), score = ?hit.similarities().score()))
77 .take(limit)
78 .collect::<Vec<_>>();
79
80 Ok(hits)
81}
82
83pub async fn make_index(index_dir: &Path) -> Result<Index> {
84 let crate_dir = index_dir.join("crate");
85 info!("building index from {}", crate_dir.display());
86
87 let mut entries = vec![];
89 let mut dir = fs::read_dir(&crate_dir)
90 .await
91 .context("failed to read index files")?;
92 while let Some(entry) = dir
93 .next_entry()
94 .await
95 .context("failed to read index files")?
96 {
97 let path = entry.path();
98 if path.extension().and_then(|e| e.to_str()) == Some("json") {
100 let bin_path = path.with_extension("bin");
101 if bin_path.exists() {
102 continue;
103 }
104 }
105 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
107 if ext == "json" || ext == "bin" {
108 entries.push(path);
109 }
110 }
111 }
112
113 info!("found {} crate files", entries.len());
114
115 let t_start = std::time::Instant::now();
116
117 let crates: HashMap<CrateMetadata, _> = entries
119 .par_iter()
120 .filter_map(|path| {
121 if path
123 .file_name()
124 .and_then(|f| f.to_str())
125 .map(|f| f.ends_with(".parents.bin"))
126 .unwrap_or(false)
127 {
128 return None;
129 }
130 let file = std::fs::File::open(path).ok()?;
131 let mut reader = BufReader::new(file);
132
133 let ext = path.extension().and_then(|e| e.to_str());
134
135 let t0 = std::time::Instant::now();
136 let krate: Result<Crate> = match ext {
137 Some("bin") => {
138 bincode::decode_from_reader(&mut reader, bincode::config::standard())
139 .with_context(|| format!("Failed to bincode::decode {}", path.display()))
140 }
141 _ => serde_json::from_reader(&mut reader)
142 .map_err(|e| {
143 eprintln!(
144 "error while serde_json::from_reader({}) => {e:?}",
145 path.display()
146 );
147 e
148 })
149 .with_context(|| {
150 format!("Failed to serde_json::from_reader {}", path.display())
151 }),
152 };
153 if let Err(ref e) = krate {
154 warn!("deserializing {:?} failed: {}", path.display(), e);
155 return None;
156 }
157 let mut krate = krate.unwrap();
158 let krate_name: String = path.file_stem()?.to_str()?.to_owned();
159 krate.name = Some(krate_name.clone());
160
161 debug!("deserialized {:?} in {:?}", path.display(), t0.elapsed());
162 let krate_metadata = CrateMetadata {
163 name: krate_name,
164 version: krate.crate_version.clone(),
165 };
166 Some((krate_metadata, krate))
168 })
169 .collect();
170
171 let parents: HashMap<CrateMetadata, HashMap<types::Id, Parent>> = crates
172 .par_iter()
173 .map(|(krate_name, krate)| {
174 let parents_path = crate_dir.join(format!("{}.parents.bin", krate_name));
176 if parents_path.exists() {
177 let file = std::fs::File::open(&parents_path)
178 .expect("parents index file existence was already checked");
179 let mut reader = BufReader::new(file);
180 let parent_map: HashMap<types::Id, Parent> =
181 bincode::decode_from_reader(&mut reader, bincode::config::standard())
182 .expect("decoding parents index from bin failed");
183 return (krate_name.clone(), parent_map);
184 }
185 let parent_map = build_parent_index(krate);
187 let mut file =
189 std::fs::File::create(&parents_path).expect("creating parents index file failed");
190 bincode::encode_into_std_write(&parent_map, &mut file, bincode::config::standard())
191 .expect("encoding parents index to bin failed");
192 tracing::debug!("serialized parents index to {:?}", parents_path);
193 (krate_name.clone(), parent_map)
194 })
195 .collect();
196
197 let total_time = t_start.elapsed();
198 info!(
199 "loaded {} crates in {:.2?} (avg {:.1?} each)",
200 crates.len(),
201 total_time,
202 total_time / (crates.len().max(1) as u32)
203 );
204
205 Ok(Index { crates, parents })
206}
207
/// Returns the summed size in bytes of the entries directly inside `path`.
///
/// The scan is not recursive: each entry contributes whatever `std::fs::metadata`
/// reports as its length. Entries that cannot be read count as 0, and a directory
/// that cannot be opened at all (for example because it does not exist) yields 0
/// instead of panicking.
fn dir_size(path: &std::path::Path) -> u64 {
    let Ok(entries) = std::fs::read_dir(path) else {
        return 0;
    };
    entries
        .filter_map(Result::ok)
        .map(|entry| std::fs::metadata(entry.path()).map_or(0, |meta| meta.len()))
        .sum()
}
215
216pub fn shake_index(index_dir: &Path) -> Result<()> {
217 let before = dir_size(&index_dir.join("crate"));
219 let result = std::fs::read_dir(format!("{}/crate", index_dir.display()))
220 .context("failed to read index files")?
221 .map(|entry| {
222 let entry = entry?;
223 let json = std::fs::read_to_string(entry.path())
224 .with_context(|| format!("failed to read `{:?}`", entry.file_name()))?;
225 let mut deserializer = serde_json::Deserializer::from_str(&json);
226 deserializer.disable_recursion_limit();
227 let krate = rustdoc_types::Crate::deserialize(&mut deserializer)
228 .with_context(|| format!("failed to deserialize `{:?}`", entry.file_name()))?;
229 let file_name = entry
230 .path()
231 .with_extension("")
232 .file_name()
233 .with_context(|| format!("failed to get file name from `{:?}`", entry.path()))?
234 .to_str()
235 .context("failed to get `&str` from `&OsStr`")?
236 .to_owned();
237 let krate = shake(krate);
238
239 let json = serde_json::to_string(&krate)
240 .with_context(|| format!("failed to serialize crate `{}`", &file_name))?;
241 std::fs::write(
242 format!("{}/crate/{}.json", index_dir.display(), file_name),
243 json,
244 )
245 .with_context(|| format!("failed to write crate `{}`", &file_name))?;
246
247 Ok(())
248 })
249 .collect::<Result<Vec<()>>>();
250 let after = dir_size(&index_dir.join("crate"));
252 tracing::info!(
253 "index shaken: {:.2} MB → {:.2} MB (−{:.2} MB, {:.1}% smaller)",
254 before as f64 / 1_048_576.0,
255 after as f64 / 1_048_576.0,
256 (before - after) as f64 / 1_048_576.0,
257 (before - after) as f64 / before as f64 * 100.0
258 );
259
260 result.map(|_| ())
261}
262
263pub fn generate_bin_index(index_dir: &Path) -> Result<()> {
264 let _result = std::fs::read_dir(format!("{}/crate", index_dir.display()))
265 .context("failed to read index files")?
266 .map(|entry| {
267 let entry = entry?;
268 if entry.path().extension().and_then(|e| e.to_str()) == Some("bin") {
269 tracing::debug!(
271 "skipping already generated bin file {:?}",
272 entry.file_name()
273 );
274 return Ok(());
275 }
276 let json = std::fs::read_to_string(entry.path())
277 .with_context(|| format!("failed to read `{:?}`", entry.file_name()))?;
278 let mut deserializer = serde_json::Deserializer::from_str(&json);
279 deserializer.disable_recursion_limit();
280 tracing::debug!("generating bin for {:?}", entry.file_name());
281
282 let krate = Crate::deserialize(&mut deserializer);
283
284 let Ok(krate) = krate else {
285 warn!(
286 "deserializing {:?} failed: {}",
287 entry.file_name(),
288 krate.unwrap_err()
289 );
290 return Ok(());
291 };
292
293 let file_name = entry
294 .path()
295 .with_extension("")
296 .file_name()
297 .with_context(|| format!("failed to get file name from `{:?}`", entry.path()))?
298 .to_str()
299 .context("failed to get `&str` from `&OsStr`")?
300 .to_owned();
301
302 let mut file = std::fs::File::create(format!(
303 "{}/crate/{}.bin",
304 index_dir.display(),
305 file_name
306 ))
307 .with_context(|| format!("failed to create bin file for crate `{}`", &file_name))?;
308 bincode::encode_into_std_write(&krate, &mut file, bincode::config::standard())
309 .with_context(|| format!("failed to serialize crate `{}` to bin", &file_name))?;
310
311 Ok(())
312 })
313 .collect::<Result<Vec<()>>>();
314
315 Ok(())
316}
317
318pub struct Scopes {
319 pub sets: HashMap<String, Scope>,
320 pub krates: HashMap<CrateMetadata, Scope>,
321}
322
323pub fn make_sets(index_dir: &Path) -> HashMap<String, Scope> {
324 match std::fs::read_dir(format!("{}/set", index_dir.display())) {
325 Err(e) => {
326 warn!("registering sets skipped: {}", e);
327 HashMap::default()
328 }
329 Ok(entry) => {
330 entry
331 .map(|entry| {
332 let entry = entry?;
333 let path = entry.path();
334 let json = std::fs::read_to_string(&path)
335 .context(format!("failed to read `{:?}`", path))?;
336 let set = path.file_stem().unwrap().to_str().unwrap().to_owned(); let krates = serde_json::from_str::<Vec<CrateMetadata>>(&json)
338 .context(format!("failed to deserialize set `{}`", &set))?;
339
340 Ok((set.clone(), Scope::Set(set, krates)))
341 })
342 .filter_map(|res: Result<_, anyhow::Error>| {
343 if let Err(ref e) = res {
344 warn!("registering a scope skipped: {}", e)
345 }
346 res.ok()
347 })
348 .collect()
349 }
350 }
351}
352
353pub async fn pull_crate_from_docs_rs(metadata: &types::CrateMetadata) -> Result<types::Crate> {
354 info!("checking docs.rs for crate: {}", &metadata.name);
355 let url = format!(
356 "https://docs.rs/crate/{}/{}/json",
357 metadata.name, metadata.version
358 );
359 debug!("docs.rs url for {}: {}", metadata.name, url);
360
361 let client = reqwest::Client::new();
362 let response = client.get(&url).send().await?;
363 debug!("response status: {}", response.status());
364 if response.status().is_success() {
365 debug!("docs.rs url for {}: {}", metadata.name, url);
366 debug!("response: {:?}", response);
367 let zst_encoded_krate = response.bytes().await?;
368 let mut decoder = ruzstd::decoding::StreamingDecoder::new(&zst_encoded_krate[..]).unwrap();
369 let mut json_encoded_krate = Vec::new();
370 decoder
371 .read_to_end(&mut json_encoded_krate)
372 .with_context(|| format!("Failed to create zstd decoder for {}", url))?;
373
374 let mut krate: types::Crate = serde_json::from_slice(&json_encoded_krate)
375 .with_context(|| format!("Failed to serde_json::from_slice {}", url))?;
376 krate.name = Some(metadata.name.clone());
377 info!("fetched crate {} from docs.rs", metadata);
378 return Ok(krate);
379 }
380
381 Err(anyhow::anyhow!("crate {} not found on docs.rs", metadata))
382}
383
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test against the live docs.rs service; it needs network access.
    #[tokio::test]
    async fn test_pull_crate_from_docs_rs() {
        // `try_init` rather than `init`: a global subscriber may already have been
        // installed elsewhere in the same test binary, and `init` panics in that case.
        let _ = tracing_subscriber::fmt::fmt()
            .with_max_level(tracing::Level::DEBUG)
            .with_test_writer()
            .try_init();
        let krate = types::CrateMetadata {
            name: "serde".into(),
            version: "latest".into(),
        };
        let result = pull_crate_from_docs_rs(&krate).await;
        assert!(
            result.is_ok(),
            "pulling serde from docs.rs failed: {:?}",
            result.err()
        );
    }
}
402
403pub async fn pull_crate_from_remote_index(
404 krate_metadata: &types::CrateMetadata,
405) -> Result<types::Crate> {
406 info!("checking remote index for crate: {}", &krate_metadata.name);
407 let bin_url = format!(
408 "https://raw.githubusercontent.com/alpaylan/ruggle-index/main/crate/{}.bin",
409 krate_metadata.name
410 );
411 let json_url = format!(
412 "https://raw.githubusercontent.com/alpaylan/ruggle-index/main/crate/{}.json",
413 krate_metadata.name,
415 );
417
418 let client = reqwest::Client::new();
419
420 debug!(".bin url for {}: {}", krate_metadata, bin_url);
422 let response = client.get(&bin_url).send().await?;
423 if response.status().is_success() {
424 let bytes = response.bytes().await?;
425 if let Ok((krate, _)) =
426 bincode::decode_from_slice::<types::Crate, _>(&bytes, bincode::config::standard())
427 {
428 info!("fetched crate {} from remote index (.bin)", krate_metadata);
429 return Ok(krate);
430 }
431 }
432 tracing::debug!(
433 "crate {} not found in remote index (.bin), trying .json",
434 krate_metadata
435 );
436
437 debug!(".json url for {}: {}", krate_metadata, json_url);
439 let response = client.get(&json_url).send().await?;
440 if response.status().is_success() {
441 println!("response: {:?}", response);
442 let text = response.text().await?;
444 let mut krate: types::Crate = serde_json::from_str(&text)
445 .with_context(|| format!("Failed to serde_json::from_str {}", json_url))?;
446 krate.name = Some(krate_metadata.name.clone());
447 info!(
448 "fetched crate {} from remote index (.json)",
449 krate_metadata.name
450 );
451 return Ok(krate);
452 }
453
454 Err(anyhow::anyhow!(
455 "crate {} not found in remote index",
456 krate_metadata
457 ))
458}
459
460pub async fn pull_set_from_remote_index(set_name: &str) -> Result<Vec<CrateMetadata>> {
461 info!("fetching set {} from remote index", set_name);
462 let json_url = format!(
463 "https://raw.githubusercontent.com/alpaylan/ruggle-index/main/set/{}.json",
464 set_name
465 );
466
467 let client = reqwest::Client::new();
468 let response = client.get(&json_url).send().await?;
469 if response.status().is_success() {
470 let text = response.text().await?;
471 let krates: Vec<CrateMetadata> = serde_json::from_str(&text)
472 .with_context(|| format!("Failed to serde_json::from_str {}", json_url))?;
473 info!("fetched set {} from remote index", set_name);
474 return Ok(krates);
475 }
476
477 Err(anyhow::anyhow!(
478 "set {} not found in remote index",
479 set_name
480 ))
481}
482
483async fn index_krate(krate: &crates_io_api::Crate) -> Result<types::Crate> {
484 let temp = temp_dir();
485 let path = temp.join(format!("{}.tar.gz", krate.name));
486 let url = format!(
487 "https://static.crates.io/crates/{name}/{name}-{version}.crate",
488 name = krate.name,
489 version = krate.max_version,
490 );
491
492 let resp = reqwest::get(url).await?;
493 let mut file = OpenOptions::new()
494 .write(true)
495 .create(true)
496 .open(path)
497 .await
498 .context("Could not create the temp tar.gz file")?;
499
500 copy(&mut resp.bytes().await?.as_ref(), &mut file)
501 .await
502 .context("tokio::io::copy failed")?;
503
504 Command::new("tar")
505 .args(["-xf", &format!("{}.tar.gz", krate.name)])
506 .current_dir(&temp)
507 .status()
508 .await
509 .context("Failed to extract tar.gz file")?;
510
511 let unpacked = temp.join(format!("{}-{}", krate.name, krate.max_version));
512 let cargo = Command::new("cargo")
513 .args(["+nightly", "rustdoc"])
514 .env("RUSTDOCFLAGS", "--output-format=json -Z unstable-options")
515 .current_dir(&unpacked)
516 .status()
517 .await
518 .context("Failed to run cargo rustdoc")?;
519 if !cargo.success() {
520 return Err(anyhow::anyhow!(
521 "cargo rustdoc failed for crate {}",
522 krate.name
523 ));
524 }
525 let doc_dir = unpacked.join("target/doc");
527 if !doc_dir.exists() {
528 return Err(anyhow::anyhow!(
529 "doc directory does not exist for crate {}",
530 krate.name
531 ));
532 }
533 let mut doc_dir_reader = fs::read_dir(&doc_dir).await?;
534 let krate_file_path = loop {
535 if let Some(entry) = doc_dir_reader
536 .next_entry()
537 .await
538 .context("Failed to read doc directory")?
539 {
540 let file_name = entry.file_name();
541 let file_name_str = file_name.to_string_lossy();
542 if file_name_str.ends_with(".json") {
543 break entry.path();
544 }
545 } else {
546 return Err(anyhow::anyhow!(
547 "No JSON file found in doc directory for crate {}",
548 krate.name
549 ));
550 }
551 };
552 let mut krate_: types::Crate = serde_json::from_slice(
553 &fs::read(&krate_file_path)
554 .await
555 .context("Failed to read crate JSON file")?,
556 )
557 .with_context(|| format!("Failed to serde_json::from_slice for crate {}", krate.name))?;
558
559 krate_.name = Some(krate.name.clone());
560
561 info!("built crate {} locally", krate.name);
562
563 Ok(krate_)
564}
565
566pub async fn build_crate_locally(metadata: &types::CrateMetadata) -> Result<types::Crate> {
567 let client = AsyncClient::new(
568 "ruggle (akeles@umd.edu)",
569 std::time::Duration::from_millis(1000),
570 )?;
571
572 let krate = client
573 .get_crate(&metadata.name)
574 .await
575 .context(format!("failed to get crate info: {}", &metadata.name))?
576 .crate_data;
577
578 index_krate(&krate).await
579}
580
581pub async fn index_local_crate(
582 index: &mut Index,
583 cargo_manifest_path: &Path,
584) -> Result<Vec<types::Crate>> {
585 let krates_metadata = gather_all_dependencies(cargo_manifest_path)
586 .context("failed to gather all transitive dependencies")?;
587
588 tracing::info!(
589 "gathered {} dependencies from Cargo.toml",
590 krates_metadata.len()
591 );
592 tracing::debug!("dependencies: {:?}", krates_metadata);
593
594 let mut krates: Vec<types::Crate> = Vec::new();
595 for krate_metadata in &krates_metadata {
596 if let Some(krate) = index.crates.get(krate_metadata).cloned() {
597 info!("crate is already indexed: {}", &krate_metadata);
598 krates.push(krate);
599 } else if let Ok(krate) = pull_crate_from_remote_index(krate_metadata).await {
600 krates.push(krate);
601 } else if let Ok(krate) = build_crate_locally(krate_metadata).await {
605 krates.push(krate);
606 } else {
607 error!("failed to index crate: {}", &krate_metadata);
608 }
609 }
610
611 Ok(krates)
612}
613
#[cfg(test)]
mod dependency_tests {
    use super::*;
    use std::path::PathBuf;

    /// Runs against the sibling `ruggle-engine` manifest and expects `ruggle-util`
    /// among the gathered dependencies.
    #[test]
    fn test_gather_all_dependencies() {
        let mut manifest_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        manifest_path.push("..");
        manifest_path.push("ruggle-engine");
        manifest_path.push("Cargo.toml");

        let deps = gather_all_dependencies(&manifest_path).unwrap();
        println!("dependencies: {:#?}", deps);
        assert!(deps.iter().any(|d| d.name == "ruggle-util"));
    }
}
630
631pub fn gather_all_dependencies(cargo_manifest_path: &Path) -> anyhow::Result<Vec<CrateMetadata>> {
632 let metadata = MetadataCommand::new()
633 .manifest_path(cargo_manifest_path)
634 .exec()?;
635
636 let graph = PackageGraph::from_metadata(metadata)?;
637 let mut packages = Vec::new();
638
639 for member in graph.workspace().iter() {
640 for link in member.direct_links() {
641 let pkg = link.to();
642 packages.push(CrateMetadata {
643 name: pkg.name().to_string(),
644 version: pkg.version().to_string(),
645 });
646 }
647 }
648 Ok(packages)
649}
650
651pub fn gather_all_transitive_dependencies(
652 cargo_manifest_path: &Path,
653) -> anyhow::Result<Vec<CrateMetadata>> {
654 let metadata = MetadataCommand::new()
655 .manifest_path(cargo_manifest_path)
656 .exec()?;
657 let graph = PackageGraph::from_metadata(metadata)?;
658 let packages = graph
659 .packages()
660 .map(|pkg| CrateMetadata {
661 name: pkg.name().to_string(),
662 version: pkg.version().to_string(),
663 })
664 .collect();
665 Ok(packages)
666}