Skip to main content

ruggle_server/
lib.rs

1use std::{collections::HashMap, env::temp_dir, io::BufReader, path::Path};
2
3use anyhow::{anyhow, Context, Result};
4use crates_io_api::AsyncClient;
5use guppy::{graph::PackageGraph, MetadataCommand};
6use rayon::iter::{IntoParallelRefIterator as _, ParallelIterator as _};
7use ruggle_engine::{
8    build_parent_index,
9    query::parse::parse_query,
10    search::{Hit, Scope},
11    types::{self, Crate, CrateMetadata},
12    Index, Parent,
13};
14use ruggle_util::shake;
15
16use serde::Deserialize as _;
17use std::io::Read;
18use tokio::{fs::OpenOptions, process::Command};
19use tokio::{
20    fs::{self},
21    io::copy,
22};
23use tracing::{debug, error, info, warn};
24
25pub fn perform_search(
26    index: &Index,
27    scopes: &Scopes,
28    query_str: &str,
29    scope_str: &str,
30    limit: Option<usize>,
31    threshold: Option<f32>,
32) -> anyhow::Result<Vec<Hit>> {
33    tracing::info!(
34        "performing search for query `{}` in scope `{}`",
35        query_str,
36        scope_str
37    );
38
39    tracing::debug!("available scopes: {:?}", scopes.sets.keys());
40    tracing::debug!("available crates: {:?}", scopes.krates.keys());
41    let scope = match scope_str.split(':').collect::<Vec<_>>().as_slice() {
42        ["set", set] => scopes
43            .sets
44            .get(*set)
45            .context(format!("set `{}` not found", set))?,
46        ["crate", krate, version] => scopes
47            .krates
48            .get(&CrateMetadata {
49                name: krate.to_string(),
50                version: version.to_string(),
51            })
52            .context(format!("krate `{}:{}` not found", krate, version))?,
53        ["crate", krate] => scopes
54            .krates
55            .get(&CrateMetadata::new(krate.to_string()))
56            .context(format!("krate `{}` not found", krate))?,
57
58        _ => Err(anyhow!("parsing scope `{}` failed", scope_str))?,
59    };
60    debug!(?scope);
61
62    let query = parse_query(query_str)
63        .ok()
64        .context(format!("parsing query `{}` failed", query_str))?
65        .1;
66    debug!(?query);
67
68    let limit = limit.unwrap_or(30);
69    let threshold = threshold.unwrap_or(0.4);
70
71    let hits = index
72        .search(&query, scope.clone(), threshold)
73        .with_context(|| format!("search with query `{:?}` failed", query))?;
74    let hits = hits
75        .into_iter()
76        .inspect(|hit| debug!(?hit.name, link = ?hit.link, similarities = ?hit.similarities(), score = ?hit.similarities().score()))
77        .take(limit)
78        .collect::<Vec<_>>();
79
80    Ok(hits)
81}
82
83pub async fn make_index(index_dir: &Path) -> Result<Index> {
84    let crate_dir = index_dir.join("crate");
85    info!("building index from {}", crate_dir.display());
86
87    // Gather file list, preferring .zst over .json
88    let mut entries = vec![];
89    let mut dir = fs::read_dir(&crate_dir)
90        .await
91        .context("failed to read index files")?;
92    while let Some(entry) = dir
93        .next_entry()
94        .await
95        .context("failed to read index files")?
96    {
97        let path = entry.path();
98        // Skip all raw .json if a .bin version exists
99        if path.extension().and_then(|e| e.to_str()) == Some("json") {
100            let bin_path = path.with_extension("bin");
101            if bin_path.exists() {
102                continue;
103            }
104        }
105        // Only include .json or .bun files
106        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
107            if ext == "json" || ext == "bin" {
108                entries.push(path);
109            }
110        }
111    }
112
113    info!("found {} crate files", entries.len());
114
115    let t_start = std::time::Instant::now();
116
117    // Parallel deserialization of all crates
118    let crates: HashMap<CrateMetadata, _> = entries
119        .par_iter()
120        .filter_map(|path| {
121            // Skip `<krate_name>.parents.bin` files
122            if path
123                .file_name()
124                .and_then(|f| f.to_str())
125                .map(|f| f.ends_with(".parents.bin"))
126                .unwrap_or(false)
127            {
128                return None;
129            }
130            let file = std::fs::File::open(path).ok()?;
131            let mut reader = BufReader::new(file);
132
133            let ext = path.extension().and_then(|e| e.to_str());
134
135            let t0 = std::time::Instant::now();
136            let krate: Result<Crate> = match ext {
137                Some("bin") => {
138                    bincode::decode_from_reader(&mut reader, bincode::config::standard())
139                        .with_context(|| format!("Failed to bincode::decode {}", path.display()))
140                }
141                _ => serde_json::from_reader(&mut reader)
142                    .map_err(|e| {
143                        eprintln!(
144                            "error while serde_json::from_reader({}) => {e:?}",
145                            path.display()
146                        );
147                        e
148                    })
149                    .with_context(|| {
150                        format!("Failed to serde_json::from_reader {}", path.display())
151                    }),
152            };
153            if let Err(ref e) = krate {
154                warn!("deserializing {:?} failed: {}", path.display(), e);
155                return None;
156            }
157            let mut krate = krate.unwrap();
158            let krate_name: String = path.file_stem()?.to_str()?.to_owned();
159            krate.name = Some(krate_name.clone());
160
161            debug!("deserialized {:?} in {:?}", path.display(), t0.elapsed());
162            let krate_metadata = CrateMetadata {
163                name: krate_name,
164                version: krate.crate_version.clone(),
165            };
166            // Rust 1.90 does not support `Path::file_prefix`, use `file_stem` instead
167            Some((krate_metadata, krate))
168        })
169        .collect();
170
171    let parents: HashMap<CrateMetadata, HashMap<types::Id, Parent>> = crates
172        .par_iter()
173        .map(|(krate_name, krate)| {
174            // If `<krate_name>.parents.bin` exists, load it instead of building from scratch
175            let parents_path = crate_dir.join(format!("{}.parents.bin", krate_name));
176            if parents_path.exists() {
177                let file = std::fs::File::open(&parents_path)
178                    .expect("parents index file existence was already checked");
179                let mut reader = BufReader::new(file);
180                let parent_map: HashMap<types::Id, Parent> =
181                    bincode::decode_from_reader(&mut reader, bincode::config::standard())
182                        .expect("decoding parents index from bin failed");
183                return (krate_name.clone(), parent_map);
184            }
185            // Otherwise, build parents index from scratch
186            let parent_map = build_parent_index(krate);
187            // Serialize parents index to `<krate_name>.parents.bin` for future use
188            let mut file =
189                std::fs::File::create(&parents_path).expect("creating parents index file failed");
190            bincode::encode_into_std_write(&parent_map, &mut file, bincode::config::standard())
191                .expect("encoding parents index to bin failed");
192            tracing::debug!("serialized parents index to {:?}", parents_path);
193            (krate_name.clone(), parent_map)
194        })
195        .collect();
196
197    let total_time = t_start.elapsed();
198    info!(
199        "loaded {} crates in {:.2?} (avg {:.1?} each)",
200        crates.len(),
201        total_time,
202        total_time / (crates.len().max(1) as u32)
203    );
204
205    Ok(Index { crates, parents })
206}
207
/// Sums the metadata length of every direct entry of `path` (not recursive).
///
/// Returns 0 when the directory cannot be read; entries whose metadata cannot
/// be queried contribute 0. The value is only used for size reporting, so an
/// unreadable directory must not abort the caller.
fn dir_size(path: &std::path::Path) -> u64 {
    let Ok(entries) = std::fs::read_dir(path) else {
        return 0;
    };
    entries
        .filter_map(|entry| entry.ok())
        .map(|entry| {
            std::fs::metadata(entry.path())
                .map(|meta| meta.len())
                .unwrap_or(0)
        })
        .sum()
}
215
216pub fn shake_index(index_dir: &Path) -> Result<()> {
217    // Measure index size before shaking
218    let before = dir_size(&index_dir.join("crate"));
219    let result = std::fs::read_dir(format!("{}/crate", index_dir.display()))
220        .context("failed to read index files")?
221        .map(|entry| {
222            let entry = entry?;
223            let json = std::fs::read_to_string(entry.path())
224                .with_context(|| format!("failed to read `{:?}`", entry.file_name()))?;
225            let mut deserializer = serde_json::Deserializer::from_str(&json);
226            deserializer.disable_recursion_limit();
227            let krate = rustdoc_types::Crate::deserialize(&mut deserializer)
228                .with_context(|| format!("failed to deserialize `{:?}`", entry.file_name()))?;
229            let file_name = entry
230                .path()
231                .with_extension("")
232                .file_name()
233                .with_context(|| format!("failed to get file name from `{:?}`", entry.path()))?
234                .to_str()
235                .context("failed to get `&str` from `&OsStr`")?
236                .to_owned();
237            let krate = shake(krate);
238
239            let json = serde_json::to_string(&krate)
240                .with_context(|| format!("failed to serialize crate `{}`", &file_name))?;
241            std::fs::write(
242                format!("{}/crate/{}.json", index_dir.display(), file_name),
243                json,
244            )
245            .with_context(|| format!("failed to write crate `{}`", &file_name))?;
246
247            Ok(())
248        })
249        .collect::<Result<Vec<()>>>();
250    // Measure index size after shaking
251    let after = dir_size(&index_dir.join("crate"));
252    tracing::info!(
253        "index shaken: {:.2} MB → {:.2} MB (−{:.2} MB, {:.1}% smaller)",
254        before as f64 / 1_048_576.0,
255        after as f64 / 1_048_576.0,
256        (before - after) as f64 / 1_048_576.0,
257        (before - after) as f64 / before as f64 * 100.0
258    );
259
260    result.map(|_| ())
261}
262
263pub fn generate_bin_index(index_dir: &Path) -> Result<()> {
264    let _result = std::fs::read_dir(format!("{}/crate", index_dir.display()))
265        .context("failed to read index files")?
266        .map(|entry| {
267            let entry = entry?;
268            if entry.path().extension().and_then(|e| e.to_str()) == Some("bin") {
269                // Skip already generated bin files
270                tracing::debug!(
271                    "skipping already generated bin file {:?}",
272                    entry.file_name()
273                );
274                return Ok(());
275            }
276            let json = std::fs::read_to_string(entry.path())
277                .with_context(|| format!("failed to read `{:?}`", entry.file_name()))?;
278            let mut deserializer = serde_json::Deserializer::from_str(&json);
279            deserializer.disable_recursion_limit();
280            tracing::debug!("generating bin for {:?}", entry.file_name());
281
282            let krate = Crate::deserialize(&mut deserializer);
283
284            let Ok(krate) = krate else {
285                warn!(
286                    "deserializing {:?} failed: {}",
287                    entry.file_name(),
288                    krate.unwrap_err()
289                );
290                return Ok(());
291            };
292
293            let file_name = entry
294                .path()
295                .with_extension("")
296                .file_name()
297                .with_context(|| format!("failed to get file name from `{:?}`", entry.path()))?
298                .to_str()
299                .context("failed to get `&str` from `&OsStr`")?
300                .to_owned();
301
302            let mut file = std::fs::File::create(format!(
303                "{}/crate/{}.bin",
304                index_dir.display(),
305                file_name
306            ))
307            .with_context(|| format!("failed to create bin file for crate `{}`", &file_name))?;
308            bincode::encode_into_std_write(&krate, &mut file, bincode::config::standard())
309                .with_context(|| format!("failed to serialize crate `{}` to bin", &file_name))?;
310
311            Ok(())
312        })
313        .collect::<Result<Vec<()>>>();
314
315    Ok(())
316}
317
318pub struct Scopes {
319    pub sets: HashMap<String, Scope>,
320    pub krates: HashMap<CrateMetadata, Scope>,
321}
322
323pub fn make_sets(index_dir: &Path) -> HashMap<String, Scope> {
324    match std::fs::read_dir(format!("{}/set", index_dir.display())) {
325        Err(e) => {
326            warn!("registering sets skipped: {}", e);
327            HashMap::default()
328        }
329        Ok(entry) => {
330            entry
331                .map(|entry| {
332                    let entry = entry?;
333                    let path = entry.path();
334                    let json = std::fs::read_to_string(&path)
335                        .context(format!("failed to read `{:?}`", path))?;
336                    let set = path.file_stem().unwrap().to_str().unwrap().to_owned(); // SAFETY: files in `ruggle-index` has a name.
337                    let krates = serde_json::from_str::<Vec<CrateMetadata>>(&json)
338                        .context(format!("failed to deserialize set `{}`", &set))?;
339
340                    Ok((set.clone(), Scope::Set(set, krates)))
341                })
342                .filter_map(|res: Result<_, anyhow::Error>| {
343                    if let Err(ref e) = res {
344                        warn!("registering a scope skipped: {}", e)
345                    }
346                    res.ok()
347                })
348                .collect()
349        }
350    }
351}
352
353pub async fn pull_crate_from_docs_rs(metadata: &types::CrateMetadata) -> Result<types::Crate> {
354    info!("checking docs.rs for crate: {}", &metadata.name);
355    let url = format!(
356        "https://docs.rs/crate/{}/{}/json",
357        metadata.name, metadata.version
358    );
359    debug!("docs.rs url for {}: {}", metadata.name, url);
360
361    let client = reqwest::Client::new();
362    let response = client.get(&url).send().await?;
363    debug!("response status: {}", response.status());
364    if response.status().is_success() {
365        debug!("docs.rs url for {}: {}", metadata.name, url);
366        debug!("response: {:?}", response);
367        let zst_encoded_krate = response.bytes().await?;
368        let mut decoder = ruzstd::decoding::StreamingDecoder::new(&zst_encoded_krate[..]).unwrap();
369        let mut json_encoded_krate = Vec::new();
370        decoder
371            .read_to_end(&mut json_encoded_krate)
372            .with_context(|| format!("Failed to create zstd decoder for {}", url))?;
373
374        let mut krate: types::Crate = serde_json::from_slice(&json_encoded_krate)
375            .with_context(|| format!("Failed to serde_json::from_slice {}", url))?;
376        krate.name = Some(metadata.name.clone());
377        info!("fetched crate {} from docs.rs", metadata);
378        return Ok(krate);
379    }
380
381    Err(anyhow::anyhow!("crate {} not found on docs.rs", metadata))
382}
383
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test for `pull_crate_from_docs_rs`; it performs a real request to docs.rs.
    #[tokio::test]
    async fn test_pull_crate_from_docs_rs() {
        // Route DEBUG-level tracing output through the test writer.
        let subscriber = tracing_subscriber::fmt::fmt()
            .with_max_level(tracing::Level::DEBUG)
            .with_test_writer();
        subscriber.init();

        let serde_latest = types::CrateMetadata {
            name: String::from("serde"),
            version: String::from("latest"),
        };
        let outcome = pull_crate_from_docs_rs(&serde_latest).await;
        assert!(outcome.is_ok());
    }
}
402
403pub async fn pull_crate_from_remote_index(
404    krate_metadata: &types::CrateMetadata,
405) -> Result<types::Crate> {
406    info!("checking remote index for crate: {}", &krate_metadata.name);
407    let bin_url = format!(
408        "https://raw.githubusercontent.com/alpaylan/ruggle-index/main/crate/{}.bin",
409        krate_metadata.name
410    );
411    let json_url = format!(
412        "https://raw.githubusercontent.com/alpaylan/ruggle-index/main/crate/{}.json",
413        // "https://docs.rs/crate/{}/{}/json",
414        krate_metadata.name,
415        // krate_metadata.version // FIXME: Version-specific crates are not supported in the remote index yet
416    );
417
418    let client = reqwest::Client::new();
419
420    // Try to fetch .bin first
421    debug!(".bin url for {}: {}", krate_metadata, bin_url);
422    let response = client.get(&bin_url).send().await?;
423    if response.status().is_success() {
424        let bytes = response.bytes().await?;
425        if let Ok((krate, _)) =
426            bincode::decode_from_slice::<types::Crate, _>(&bytes, bincode::config::standard())
427        {
428            info!("fetched crate {} from remote index (.bin)", krate_metadata);
429            return Ok(krate);
430        }
431    }
432    tracing::debug!(
433        "crate {} not found in remote index (.bin), trying .json",
434        krate_metadata
435    );
436
437    // Fallback to .json
438    debug!(".json url for {}: {}", krate_metadata, json_url);
439    let response = client.get(&json_url).send().await?;
440    if response.status().is_success() {
441        println!("response: {:?}", response);
442        // If it's a
443        let text = response.text().await?;
444        let mut krate: types::Crate = serde_json::from_str(&text)
445            .with_context(|| format!("Failed to serde_json::from_str {}", json_url))?;
446        krate.name = Some(krate_metadata.name.clone());
447        info!(
448            "fetched crate {} from remote index (.json)",
449            krate_metadata.name
450        );
451        return Ok(krate);
452    }
453
454    Err(anyhow::anyhow!(
455        "crate {} not found in remote index",
456        krate_metadata
457    ))
458}
459
460pub async fn pull_set_from_remote_index(set_name: &str) -> Result<Vec<CrateMetadata>> {
461    info!("fetching set {} from remote index", set_name);
462    let json_url = format!(
463        "https://raw.githubusercontent.com/alpaylan/ruggle-index/main/set/{}.json",
464        set_name
465    );
466
467    let client = reqwest::Client::new();
468    let response = client.get(&json_url).send().await?;
469    if response.status().is_success() {
470        let text = response.text().await?;
471        let krates: Vec<CrateMetadata> = serde_json::from_str(&text)
472            .with_context(|| format!("Failed to serde_json::from_str {}", json_url))?;
473        info!("fetched set {} from remote index", set_name);
474        return Ok(krates);
475    }
476
477    Err(anyhow::anyhow!(
478        "set {} not found in remote index",
479        set_name
480    ))
481}
482
483async fn index_krate(krate: &crates_io_api::Crate) -> Result<types::Crate> {
484    let temp = temp_dir();
485    let path = temp.join(format!("{}.tar.gz", krate.name));
486    let url = format!(
487        "https://static.crates.io/crates/{name}/{name}-{version}.crate",
488        name = krate.name,
489        version = krate.max_version,
490    );
491
492    let resp = reqwest::get(url).await?;
493    let mut file = OpenOptions::new()
494        .write(true)
495        .create(true)
496        .open(path)
497        .await
498        .context("Could not create the temp tar.gz file")?;
499
500    copy(&mut resp.bytes().await?.as_ref(), &mut file)
501        .await
502        .context("tokio::io::copy failed")?;
503
504    Command::new("tar")
505        .args(["-xf", &format!("{}.tar.gz", krate.name)])
506        .current_dir(&temp)
507        .status()
508        .await
509        .context("Failed to extract tar.gz file")?;
510
511    let unpacked = temp.join(format!("{}-{}", krate.name, krate.max_version));
512    let cargo = Command::new("cargo")
513        .args(["+nightly", "rustdoc"])
514        .env("RUSTDOCFLAGS", "--output-format=json -Z unstable-options")
515        .current_dir(&unpacked)
516        .status()
517        .await
518        .context("Failed to run cargo rustdoc")?;
519    if !cargo.success() {
520        return Err(anyhow::anyhow!(
521            "cargo rustdoc failed for crate {}",
522            krate.name
523        ));
524    }
525    // check the `target/doc` contents
526    let doc_dir = unpacked.join("target/doc");
527    if !doc_dir.exists() {
528        return Err(anyhow::anyhow!(
529            "doc directory does not exist for crate {}",
530            krate.name
531        ));
532    }
533    let mut doc_dir_reader = fs::read_dir(&doc_dir).await?;
534    let krate_file_path = loop {
535        if let Some(entry) = doc_dir_reader
536            .next_entry()
537            .await
538            .context("Failed to read doc directory")?
539        {
540            let file_name = entry.file_name();
541            let file_name_str = file_name.to_string_lossy();
542            if file_name_str.ends_with(".json") {
543                break entry.path();
544            }
545        } else {
546            return Err(anyhow::anyhow!(
547                "No JSON file found in doc directory for crate {}",
548                krate.name
549            ));
550        }
551    };
552    let mut krate_: types::Crate = serde_json::from_slice(
553        &fs::read(&krate_file_path)
554            .await
555            .context("Failed to read crate JSON file")?,
556    )
557    .with_context(|| format!("Failed to serde_json::from_slice for crate {}", krate.name))?;
558
559    krate_.name = Some(krate.name.clone());
560
561    info!("built crate {} locally", krate.name);
562
563    Ok(krate_)
564}
565
566pub async fn build_crate_locally(metadata: &types::CrateMetadata) -> Result<types::Crate> {
567    let client = AsyncClient::new(
568        "ruggle (akeles@umd.edu)",
569        std::time::Duration::from_millis(1000),
570    )?;
571
572    let krate = client
573        .get_crate(&metadata.name)
574        .await
575        .context(format!("failed to get crate info: {}", &metadata.name))?
576        .crate_data;
577
578    index_krate(&krate).await
579}
580
581pub async fn index_local_crate(
582    index: &mut Index,
583    cargo_manifest_path: &Path,
584) -> Result<Vec<types::Crate>> {
585    let krates_metadata = gather_all_dependencies(cargo_manifest_path)
586        .context("failed to gather all transitive dependencies")?;
587
588    tracing::info!(
589        "gathered {} dependencies from Cargo.toml",
590        krates_metadata.len()
591    );
592    tracing::debug!("dependencies: {:?}", krates_metadata);
593
594    let mut krates: Vec<types::Crate> = Vec::new();
595    for krate_metadata in &krates_metadata {
596        if let Some(krate) = index.crates.get(krate_metadata).cloned() {
597            info!("crate is already indexed: {}", &krate_metadata);
598            krates.push(krate);
599        } else if let Ok(krate) = pull_crate_from_remote_index(krate_metadata).await {
600            krates.push(krate);
601        // FIXME: docs.rs is unreliable sometimes, and we also need to differentiate crates that have a different local version
602        // } else if let Ok(krate) = pull_crate_from_docs_rs(krate_metadata).await {
603        //     krates.push(krate);
604        } else if let Ok(krate) = build_crate_locally(krate_metadata).await {
605            krates.push(krate);
606        } else {
607            error!("failed to index crate: {}", &krate_metadata);
608        }
609    }
610
611    Ok(krates)
612}
613
#[cfg(test)]
mod dependency_tests {
    use super::*;
    use std::path::PathBuf;

    /// The sibling `ruggle-engine` manifest must report `ruggle-util` as a dependency.
    #[test]
    fn test_gather_all_dependencies() {
        let mut manifest_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        manifest_path.push("..");
        manifest_path.push("ruggle-engine");
        manifest_path.push("Cargo.toml");

        let deps = gather_all_dependencies(&manifest_path).unwrap();
        println!("dependencies: {:#?}", deps);

        let has_util = deps.iter().any(|dep| dep.name == "ruggle-util");
        assert!(has_util);
    }
}
630
631pub fn gather_all_dependencies(cargo_manifest_path: &Path) -> anyhow::Result<Vec<CrateMetadata>> {
632    let metadata = MetadataCommand::new()
633        .manifest_path(cargo_manifest_path)
634        .exec()?;
635
636    let graph = PackageGraph::from_metadata(metadata)?;
637    let mut packages = Vec::new();
638
639    for member in graph.workspace().iter() {
640        for link in member.direct_links() {
641            let pkg = link.to();
642            packages.push(CrateMetadata {
643                name: pkg.name().to_string(),
644                version: pkg.version().to_string(),
645            });
646        }
647    }
648    Ok(packages)
649}
650
651pub fn gather_all_transitive_dependencies(
652    cargo_manifest_path: &Path,
653) -> anyhow::Result<Vec<CrateMetadata>> {
654    let metadata = MetadataCommand::new()
655        .manifest_path(cargo_manifest_path)
656        .exec()?;
657    let graph = PackageGraph::from_metadata(metadata)?;
658    let packages = graph
659        .packages()
660        .map(|pkg| CrateMetadata {
661            name: pkg.name().to_string(),
662            version: pkg.version().to_string(),
663        })
664        .collect();
665    Ok(packages)
666}