licensebat_rust/retriever/
docs_rs.rs

1//! [`Retriever`] that uses the [Docs.rs website].
2//!
3//! Here you can find both the trait and the implementation.
4//!
5//! Usually, [`Collectors`](licensebat_core::Collector) are generic over a [`Retriever`] (or several). This comes in handy for mocking the [`Retriever`] in our tests.
6//!
7//! [`Retriever`]: crate::retriever::docs_rs::Retriever
8//! [Docs.rs website]: https://docs.rs/
9
10use super::utils::crates_io_retrieved_dependency;
11use askalono::{Store, TextData};
12use futures::{future::BoxFuture, Future, FutureExt, TryFutureExt};
13use licensebat_core::{Dependency, RetrievedDependency};
14use reqwest::Client;
15use std::{string::String, sync::Arc};
16use thiserror::Error;
17use tracing::instrument;
18
19/// Trait used by the [`DocsRs`] struct to retrieve dependencies.
20pub trait Retriever: Send + Sync + std::fmt::Debug {
21    /// Future that resolves to a [`RetrievedDependency`].
22    /// It cannot fail.
23    type Response: Future<Output = RetrievedDependency> + Send;
24    /// Validates dependency's information from the original source.
25    fn get_dependency(&self, dependency: Dependency) -> Self::Response;
26}
27
28/// [`docs.rs`] [`Retriever`] implementation.
29///
30/// It uses [`reqwest::Client`] to scrap the [`docs.rs`] website and retrieve the metadata of a dependency.
31///
32/// Note that when a crate is published it takes a while for the [`docs.rs`] website to compile it, so it can take a while to retrieve the metadata of recently uploaded crate.
33///
34/// You can provide yourself an instance of [`reqwest::Client`] by using the [`DocsRs::new`] constructor.
35///
36/// If you use [`DocsRs::default`], it will instantiate a new [`reqwest::Client`] under the hood.
37///
38/// [`docs.rs`]: https://docs.rs
39pub struct DocsRs {
40    client: Client,
41    store: Arc<Option<Store>>,
42}
43
44impl DocsRs {
45    /// Creates a new [`Retriever`].
46    /// If you want to reuse a [`reqwest::Client`] pool consider using the [`DocsRs::new`] method.
47    #[must_use]
48    pub const fn new(client: Client, store: Arc<Option<Store>>) -> Self {
49        Self { client, store }
50    }
51}
52
53impl Default for DocsRs {
54    /// Creates a new [`Retriever`] using the given [`reqwest::Client`].
55    /// If you don't want to pass a [`reqwest::Client`] instance, consider using the [`DocsRs::default`] method.
56    fn default() -> Self {
57        Self::new(Client::new(), Arc::new(None))
58    }
59}
60
61impl Clone for DocsRs {
62    fn clone(&self) -> Self {
63        Self {
64            client: self.client.clone(),
65            store: self.store.clone(),
66        }
67    }
68}
69
70impl std::fmt::Debug for DocsRs {
71    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
72        f.debug_struct("DocsRs")
73            .field("client", &self.client)
74            .field(
75                "store",
76                if self.store.is_some() {
77                    &"Some(Store)"
78                } else {
79                    &"None"
80                },
81            )
82            .finish()
83    }
84}
85
86impl Retriever for DocsRs {
87    type Response = BoxFuture<'static, RetrievedDependency>;
88
89    #[instrument(skip(self), level = "debug")]
90    fn get_dependency(&self, dependency: Dependency) -> Self::Response {
91        let crate_url = docs_rs_url(&dependency.name, &dependency.version);
92        let cargo_toml_url = format!("{crate_url}Cargo.toml");
93
94        let dep_clone = dependency.clone();
95        let client = self.client.clone();
96        let store = self.store.clone();
97
98        async move {
99            let html = client
100                .get(&cargo_toml_url)
101                .header("User-Agent", "licensebat-cli (licensebat.com)")
102                .send()
103                .await?
104                .text()
105                .await?;
106
107            // Pattern to get license information from the Cargo.toml
108            // docs.rs exposes the content of the Cargo.toml as <code></code>.
109            // Unfortunately, the code is mixed with html code that is generated by the docs.rs website.
110            // We will clean it and will get the license information.
111            let license_info = easy_scraper::Pattern::new(
112                r#"<div id="source-code"><pre><code>{{value}}</code></pre></div>"#,
113            )
114            .map(|pattern| pattern.matches(&html))
115            .map(|matches| {
116                matches
117                .into_iter()
118                .map(|m| m.get("value").unwrap().to_string())
119                .collect::<Vec<String>>().join("\n")
120            })
121            .map(|code| {
122                    let text= code
123                    .replace("\n=\n", "=");
124                    // normally, there's only on item but someone could have decided to inform both `license` and `license-file` attributes.
125                    // we will take the first one.
126                    text.lines().find(|l| l.starts_with("license")).map(|l| {
127                        let items = l.split('=').map(|x| x.trim()).collect::<Vec<_>>();
128                        (items[0].to_string(), items[1].replace('\"', ""))
129                    })
130            });
131
132            let retrieved_dependency = match license_info {
133                Ok(license_info) => {
134                    if let Some((key, value)) = license_info {
135                        match key.as_ref() {
136                            "license" => {
137                                 // TODO: SUPPORT FOR MULTIPLE LICS HERE
138                                crates_io_retrieved_dependency(&dependency, Some(vec![value]), None, None, None)
139                            }
140                            "license-file" => {
141                                get_retrieved_dependency_from_license_file(store, crate_url, value, client, &dependency).await
142                            }
143                            // this should never happen!
144                            _ => {
145                                tracing::error!("Unknown license key: {}", key);
146                                crates_io_retrieved_dependency(&dependency, None, Some("Unexpected license key while parsing cargo.toml"), None, None)
147                            }
148                        }
149                    } else {
150                        let user_error = "No information found in Cargo.toml regarding license or license-file.";
151                        tracing::error!(
152                            "{} Crate {} : {}",
153                            user_error,
154                            &dependency.name,
155                            &dependency.version,
156                        );
157                        crates_io_retrieved_dependency(&dependency, None, Some(user_error), None, None)
158                    }
159                }
160                Err(e) => {
161                    tracing::error!(error = ?e, "Error trying to parse docs.rs for crate {} : {}", &dependency.name, &dependency.version);
162                    crates_io_retrieved_dependency(
163                        &dependency,
164                        None,
165                        Some("Error trying to parse docs.rs"), None, None
166                    )
167                }
168            };
169
170            Ok::<_, anyhow::Error>(retrieved_dependency)
171        }.unwrap_or_else(move |e| {
172                let error = e.to_string();
173                crates_io_retrieved_dependency(&dep_clone, None, Some(error.as_str()), None, None)
174            })
175            .boxed()
176    }
177}
178
/// Builds the docs.rs URL under which the sources of the given crate version are browsed.
///
/// The result ends with a trailing `/`, so a file name can be appended directly.
fn docs_rs_url(dependency_name: &str, dependency_version: &str) -> String {
    let mut url = String::from("https://docs.rs/crate/");
    url.push_str(dependency_name);
    url.push('/');
    url.push_str(dependency_version);
    url.push_str("/source/");
    url
}
183
184/// Returns a `RetrievedDependency` by looking into the Docs.rs declared license file.
185/// This function will use `askalono::Store` to determine the kind of license.
186/// Note that in the comments of the `RetrievedDependency` there will be a `Comment` with the % score.
187async fn get_retrieved_dependency_from_license_file(
188    store: Arc<Option<Store>>,
189    crate_url: String,
190    license: String,
191    client: Client,
192    dependency: &Dependency,
193) -> RetrievedDependency {
194    if let Some(store) = store.as_ref() {
195        let license_url = format!("{crate_url}{license}");
196        if let Ok((license, score)) = get_license_from_docs_rs(&client, store, &license_url).await {
197            crates_io_retrieved_dependency(
198                dependency,
199                Some(vec![license.clone()]),
200                None,
201                Some(format!(
202                    "Our score for this license is {:.2}%.",
203                    score * 100.0
204                )),
205                Some(vec![(license, score)]),
206            )
207        } else {
208            crates_io_retrieved_dependency(
209                dependency,
210                None,
211                Some(&format!(
212                    "Not declared in Cargo.toml. Check the url: {license_url}"
213                )),
214                None,
215                None,
216            )
217        }
218    } else {
219        tracing::error!("No askalono store present in Rust docs.rs retriever");
220        crates_io_retrieved_dependency(
221            dependency,
222            None,
223            Some("No askalono store present"),
224            None,
225            None,
226        )
227    }
228}
229
230async fn get_license_from_docs_rs(
231    client: &Client,
232    store: &Store,
233    url: &str,
234) -> Result<(String, f32), anyhow::Error> {
235    let html = client
236        .get(url)
237        .header("User-Agent", "licensebat-cli (licensebat.com)")
238        .send()
239        .await?
240        .text()
241        .await?;
242
243    let pattern = easy_scraper::Pattern::new(
244        r#"<div id="source-code"><pre><code>{{value}}</code></pre></div>"#,
245    )
246    .map_err(Error)?;
247
248    let matches = pattern.matches(&html);
249    if matches.is_empty() {
250        tracing::error!(%url, "Couldn't get original license from docs.rs");
251        Err(Error(String::from("Not found")).into())
252    } else {
253        let license_html = matches[0]["value"].clone();
254        let license = html2text::from_read(license_html.as_bytes(), 3000);
255        let result = store.analyze(&TextData::from(license.as_str()));
256        Ok((result.name.to_string(), result.score))
257    }
258}
259
260#[derive(Error, Debug)]
261#[error("DocRs Error: {0}")]
262struct Error(String);