Skip to main content

uv_metadata/
lib.rs

1//! Read metadata from wheels and source distributions.
2//!
3//! This module reads all fields exhaustively. The fields are defined in the [Core metadata
4//! specification](https://packaging.python.org/en/latest/specifications/core-metadata/).
5
6use futures::executor::block_on;
7use futures::io::AllowStdIo;
8use std::io;
9use std::path::Path;
10use thiserror::Error;
11use tokio::io::AsyncReadExt;
12use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt};
13use uv_distribution_filename::WheelFilename;
14use uv_normalize::{DistInfoName, InvalidNameError};
15use uv_pypi_types::ResolutionMetadata;
16
/// Errors produced while locating, reading, or parsing wheel metadata.
///
/// The caller is responsible for attaching the path or url we failed to read.
#[derive(Debug, Error)]
pub enum Error {
    /// Reading the `dist-info` metadata of a built wheel failed.
    ///
    /// NOTE(review): not constructed in the visible part of this file; presumably used by callers.
    #[error("Failed to read `dist-info` metadata from built wheel")]
    DistInfo,
    /// No `<prefix>.dist-info/METADATA` entry (archive) or `.dist-info` directory (unpacked wheel)
    /// was found.
    #[error("No .dist-info directory found")]
    MissingDistInfo,
    /// More than one `.dist-info` directory contains a `METADATA` file. The payload is the
    /// comma-separated list of the directory prefixes that were found.
    #[error("Multiple .dist-info directories found: {0}")]
    MultipleDistInfo(String),
    /// The `.dist-info` directory name is not of the `{name}-{version}` form.
    ///
    /// NOTE(review): not constructed in the visible part of this file; the payload is presumably
    /// the offending directory name.
    #[error(
        "The .dist-info directory does not consist of the normalized package name and version: `{0}`"
    )]
    MissingDistInfoSegments(String),
    /// The normalized `.dist-info` prefix does not start with the package name from the wheel
    /// filename. The fields are the prefix as found and the expected package name, in that order.
    #[error("The .dist-info directory {0} does not start with the normalized package name: {1}")]
    MissingDistInfoPackageName(String, String),
    /// A name failed validation; converted automatically from [`InvalidNameError`].
    #[error("The .dist-info directory name contains invalid characters")]
    InvalidName(#[from] InvalidNameError),
    /// The `METADATA` contents could not be parsed. The fields are a description of where the
    /// metadata came from and the underlying parse error.
    #[error("The metadata at {0} is invalid")]
    InvalidMetadata(String, Box<uv_pypi_types::MetadataError>),
    /// The CRC-32 computed while reading a zip entry differs from the one recorded for it.
    #[error("Bad CRC (got {computed:08x}, expected {expected:08x}) for file: {path}")]
    BadCrc32 {
        /// Path of the entry inside the archive.
        path: String,
        /// Checksum computed over the bytes that were actually read.
        computed: u32,
        /// Checksum stored in the archive for this entry.
        expected: u32,
    },
    /// An error reported by `async_zip`; converted automatically via `?`.
    #[error("Failed to read from zip file")]
    AsyncZip(#[from] async_zip::error::ZipError),
    /// An I/O error, wrapped explicitly at each call site.
    // No `#[from]` to enforce manual review of `io::Error` sources.
    #[error(transparent)]
    Io(io::Error),
}
48
49/// Find the `.dist-info` directory in a zipped wheel.
50///
51/// Returns the dist info dir prefix without the `.dist-info` extension.
52///
53/// Reference implementation: <https://github.com/pypa/pip/blob/36823099a9cdd83261fdbc8c1d2a24fa2eea72ca/src/pip/_internal/utils/wheel.py#L38>
54pub fn find_archive_dist_info<'a, T: Copy>(
55    filename: &WheelFilename,
56    files: impl Iterator<Item = (T, &'a str)>,
57) -> Result<(T, &'a str), Error> {
58    let metadatas: Vec<_> = files
59        .filter_map(|(payload, path)| {
60            let (dist_info_dir, file) = path.split_once('/')?;
61            if file != "METADATA" {
62                return None;
63            }
64            let dist_info_prefix = dist_info_dir.strip_suffix(".dist-info")?;
65            Some((payload, dist_info_prefix))
66        })
67        .collect();
68
69    // Like `pip`, assert that there is exactly one `.dist-info` directory.
70    let (payload, dist_info_prefix) = match metadatas[..] {
71        [] => {
72            return Err(Error::MissingDistInfo);
73        }
74        [(payload, path)] => (payload, path),
75        _ => {
76            return Err(Error::MultipleDistInfo(
77                metadatas
78                    .into_iter()
79                    .map(|(_, dist_info_dir)| dist_info_dir.to_string())
80                    .collect::<Vec<_>>()
81                    .join(", "),
82            ));
83        }
84    };
85
86    // Like `pip`, validate that the `.dist-info` directory is prefixed with the canonical
87    // package name.
88    let normalized_prefix = DistInfoName::new(dist_info_prefix);
89    if !normalized_prefix
90        .as_ref()
91        .starts_with(filename.name.as_str())
92    {
93        return Err(Error::MissingDistInfoPackageName(
94            dist_info_prefix.to_string(),
95            filename.name.to_string(),
96        ));
97    }
98
99    Ok((payload, dist_info_prefix))
100}
101
102/// Returns `true` if the file is a `METADATA` file in a `.dist-info` directory that matches the
103/// wheel filename.
104fn is_metadata_entry(path: &str, filename: &WheelFilename) -> Result<bool, Error> {
105    let Some((dist_info_dir, file)) = path.split_once('/') else {
106        return Ok(false);
107    };
108    if file != "METADATA" {
109        return Ok(false);
110    }
111    let Some(dist_info_prefix) = dist_info_dir.strip_suffix(".dist-info") else {
112        return Ok(false);
113    };
114
115    // Like `pip`, validate that the `.dist-info` directory is prefixed with the canonical
116    // package name.
117    let normalized_prefix = DistInfoName::new(dist_info_prefix);
118    if !normalized_prefix
119        .as_ref()
120        .starts_with(filename.name.as_str())
121    {
122        return Err(Error::MissingDistInfoPackageName(
123            dist_info_prefix.to_string(),
124            filename.name.to_string(),
125        ));
126    }
127
128    Ok(true)
129}
130
131/// Given an archive, read the `METADATA` from the `.dist-info` directory.
132pub fn read_archive_metadata(
133    filename: &WheelFilename,
134    reader: impl std::io::BufRead + std::io::Seek + Unpin,
135) -> Result<Vec<u8>, Error> {
136    block_on(async {
137        let mut zip_reader =
138            async_zip::base::read::seek::ZipFileReader::new(AllowStdIo::new(reader)).await?;
139
140        let (metadata_index, _dist_info_prefix) = find_archive_dist_info(
141            filename,
142            zip_reader
143                .file()
144                .entries()
145                .iter()
146                .enumerate()
147                .filter_map(|(index, entry)| Some((index, entry.filename().as_str().ok()?))),
148        )?;
149
150        let mut buffer = Vec::new();
151        zip_reader
152            .reader_with_entry(metadata_index)
153            .await?
154            .read_to_end_checked(&mut buffer)
155            .await?;
156
157        Ok(buffer)
158    })
159}
160
161/// Find the `.dist-info` directory in an unzipped wheel.
162///
163/// See: <https://github.com/PyO3/python-pkginfo-rs>
164fn find_flat_dist_info(filename: &WheelFilename, path: impl AsRef<Path>) -> Result<String, Error> {
165    // Iterate over `path` to find the `.dist-info` directory. It should be at the top-level.
166    let Some(dist_info_prefix) = fs_err::read_dir(path.as_ref())
167        .map_err(Error::Io)?
168        .find_map(|entry| {
169            let entry = entry.ok()?;
170            let file_type = entry.file_type().ok()?;
171            if file_type.is_dir() {
172                let path = entry.path();
173
174                let extension = path.extension()?;
175                if extension != "dist-info" {
176                    return None;
177                }
178
179                let dist_info_prefix = path.file_stem()?.to_str()?;
180                Some(dist_info_prefix.to_string())
181            } else {
182                None
183            }
184        })
185    else {
186        return Err(Error::MissingDistInfo);
187    };
188
189    // Like `pip`, validate that the `.dist-info` directory is prefixed with the canonical
190    // package name.
191    let normalized_prefix = DistInfoName::new(&dist_info_prefix);
192    if !normalized_prefix
193        .as_ref()
194        .starts_with(filename.name.as_str())
195    {
196        return Err(Error::MissingDistInfoPackageName(
197            dist_info_prefix,
198            filename.name.to_string(),
199        ));
200    }
201
202    Ok(dist_info_prefix)
203}
204
205/// Read the wheel `METADATA` metadata from a `.dist-info` directory.
206fn read_dist_info_metadata(
207    dist_info_prefix: &str,
208    wheel: impl AsRef<Path>,
209) -> Result<Vec<u8>, Error> {
210    let metadata_file = wheel
211        .as_ref()
212        .join(format!("{dist_info_prefix}.dist-info/METADATA"));
213    fs_err::read(metadata_file).map_err(Error::Io)
214}
215
216/// Read a wheel's `METADATA` file from a zip file.
217pub async fn read_metadata_async_seek(
218    filename: &WheelFilename,
219    reader: impl tokio::io::AsyncRead + tokio::io::AsyncSeek + Unpin,
220) -> Result<Vec<u8>, Error> {
221    let reader = futures::io::BufReader::new(reader.compat());
222    let mut zip_reader = async_zip::base::read::seek::ZipFileReader::new(reader).await?;
223
224    let (metadata_idx, _dist_info_prefix) = find_archive_dist_info(
225        filename,
226        zip_reader
227            .file()
228            .entries()
229            .iter()
230            .enumerate()
231            .filter_map(|(index, entry)| Some((index, entry.filename().as_str().ok()?))),
232    )?;
233
234    // Read the contents of the `METADATA` file.
235    let mut contents = Vec::new();
236    zip_reader
237        .reader_with_entry(metadata_idx)
238        .await?
239        .read_to_end_checked(&mut contents)
240        .await?;
241
242    Ok(contents)
243}
244
245/// Like [`read_metadata_async_seek`], but doesn't use seek.
246pub async fn read_metadata_async_stream<R: futures::AsyncRead + Unpin>(
247    filename: &WheelFilename,
248    debug_path: &str,
249    reader: R,
250) -> Result<ResolutionMetadata, Error> {
251    let reader = futures::io::BufReader::with_capacity(128 * 1024, reader);
252    let mut zip = async_zip::base::read::stream::ZipFileReader::new(reader);
253
254    while let Some(mut entry) = zip.next_with_entry().await? {
255        // Find the `METADATA` entry.
256        let path = entry.reader().entry().filename().as_str()?.to_owned();
257
258        if is_metadata_entry(&path, filename)? {
259            let mut reader = entry.reader_mut().compat();
260            let mut contents = Vec::new();
261            reader.read_to_end(&mut contents).await.map_err(Error::Io)?;
262
263            // Validate the CRC of any file we unpack
264            // (It would be nice if async_zip made it harder to Not do this...)
265            let reader = reader.into_inner();
266            let computed = reader.compute_hash();
267            let expected = reader.entry().crc32();
268            if computed != expected {
269                let error = Error::BadCrc32 {
270                    path,
271                    computed,
272                    expected,
273                };
274                // There are some cases where we fail to get a proper CRC.
275                // This is probably connected to out-of-line data descriptors
276                // which are problematic to access in a streaming context.
277                // In those cases the CRC seems to reliably be stubbed inline as 0,
278                // so we downgrade this to a (hidden-by-default) warning.
279                if expected == 0 {
280                    tracing::warn!("presumed missing CRC: {error}");
281                } else {
282                    return Err(error);
283                }
284            }
285
286            let metadata = ResolutionMetadata::parse_metadata(&contents)
287                .map_err(|err| Error::InvalidMetadata(debug_path.to_string(), Box::new(err)))?;
288            return Ok(metadata);
289        }
290
291        // Close current file to get access to the next one. See docs:
292        // https://docs.rs/async_zip/0.0.16/async_zip/base/read/stream/
293        (.., zip) = entry.skip().await?;
294    }
295
296    Err(Error::MissingDistInfo)
297}
298
299/// Read the [`ResolutionMetadata`] from an unzipped wheel.
300pub fn read_flat_wheel_metadata(
301    filename: &WheelFilename,
302    wheel: impl AsRef<Path>,
303) -> Result<ResolutionMetadata, Error> {
304    let dist_info_prefix = find_flat_dist_info(filename, &wheel)?;
305    let metadata = read_dist_info_metadata(&dist_info_prefix, &wheel)?;
306    ResolutionMetadata::parse_metadata(&metadata).map_err(|err| {
307        Error::InvalidMetadata(
308            format!("{dist_info_prefix}.dist-info/METADATA"),
309            Box::new(err),
310        )
311    })
312}
313
#[cfg(test)]
mod test {
    use super::find_archive_dist_info;
    use std::str::FromStr;
    use uv_distribution_filename::WheelFilename;

    /// A distribution whose name contains a dot (`Mastodon.py`) must still resolve to its
    /// `.dist-info` prefix.
    #[test]
    fn test_dot_in_name() {
        let wheel = WheelFilename::from_str("Mastodon.py-1.5.1-py2.py3-none-any.whl").unwrap();
        let archive_paths = [
            "mastodon/Mastodon.py",
            "mastodon/__init__.py",
            "mastodon/streaming.py",
            "Mastodon.py-1.5.1.dist-info/DESCRIPTION.rst",
            "Mastodon.py-1.5.1.dist-info/metadata.json",
            "Mastodon.py-1.5.1.dist-info/top_level.txt",
            "Mastodon.py-1.5.1.dist-info/WHEEL",
            "Mastodon.py-1.5.1.dist-info/METADATA",
            "Mastodon.py-1.5.1.dist-info/RECORD",
        ];

        // Each path doubles as its own payload.
        let entries = archive_paths.iter().map(|&path| (path, path));
        let (_, dist_info_prefix) = find_archive_dist_info(&wheel, entries).unwrap();

        assert_eq!(dist_info_prefix, "Mastodon.py-1.5.1");
    }
}