Skip to main content

uv_metadata/
lib.rs

1//! Read metadata from wheels and source distributions.
2//!
3//! This module reads all fields exhaustively. The fields are defined in the [Core metadata
4//! specification](https://packaging.python.org/en/latest/specifications/core-metadata/).
5
6use futures::executor::block_on;
7use futures::io::AllowStdIo;
8use std::io;
9use std::path::Path;
10use thiserror::Error;
11use tokio::io::AsyncReadExt;
12use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt};
13use uv_distribution_filename::WheelFilename;
14use uv_normalize::{DistInfoName, InvalidNameError};
15use uv_pypi_types::ResolutionMetadata;
16
17/// The caller is responsible for attaching the path or url we failed to read.
18#[derive(Debug, Error)]
19pub enum Error {
20    #[error("Failed to read `dist-info` metadata from built wheel")]
21    DistInfo,
22    #[error("No .dist-info directory found")]
23    MissingDistInfo,
24    #[error("Multiple .dist-info directories found: {0}")]
25    MultipleDistInfo(String),
26    #[error(
27        "The .dist-info directory does not consist of the normalized package name and version: `{0}`"
28    )]
29    MissingDistInfoSegments(String),
30    #[error("The .dist-info directory {0} does not start with the normalized package name: {1}")]
31    MissingDistInfoPackageName(String, String),
32    #[error("The .dist-info directory name contains invalid characters")]
33    InvalidName(#[from] InvalidNameError),
34    #[error("The metadata at {0} is invalid")]
35    InvalidMetadata(String, Box<uv_pypi_types::MetadataError>),
36    #[error("Bad CRC (got {computed:08x}, expected {expected:08x}) for file: {path}")]
37    BadCrc32 {
38        path: String,
39        computed: u32,
40        expected: u32,
41    },
42    #[error("Failed to read from zip file")]
43    AsyncZip(#[from] async_zip::error::ZipError),
44    // No `#[from]` to enforce manual review of `io::Error` sources.
45    #[error(transparent)]
46    Io(io::Error),
47}
48
49/// Find the `.dist-info` directory in a zipped wheel.
50///
51/// Returns the dist info dir prefix without the `.dist-info` extension.
52///
53/// Reference implementation: <https://github.com/pypa/pip/blob/36823099a9cdd83261fdbc8c1d2a24fa2eea72ca/src/pip/_internal/utils/wheel.py#L38>
54pub fn find_archive_dist_info<'a, T: Copy>(
55    filename: &WheelFilename,
56    files: impl Iterator<Item = (T, &'a str)>,
57) -> Result<(T, &'a str), Error> {
58    let metadatas: Vec<_> = files
59        .filter_map(|(payload, path)| {
60            let (dist_info_dir, file) = path.split_once('/')?;
61            if file != "METADATA" {
62                return None;
63            }
64            let dist_info_prefix = dist_info_dir.strip_suffix(".dist-info")?;
65            Some((payload, dist_info_prefix))
66        })
67        .collect();
68
69    // Like `pip`, assert that there is exactly one `.dist-info` directory.
70    let (payload, dist_info_prefix) = match metadatas[..] {
71        [] => {
72            return Err(Error::MissingDistInfo);
73        }
74        [(payload, path)] => (payload, path),
75        _ => {
76            return Err(Error::MultipleDistInfo(
77                metadatas
78                    .into_iter()
79                    .map(|(_, dist_info_dir)| dist_info_dir.to_string())
80                    .collect::<Vec<_>>()
81                    .join(", "),
82            ));
83        }
84    };
85
86    // Like `pip`, validate that the `.dist-info` directory is prefixed with the canonical
87    // package name.
88    let normalized_prefix = DistInfoName::new(dist_info_prefix);
89    if !normalized_prefix
90        .as_ref()
91        .starts_with(filename.name.as_str())
92    {
93        return Err(Error::MissingDistInfoPackageName(
94            dist_info_prefix.to_string(),
95            filename.name.to_string(),
96        ));
97    }
98
99    Ok((payload, dist_info_prefix))
100}
101
102/// Returns `true` if the file is a `METADATA` file in a `.dist-info` directory that matches the
103/// wheel filename.
104pub fn is_metadata_entry(path: &str, filename: &WheelFilename) -> Result<bool, Error> {
105    let Some((dist_info_dir, file)) = path.split_once('/') else {
106        return Ok(false);
107    };
108    if file != "METADATA" {
109        return Ok(false);
110    }
111    let Some(dist_info_prefix) = dist_info_dir.strip_suffix(".dist-info") else {
112        return Ok(false);
113    };
114
115    // Like `pip`, validate that the `.dist-info` directory is prefixed with the canonical
116    // package name.
117    let normalized_prefix = DistInfoName::new(dist_info_prefix);
118    if !normalized_prefix
119        .as_ref()
120        .starts_with(filename.name.as_str())
121    {
122        return Err(Error::MissingDistInfoPackageName(
123            dist_info_prefix.to_string(),
124            filename.name.to_string(),
125        ));
126    }
127
128    Ok(true)
129}
130
131/// Given an archive, read the `METADATA` from the `.dist-info` directory.
132pub fn read_archive_metadata(
133    filename: &WheelFilename,
134    reader: impl std::io::BufRead + std::io::Seek + Unpin,
135) -> Result<Vec<u8>, Error> {
136    block_on(async {
137        let mut zip_reader =
138            async_zip::base::read::seek::ZipFileReader::new(AllowStdIo::new(reader)).await?;
139
140        let (metadata_index, _dist_info_prefix) = find_archive_dist_info(
141            filename,
142            zip_reader
143                .file()
144                .entries()
145                .iter()
146                .enumerate()
147                .filter_map(|(index, entry)| Some((index, entry.filename().as_str().ok()?))),
148        )?;
149
150        let mut buffer = Vec::new();
151        zip_reader
152            .reader_with_entry(metadata_index)
153            .await?
154            .read_to_end_checked(&mut buffer)
155            .await?;
156
157        Ok(buffer)
158    })
159}
160
161/// Find the `.dist-info` directory in an unzipped wheel.
162///
163/// See: <https://github.com/PyO3/python-pkginfo-rs>
164pub fn find_flat_dist_info(
165    filename: &WheelFilename,
166    path: impl AsRef<Path>,
167) -> Result<String, Error> {
168    // Iterate over `path` to find the `.dist-info` directory. It should be at the top-level.
169    let Some(dist_info_prefix) = fs_err::read_dir(path.as_ref())
170        .map_err(Error::Io)?
171        .find_map(|entry| {
172            let entry = entry.ok()?;
173            let file_type = entry.file_type().ok()?;
174            if file_type.is_dir() {
175                let path = entry.path();
176
177                let extension = path.extension()?;
178                if extension != "dist-info" {
179                    return None;
180                }
181
182                let dist_info_prefix = path.file_stem()?.to_str()?;
183                Some(dist_info_prefix.to_string())
184            } else {
185                None
186            }
187        })
188    else {
189        return Err(Error::MissingDistInfo);
190    };
191
192    // Like `pip`, validate that the `.dist-info` directory is prefixed with the canonical
193    // package name.
194    let normalized_prefix = DistInfoName::new(&dist_info_prefix);
195    if !normalized_prefix
196        .as_ref()
197        .starts_with(filename.name.as_str())
198    {
199        return Err(Error::MissingDistInfoPackageName(
200            dist_info_prefix,
201            filename.name.to_string(),
202        ));
203    }
204
205    Ok(dist_info_prefix)
206}
207
208/// Read the wheel `METADATA` metadata from a `.dist-info` directory.
209pub fn read_dist_info_metadata(
210    dist_info_prefix: &str,
211    wheel: impl AsRef<Path>,
212) -> Result<Vec<u8>, Error> {
213    let metadata_file = wheel
214        .as_ref()
215        .join(format!("{dist_info_prefix}.dist-info/METADATA"));
216    fs_err::read(metadata_file).map_err(Error::Io)
217}
218
219/// Read a wheel's `METADATA` file from a zip file.
220pub async fn read_metadata_async_seek(
221    filename: &WheelFilename,
222    reader: impl tokio::io::AsyncRead + tokio::io::AsyncSeek + Unpin,
223) -> Result<Vec<u8>, Error> {
224    let reader = futures::io::BufReader::new(reader.compat());
225    let mut zip_reader = async_zip::base::read::seek::ZipFileReader::new(reader).await?;
226
227    let (metadata_idx, _dist_info_prefix) = find_archive_dist_info(
228        filename,
229        zip_reader
230            .file()
231            .entries()
232            .iter()
233            .enumerate()
234            .filter_map(|(index, entry)| Some((index, entry.filename().as_str().ok()?))),
235    )?;
236
237    // Read the contents of the `METADATA` file.
238    let mut contents = Vec::new();
239    zip_reader
240        .reader_with_entry(metadata_idx)
241        .await?
242        .read_to_end_checked(&mut contents)
243        .await?;
244
245    Ok(contents)
246}
247
248/// Like [`read_metadata_async_seek`], but doesn't use seek.
249pub async fn read_metadata_async_stream<R: futures::AsyncRead + Unpin>(
250    filename: &WheelFilename,
251    debug_path: &str,
252    reader: R,
253) -> Result<ResolutionMetadata, Error> {
254    let reader = futures::io::BufReader::with_capacity(128 * 1024, reader);
255    let mut zip = async_zip::base::read::stream::ZipFileReader::new(reader);
256
257    while let Some(mut entry) = zip.next_with_entry().await? {
258        // Find the `METADATA` entry.
259        let path = entry.reader().entry().filename().as_str()?.to_owned();
260
261        if is_metadata_entry(&path, filename)? {
262            let mut reader = entry.reader_mut().compat();
263            let mut contents = Vec::new();
264            reader.read_to_end(&mut contents).await.map_err(Error::Io)?;
265
266            // Validate the CRC of any file we unpack
267            // (It would be nice if async_zip made it harder to Not do this...)
268            let reader = reader.into_inner();
269            let computed = reader.compute_hash();
270            let expected = reader.entry().crc32();
271            if computed != expected {
272                let error = Error::BadCrc32 {
273                    path,
274                    computed,
275                    expected,
276                };
277                // There are some cases where we fail to get a proper CRC.
278                // This is probably connected to out-of-line data descriptors
279                // which are problematic to access in a streaming context.
280                // In those cases the CRC seems to reliably be stubbed inline as 0,
281                // so we downgrade this to a (hidden-by-default) warning.
282                if expected == 0 {
283                    tracing::warn!("presumed missing CRC: {error}");
284                } else {
285                    return Err(error);
286                }
287            }
288
289            let metadata = ResolutionMetadata::parse_metadata(&contents)
290                .map_err(|err| Error::InvalidMetadata(debug_path.to_string(), Box::new(err)))?;
291            return Ok(metadata);
292        }
293
294        // Close current file to get access to the next one. See docs:
295        // https://docs.rs/async_zip/0.0.16/async_zip/base/read/stream/
296        (.., zip) = entry.skip().await?;
297    }
298
299    Err(Error::MissingDistInfo)
300}
301
302/// Read the [`ResolutionMetadata`] from an unzipped wheel.
303pub fn read_flat_wheel_metadata(
304    filename: &WheelFilename,
305    wheel: impl AsRef<Path>,
306) -> Result<ResolutionMetadata, Error> {
307    let dist_info_prefix = find_flat_dist_info(filename, &wheel)?;
308    let metadata = read_dist_info_metadata(&dist_info_prefix, &wheel)?;
309    ResolutionMetadata::parse_metadata(&metadata).map_err(|err| {
310        Error::InvalidMetadata(
311            format!("{dist_info_prefix}.dist-info/METADATA"),
312            Box::new(err),
313        )
314    })
315}
316
#[cfg(test)]
mod test {
    use super::find_archive_dist_info;
    use std::str::FromStr;
    use uv_distribution_filename::WheelFilename;

    /// A dot inside the distribution name must not prevent the `.dist-info` directory from
    /// being matched against the wheel filename.
    #[test]
    fn test_dot_in_name() {
        let wheel = WheelFilename::from_str("Mastodon.py-1.5.1-py2.py3-none-any.whl").unwrap();
        let archive_paths = [
            "mastodon/Mastodon.py",
            "mastodon/__init__.py",
            "mastodon/streaming.py",
            "Mastodon.py-1.5.1.dist-info/DESCRIPTION.rst",
            "Mastodon.py-1.5.1.dist-info/metadata.json",
            "Mastodon.py-1.5.1.dist-info/top_level.txt",
            "Mastodon.py-1.5.1.dist-info/WHEEL",
            "Mastodon.py-1.5.1.dist-info/METADATA",
            "Mastodon.py-1.5.1.dist-info/RECORD",
        ];

        // The payload is irrelevant here, so each path doubles as its own payload.
        let entries = archive_paths.iter().map(|&path| (path, path));
        let (_, dist_info_prefix) = find_archive_dist_info(&wheel, entries).unwrap();

        assert_eq!(dist_info_prefix, "Mastodon.py-1.5.1");
    }
}