uv_metadata/
lib.rs

1//! Read metadata from wheels and source distributions.
2//!
3//! This module reads all fields exhaustively. The fields are defined in the [Core metadata
4//! specification](https://packaging.python.org/en/latest/specifications/core-metadata/).
5
6use std::io;
7use std::io::{Read, Seek};
8use std::path::Path;
9use thiserror::Error;
10use tokio::io::AsyncReadExt;
11use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt};
12use uv_distribution_filename::WheelFilename;
13use uv_normalize::{DistInfoName, InvalidNameError};
14use uv_pypi_types::ResolutionMetadata;
15use zip::ZipArchive;
16
17/// The caller is responsible for attaching the path or url we failed to read.
18#[derive(Debug, Error)]
19pub enum Error {
20    #[error("Failed to read `dist-info` metadata from built wheel")]
21    DistInfo,
22    #[error("No .dist-info directory found")]
23    MissingDistInfo,
24    #[error("Multiple .dist-info directories found: {0}")]
25    MultipleDistInfo(String),
26    #[error(
27        "The .dist-info directory does not consist of the normalized package name and version: `{0}`"
28    )]
29    MissingDistInfoSegments(String),
30    #[error("The .dist-info directory {0} does not start with the normalized package name: {1}")]
31    MissingDistInfoPackageName(String, String),
32    #[error("The .dist-info directory name contains invalid characters")]
33    InvalidName(#[from] InvalidNameError),
34    #[error("The metadata at {0} is invalid")]
35    InvalidMetadata(String, Box<uv_pypi_types::MetadataError>),
36    #[error("Bad CRC (got {computed:08x}, expected {expected:08x}) for file: {path}")]
37    BadCrc32 {
38        path: String,
39        computed: u32,
40        expected: u32,
41    },
42    #[error("Failed to read from zip file")]
43    Zip(#[from] zip::result::ZipError),
44    #[error("Failed to read from zip file")]
45    AsyncZip(#[from] async_zip::error::ZipError),
46    // No `#[from]` to enforce manual review of `io::Error` sources.
47    #[error(transparent)]
48    Io(io::Error),
49}
50
51/// Find the `.dist-info` directory in a zipped wheel.
52///
53/// Returns the dist info dir prefix without the `.dist-info` extension.
54///
55/// Reference implementation: <https://github.com/pypa/pip/blob/36823099a9cdd83261fdbc8c1d2a24fa2eea72ca/src/pip/_internal/utils/wheel.py#L38>
56pub fn find_archive_dist_info<'a, T: Copy>(
57    filename: &WheelFilename,
58    files: impl Iterator<Item = (T, &'a str)>,
59) -> Result<(T, &'a str), Error> {
60    let metadatas: Vec<_> = files
61        .filter_map(|(payload, path)| {
62            let (dist_info_dir, file) = path.split_once('/')?;
63            if file != "METADATA" {
64                return None;
65            }
66            let dist_info_prefix = dist_info_dir.strip_suffix(".dist-info")?;
67            Some((payload, dist_info_prefix))
68        })
69        .collect();
70
71    // Like `pip`, assert that there is exactly one `.dist-info` directory.
72    let (payload, dist_info_prefix) = match metadatas[..] {
73        [] => {
74            return Err(Error::MissingDistInfo);
75        }
76        [(payload, path)] => (payload, path),
77        _ => {
78            return Err(Error::MultipleDistInfo(
79                metadatas
80                    .into_iter()
81                    .map(|(_, dist_info_dir)| dist_info_dir.to_string())
82                    .collect::<Vec<_>>()
83                    .join(", "),
84            ));
85        }
86    };
87
88    // Like `pip`, validate that the `.dist-info` directory is prefixed with the canonical
89    // package name.
90    let normalized_prefix = DistInfoName::new(dist_info_prefix);
91    if !normalized_prefix
92        .as_ref()
93        .starts_with(filename.name.as_str())
94    {
95        return Err(Error::MissingDistInfoPackageName(
96            dist_info_prefix.to_string(),
97            filename.name.to_string(),
98        ));
99    }
100
101    Ok((payload, dist_info_prefix))
102}
103
104/// Returns `true` if the file is a `METADATA` file in a `.dist-info` directory that matches the
105/// wheel filename.
106pub fn is_metadata_entry(path: &str, filename: &WheelFilename) -> Result<bool, Error> {
107    let Some((dist_info_dir, file)) = path.split_once('/') else {
108        return Ok(false);
109    };
110    if file != "METADATA" {
111        return Ok(false);
112    }
113    let Some(dist_info_prefix) = dist_info_dir.strip_suffix(".dist-info") else {
114        return Ok(false);
115    };
116
117    // Like `pip`, validate that the `.dist-info` directory is prefixed with the canonical
118    // package name.
119    let normalized_prefix = DistInfoName::new(dist_info_prefix);
120    if !normalized_prefix
121        .as_ref()
122        .starts_with(filename.name.as_str())
123    {
124        return Err(Error::MissingDistInfoPackageName(
125            dist_info_prefix.to_string(),
126            filename.name.to_string(),
127        ));
128    }
129
130    Ok(true)
131}
132
133/// Given an archive, read the `METADATA` from the `.dist-info` directory.
134pub fn read_archive_metadata(
135    filename: &WheelFilename,
136    archive: &mut ZipArchive<impl Read + Seek + Sized>,
137) -> Result<Vec<u8>, Error> {
138    let dist_info_prefix =
139        find_archive_dist_info(filename, archive.file_names().map(|name| (name, name)))?.1;
140
141    let mut file = archive.by_name(&format!("{dist_info_prefix}.dist-info/METADATA"))?;
142
143    #[allow(clippy::cast_possible_truncation)]
144    let mut buffer = Vec::with_capacity(file.size() as usize);
145    file.read_to_end(&mut buffer).map_err(Error::Io)?;
146
147    Ok(buffer)
148}
149
150/// Find the `.dist-info` directory in an unzipped wheel.
151///
152/// See: <https://github.com/PyO3/python-pkginfo-rs>
153pub fn find_flat_dist_info(
154    filename: &WheelFilename,
155    path: impl AsRef<Path>,
156) -> Result<String, Error> {
157    // Iterate over `path` to find the `.dist-info` directory. It should be at the top-level.
158    let Some(dist_info_prefix) = fs_err::read_dir(path.as_ref())
159        .map_err(Error::Io)?
160        .find_map(|entry| {
161            let entry = entry.ok()?;
162            let file_type = entry.file_type().ok()?;
163            if file_type.is_dir() {
164                let path = entry.path();
165
166                let extension = path.extension()?;
167                if extension != "dist-info" {
168                    return None;
169                }
170
171                let dist_info_prefix = path.file_stem()?.to_str()?;
172                Some(dist_info_prefix.to_string())
173            } else {
174                None
175            }
176        })
177    else {
178        return Err(Error::MissingDistInfo);
179    };
180
181    // Like `pip`, validate that the `.dist-info` directory is prefixed with the canonical
182    // package name.
183    let normalized_prefix = DistInfoName::new(&dist_info_prefix);
184    if !normalized_prefix
185        .as_ref()
186        .starts_with(filename.name.as_str())
187    {
188        return Err(Error::MissingDistInfoPackageName(
189            dist_info_prefix,
190            filename.name.to_string(),
191        ));
192    }
193
194    Ok(dist_info_prefix)
195}
196
197/// Read the wheel `METADATA` metadata from a `.dist-info` directory.
198pub fn read_dist_info_metadata(
199    dist_info_prefix: &str,
200    wheel: impl AsRef<Path>,
201) -> Result<Vec<u8>, Error> {
202    let metadata_file = wheel
203        .as_ref()
204        .join(format!("{dist_info_prefix}.dist-info/METADATA"));
205    fs_err::read(metadata_file).map_err(Error::Io)
206}
207
208/// Read a wheel's `METADATA` file from a zip file.
209pub async fn read_metadata_async_seek(
210    filename: &WheelFilename,
211    reader: impl tokio::io::AsyncRead + tokio::io::AsyncSeek + Unpin,
212) -> Result<Vec<u8>, Error> {
213    let reader = futures::io::BufReader::new(reader.compat());
214    let mut zip_reader = async_zip::base::read::seek::ZipFileReader::new(reader).await?;
215
216    let (metadata_idx, _dist_info_prefix) = find_archive_dist_info(
217        filename,
218        zip_reader
219            .file()
220            .entries()
221            .iter()
222            .enumerate()
223            .filter_map(|(index, entry)| Some((index, entry.filename().as_str().ok()?))),
224    )?;
225
226    // Read the contents of the `METADATA` file.
227    let mut contents = Vec::new();
228    zip_reader
229        .reader_with_entry(metadata_idx)
230        .await?
231        .read_to_end_checked(&mut contents)
232        .await?;
233
234    Ok(contents)
235}
236
237/// Like [`read_metadata_async_seek`], but doesn't use seek.
238pub async fn read_metadata_async_stream<R: futures::AsyncRead + Unpin>(
239    filename: &WheelFilename,
240    debug_path: &str,
241    reader: R,
242) -> Result<ResolutionMetadata, Error> {
243    let reader = futures::io::BufReader::with_capacity(128 * 1024, reader);
244    let mut zip = async_zip::base::read::stream::ZipFileReader::new(reader);
245
246    while let Some(mut entry) = zip.next_with_entry().await? {
247        // Find the `METADATA` entry.
248        let path = entry.reader().entry().filename().as_str()?.to_owned();
249
250        if is_metadata_entry(&path, filename)? {
251            let mut reader = entry.reader_mut().compat();
252            let mut contents = Vec::new();
253            reader.read_to_end(&mut contents).await.unwrap();
254
255            // Validate the CRC of any file we unpack
256            // (It would be nice if async_zip made it harder to Not do this...)
257            let reader = reader.into_inner();
258            let computed = reader.compute_hash();
259            let expected = reader.entry().crc32();
260            if computed != expected {
261                let error = Error::BadCrc32 {
262                    path,
263                    computed,
264                    expected,
265                };
266                // There are some cases where we fail to get a proper CRC.
267                // This is probably connected to out-of-line data descriptors
268                // which are problematic to access in a streaming context.
269                // In those cases the CRC seems to reliably be stubbed inline as 0,
270                // so we downgrade this to a (hidden-by-default) warning.
271                if expected == 0 {
272                    tracing::warn!("presumed missing CRC: {error}");
273                } else {
274                    return Err(error);
275                }
276            }
277
278            let metadata = ResolutionMetadata::parse_metadata(&contents)
279                .map_err(|err| Error::InvalidMetadata(debug_path.to_string(), Box::new(err)))?;
280            return Ok(metadata);
281        }
282
283        // Close current file to get access to the next one. See docs:
284        // https://docs.rs/async_zip/0.0.16/async_zip/base/read/stream/
285        (.., zip) = entry.skip().await?;
286    }
287
288    Err(Error::MissingDistInfo)
289}
290
291/// Read the [`ResolutionMetadata`] from an unzipped wheel.
292pub fn read_flat_wheel_metadata(
293    filename: &WheelFilename,
294    wheel: impl AsRef<Path>,
295) -> Result<ResolutionMetadata, Error> {
296    let dist_info_prefix = find_flat_dist_info(filename, &wheel)?;
297    let metadata = read_dist_info_metadata(&dist_info_prefix, &wheel)?;
298    ResolutionMetadata::parse_metadata(&metadata).map_err(|err| {
299        Error::InvalidMetadata(
300            format!("{dist_info_prefix}.dist-info/METADATA"),
301            Box::new(err),
302        )
303    })
304}
305
#[cfg(test)]
mod test {
    use super::find_archive_dist_info;
    use std::str::FromStr;
    use uv_distribution_filename::WheelFilename;

    /// A package name containing a dot must still be matched against its `.dist-info` directory,
    /// and the prefix must be returned exactly as it is spelled in the archive.
    #[test]
    fn test_dot_in_name() {
        let wheel = WheelFilename::from_str("Mastodon.py-1.5.1-py2.py3-none-any.whl").unwrap();
        let archive_entries = [
            "mastodon/Mastodon.py",
            "mastodon/__init__.py",
            "mastodon/streaming.py",
            "Mastodon.py-1.5.1.dist-info/DESCRIPTION.rst",
            "Mastodon.py-1.5.1.dist-info/metadata.json",
            "Mastodon.py-1.5.1.dist-info/top_level.txt",
            "Mastodon.py-1.5.1.dist-info/WHEEL",
            "Mastodon.py-1.5.1.dist-info/METADATA",
            "Mastodon.py-1.5.1.dist-info/RECORD",
        ];

        // Use the entry name itself as the payload.
        let named_entries = archive_entries.iter().copied().map(|name| (name, name));
        let (_, prefix) = find_archive_dist_info(&wheel, named_entries).unwrap();

        assert_eq!(prefix, "Mastodon.py-1.5.1");
    }
}