Skip to main content

rpm_version/
sortkey.rs

1//! Encodes an EVR (epoch, version, release) as a memcmp-sortable binary key.
2//! The returned `Vec<u8>` produces correct RPM version ordering when compared
3//! byte-by-byte (memcmp-style), matching the semantics of `rpmvercmp()`.
4//!
5//! Unlike the `Evr` struct which uses a traditional approach to
6//!
7//! The key is structured as: [4-byte epoch] [version sortkey] [release sortkey]
8//!
9//! Epoch is encoded as a 4-byte big-endian unsigned integer. Because memcmp
10//! compares left-to-right, the epoch bytes are always compared first, and a
11//! higher epoch always dominates regardless of version/release — matching RPM
12//! semantics. NULL epochs are coalesced to 0.
13//!
14//! Only ASCII alphanumeric characters participate in comparison - non-ASCII
15//! characters and non-alphanumeric ASCII characters (except ~ and ^) are
16//! treated as segment separators and skipped (matching RPM).
17//!
18//! Each version/release sortkey uses six byte markers, chosen so their numeric
19//! order matches the required RPM sort order:
20//!
21//!   \x00  — segment delimiter (terminates alpha and numeric segments)
22//!   \x01  — tilde (~), sorts lowest, before end-of-version
23//!   \x02  — end-of-version (appended once at the end of the key)
24//!   \x03  — caret (^), sorts after end but before real segments
25//!   \x04  — alpha segment prefix (followed by the segment text + \x00 delimiter)
26//!   \x05  — numeric segment prefix (followed by length byte + digits + \x00)
27//!
28//! This order directly encodes the RPM rule: ~ < end < ^ < alpha < numeric.
29//!
30//! Segment encoding details:
31//!
32//!   Alpha segments: \x04 + raw ASCII text + \x00
33//!     Lexicographic byte comparison gives correct alphabetical ordering.
34//!
35//!   Numeric segments: \x05 + length_byte + stripped_digits + \x00
36//!     Leading zeros are stripped. The length byte sorts first, so longer digit
37//!     strings (= larger numbers) always sort after shorter ones. Within the
38//!     same length, digit characters compare lexicographically, which is correct
39//!     for same-length decimal integers.
40//!
41//! Each version/release component ends with \x02 (end-of-version). When two
42//! versions are equal, the comparison naturally falls through the end marker
43//! into the release bytes. This is why no separator is needed between the
44//! version and release sortkeys — the \x02 end marker serves as a delimiter.
45//!
46//! Example: EVR "2:1.0~rc1-3.fc40" encodes as:
47//!
48//!   epoch (4-byte big-endian):
49//!     \x00 \x00 \x00 \x02            — epoch 2
50//!
51//!   version sortkey ("1.0~rc1"):
52//!     \x05 \x01 1 \x00               — numeric segment "1"
53//!     \x05 \x00 \x00                  — numeric segment "0" (leading zeros stripped, length=0)
54//!     \x01                            — tilde
55//!     \x04 rc \x00                    — alpha segment "rc"
56//!     \x05 \x01 1 \x00               — numeric segment "1"
57//!     \x02                            — end-of-version
58//!
59//!   release sortkey ("3.fc40"):
60//!     \x05 \x01 3 \x00               — numeric segment "3"
61//!     \x04 fc \x00                    — alpha segment "fc"
62//!     \x05 \x02 40 \x00              — numeric segment "40"
63//!     \x02                            — end-of-version
64//!
65//! Comparing "1.0~rc1" vs "1.0": after the shared prefix "1.0", the first key
66//! has \x01 (tilde) while the second has \x02 (end). Since \x01 < \x02,
67//! "1.0~rc1" correctly sorts before "1.0".
68
69use crate::Evr;
70
71/// Encode an EVR (epoch, version, release) as a memcmp-sortable binary key.
72///
73/// The returned `Vec<u8>` produces correct RPM version ordering when compared
74/// byte-by-byte, matching the semantics of `rpmvercmp()`.
75///
76/// This property is extremely useful, because it means that the value can be stored
77/// in a database and used to perform in-database RPM version comparisons, provided
78/// the database is capable of memcmp-style order comparisons across collections
79/// of raw bytes.
80///
81/// The key layout is: `[4-byte big-endian epoch] [version sortkey] [release sortkey]`
82///
83/// Empty or unparseable epochs are treated as 0, matching RPM semantics.
84#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
85pub struct EvrSortKey(Vec<u8>);
86
87impl EvrSortKey {
88    /// Create a sortkey from individual epoch, version, and release strings.
89    pub fn from_values(epoch: &str, version: &str, release: &str) -> Self {
90        let epoch_int: u32 = if epoch.is_empty() {
91            0
92        } else {
93            epoch.parse().unwrap_or(0)
94        };
95
96        let ver_key = version_sortkey(version);
97        let rel_key = version_sortkey(release);
98
99        let mut result = Vec::with_capacity(4 + ver_key.len() + rel_key.len());
100        result.extend_from_slice(&epoch_int.to_be_bytes());
101        result.extend(ver_key);
102        result.extend(rel_key);
103        EvrSortKey(result)
104    }
105
106    /// Parse an EVR string (e.g. `"1:2.3.4-5"`) and create a sortkey.
107    pub fn parse(evr: &str) -> Self {
108        let (epoch, version, release) = Evr::parse_values(evr);
109        Self::from_values(epoch, version, release)
110    }
111
112    /// Return a reference to the underlying bytes.
113    pub fn as_bytes(&self) -> &[u8] {
114        &self.0
115    }
116}
117
118impl From<Vec<u8>> for EvrSortKey {
119    fn from(v: Vec<u8>) -> Self {
120        EvrSortKey(v)
121    }
122}
123
124impl From<EvrSortKey> for Vec<u8> {
125    fn from(k: EvrSortKey) -> Self {
126        k.0
127    }
128}
129
130impl AsRef<[u8]> for EvrSortKey {
131    fn as_ref(&self) -> &[u8] {
132        &self.0
133    }
134}
135
136/// Encode a version or release string as a memcmp-sortable binary key.
137///
138/// The returned `Vec<u8>` can be compared with standard byte comparison
139/// (`memcmp` / `Ord for [u8]`) to produce correct RPM version ordering.
140///
141/// See [`evr_sortkey`] for the full EVR encoding.
142pub fn version_sortkey(ver: &str) -> Vec<u8> {
143    if ver.is_empty() {
144        return vec![0x02];
145    }
146
147    let bytes = ver.as_bytes();
148    let len = bytes.len();
149    let mut result = Vec::with_capacity(len * 2);
150    let mut pos = 0;
151
152    while pos < len {
153        // Skip non-alphanumeric, non-tilde, non-caret (ASCII-only)
154        while pos < len {
155            let b = bytes[pos];
156            if b.is_ascii_alphanumeric() || b == b'~' || b == b'^' {
157                break;
158            }
159            pos += 1;
160        }
161
162        if pos >= len {
163            break;
164        }
165
166        let b = bytes[pos];
167
168        if b == b'~' {
169            result.push(0x01);
170            pos += 1;
171            continue;
172        }
173
174        if b == b'^' {
175            result.push(0x03);
176            pos += 1;
177            continue;
178        }
179
180        if b.is_ascii_digit() {
181            let start = pos;
182            while pos < len && bytes[pos].is_ascii_digit() {
183                pos += 1;
184            }
185            let segment = &bytes[start..pos];
186            let stripped = segment
187                .iter()
188                .position(|&c| c != b'0')
189                .map(|i| &segment[i..])
190                .unwrap_or(b"");
191            result.push(0x05);
192            result.push(stripped.len() as u8);
193            result.extend_from_slice(stripped);
194            result.push(0x00);
195        } else {
196            let start = pos;
197            while pos < len && bytes[pos].is_ascii_alphabetic() {
198                pos += 1;
199            }
200            result.push(0x04);
201            result.extend_from_slice(&bytes[start..pos]);
202            result.push(0x00);
203        }
204    }
205
206    result.push(0x02);
207    result
208}