rpm_version/sortkey.rs
1//! Encodes an EVR (epoch, version, release) as a memcmp-sortable binary key.
2//! The returned `Vec<u8>` produces correct RPM version ordering when compared
3//! byte-by-byte (memcmp-style), matching the semantics of `rpmvercmp()`.
4//!
5//! Unlike the `Evr` struct which uses a traditional approach to
6//!
7//! The key is structured as: [4-byte epoch] [version sortkey] [release sortkey]
8//!
9//! Epoch is encoded as a 4-byte big-endian unsigned integer. Because memcmp
10//! compares left-to-right, the epoch bytes are always compared first, and a
11//! higher epoch always dominates regardless of version/release — matching RPM
12//! semantics. NULL epochs are coalesced to 0.
13//!
14//! Only ASCII alphanumeric characters participate in comparison - non-ASCII
15//! characters and non-alphanumeric ASCII characters (except ~ and ^) are
16//! treated as segment separators and skipped (matching RPM).
17//!
18//! Each version/release sortkey uses six byte markers, chosen so their numeric
19//! order matches the required RPM sort order:
20//!
21//! \x00 — segment delimiter (terminates alpha and numeric segments)
22//! \x01 — tilde (~), sorts lowest, before end-of-version
23//! \x02 — end-of-version (appended once at the end of the key)
24//! \x03 — caret (^), sorts after end but before real segments
25//! \x04 — alpha segment prefix (followed by the segment text + \x00 delimiter)
26//! \x05 — numeric segment prefix (followed by length byte + digits + \x00)
27//!
28//! This order directly encodes the RPM rule: ~ < end < ^ < alpha < numeric.
29//!
30//! Segment encoding details:
31//!
32//! Alpha segments: \x04 + raw ASCII text + \x00
33//! Lexicographic byte comparison gives correct alphabetical ordering.
34//!
35//! Numeric segments: \x05 + length_byte + stripped_digits + \x00
36//! Leading zeros are stripped. The length byte sorts first, so longer digit
37//! strings (= larger numbers) always sort after shorter ones. Within the
38//! same length, digit characters compare lexicographically, which is correct
39//! for same-length decimal integers.
40//!
41//! Each version/release component ends with \x02 (end-of-version). When two
42//! versions are equal, the comparison naturally falls through the end marker
43//! into the release bytes. This is why no separator is needed between the
44//! version and release sortkeys — the \x02 end marker serves as a delimiter.
45//!
46//! Example: EVR "2:1.0~rc1-3.fc40" encodes as:
47//!
48//! epoch (4-byte big-endian):
49//! \x00 \x00 \x00 \x02 — epoch 2
50//!
51//! version sortkey ("1.0~rc1"):
52//! \x05 \x01 1 \x00 — numeric segment "1"
53//! \x05 \x00 \x00 — numeric segment "0" (leading zeros stripped, length=0)
54//! \x01 — tilde
55//! \x04 rc \x00 — alpha segment "rc"
56//! \x05 \x01 1 \x00 — numeric segment "1"
57//! \x02 — end-of-version
58//!
59//! release sortkey ("3.fc40"):
60//! \x05 \x01 3 \x00 — numeric segment "3"
61//! \x04 fc \x00 — alpha segment "fc"
62//! \x05 \x02 40 \x00 — numeric segment "40"
63//! \x02 — end-of-version
64//!
65//! Comparing "1.0~rc1" vs "1.0": after the shared prefix "1.0", the first key
66//! has \x01 (tilde) while the second has \x02 (end). Since \x01 < \x02,
67//! "1.0~rc1" correctly sorts before "1.0".
68
69use crate::Evr;
70
71/// Encode an EVR (epoch, version, release) as a memcmp-sortable binary key.
72///
73/// The returned `Vec<u8>` produces correct RPM version ordering when compared
74/// byte-by-byte, matching the semantics of `rpmvercmp()`.
75///
76/// This property is extremely useful, because it means that the value can be stored
77/// in a database and used to perform in-database RPM version comparisons, provided
78/// the database is capable of memcmp-style order comparisons across collections
79/// of raw bytes.
80///
81/// The key layout is: `[4-byte big-endian epoch] [version sortkey] [release sortkey]`
82///
83/// Empty or unparseable epochs are treated as 0, matching RPM semantics.
84#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
85pub struct EvrSortKey(Vec<u8>);
86
87impl EvrSortKey {
88 /// Create a sortkey from individual epoch, version, and release strings.
89 pub fn from_values(epoch: &str, version: &str, release: &str) -> Self {
90 let epoch_int: u32 = if epoch.is_empty() {
91 0
92 } else {
93 epoch.parse().unwrap_or(0)
94 };
95
96 let ver_key = version_sortkey(version);
97 let rel_key = version_sortkey(release);
98
99 let mut result = Vec::with_capacity(4 + ver_key.len() + rel_key.len());
100 result.extend_from_slice(&epoch_int.to_be_bytes());
101 result.extend(ver_key);
102 result.extend(rel_key);
103 EvrSortKey(result)
104 }
105
106 /// Parse an EVR string (e.g. `"1:2.3.4-5"`) and create a sortkey.
107 pub fn parse(evr: &str) -> Self {
108 let (epoch, version, release) = Evr::parse_values(evr);
109 Self::from_values(epoch, version, release)
110 }
111
112 /// Return a reference to the underlying bytes.
113 pub fn as_bytes(&self) -> &[u8] {
114 &self.0
115 }
116}
117
118impl From<Vec<u8>> for EvrSortKey {
119 fn from(v: Vec<u8>) -> Self {
120 EvrSortKey(v)
121 }
122}
123
124impl From<EvrSortKey> for Vec<u8> {
125 fn from(k: EvrSortKey) -> Self {
126 k.0
127 }
128}
129
130impl AsRef<[u8]> for EvrSortKey {
131 fn as_ref(&self) -> &[u8] {
132 &self.0
133 }
134}
135
136/// Encode a version or release string as a memcmp-sortable binary key.
137///
138/// The returned `Vec<u8>` can be compared with standard byte comparison
139/// (`memcmp` / `Ord for [u8]`) to produce correct RPM version ordering.
140///
141/// See [`evr_sortkey`] for the full EVR encoding.
142pub fn version_sortkey(ver: &str) -> Vec<u8> {
143 if ver.is_empty() {
144 return vec![0x02];
145 }
146
147 let bytes = ver.as_bytes();
148 let len = bytes.len();
149 let mut result = Vec::with_capacity(len * 2);
150 let mut pos = 0;
151
152 while pos < len {
153 // Skip non-alphanumeric, non-tilde, non-caret (ASCII-only)
154 while pos < len {
155 let b = bytes[pos];
156 if b.is_ascii_alphanumeric() || b == b'~' || b == b'^' {
157 break;
158 }
159 pos += 1;
160 }
161
162 if pos >= len {
163 break;
164 }
165
166 let b = bytes[pos];
167
168 if b == b'~' {
169 result.push(0x01);
170 pos += 1;
171 continue;
172 }
173
174 if b == b'^' {
175 result.push(0x03);
176 pos += 1;
177 continue;
178 }
179
180 if b.is_ascii_digit() {
181 let start = pos;
182 while pos < len && bytes[pos].is_ascii_digit() {
183 pos += 1;
184 }
185 let segment = &bytes[start..pos];
186 let stripped = segment
187 .iter()
188 .position(|&c| c != b'0')
189 .map(|i| &segment[i..])
190 .unwrap_or(b"");
191 result.push(0x05);
192 result.push(stripped.len() as u8);
193 result.extend_from_slice(stripped);
194 result.push(0x00);
195 } else {
196 let start = pos;
197 while pos < len && bytes[pos].is_ascii_alphabetic() {
198 pos += 1;
199 }
200 result.push(0x04);
201 result.extend_from_slice(&bytes[start..pos]);
202 result.push(0x00);
203 }
204 }
205
206 result.push(0x02);
207 result
208}