lucky_commit/lib.rs
1//! The underlying API of lucky_commit is also exposed as a Rust library, in case anyone
2//! wants to use it programmatically. However, note that the library API is considered
3//! unstable, and might have backwards-incompatible changes even in minor or patch
4//! releases of the crate. If you use the library interface, pinning to an exact version
5//! is recommended.
6
7#![deny(missing_docs)]
8
9use std::{
10 cmp::Ord,
11 error::Error,
12 fmt::{Debug, Display, Formatter},
13 iter, mem,
14 ops::Range,
15 str::FromStr,
16 sync::{
17 atomic::{AtomicBool, Ordering},
18 mpsc, Arc,
19 },
20 thread::{self, JoinHandle},
21};
22
23#[cfg(feature = "opencl")]
24use ocl::{
25 builders::DeviceSpecifier::TypeFlags,
26 flags::{DeviceType, MemFlags},
27 prm::Uint16,
28 Buffer, Context, Kernel, Platform, Program, Queue,
29};
30#[cfg(feature = "opencl")]
31use std::convert::TryInto;
32
33/// A worker that, when invoked, will look in a predetermined search space to find a modification
34/// to a specific commit that matches a specific hash spec.
35#[derive(Debug, PartialEq)]
36pub struct HashSearchWorker<H: GitHashFn> {
37 processed_commit: ProcessedCommit,
38 hash_spec: HashSpec<H>,
39 search_space: Range<u64>,
40}
41
42// The fully padded data that gets hashed is the concatenation of all the following:
43// |--- GIT COMMIT HEADER (part of git's raw commit format) ---
44// | * The ASCII string "commit "
45// | * The byte-length of the entire "git commit" section below, represented as base-10 ASCII digits
46// | * A null byte (0x0)
47// |--- GIT COMMIT ---
48// | * The original git commit object that was provided as input, in git's normal commit encoding, up
49// | to the "padding insertion point". (For commits that are not GPG-signed, the padding insertion
50// | point is right near the end of the commit. For commits that are GPG-signed, the padding insertion
51// | point is at the end of the signature, which is right before the commit message.) This section
52// | contains metadata such as the author, timestamp, and parent commit.
53// | * Some number of ASCII space characters, as "static padding", such that the point after the static
54// | padding is at a multiple-of-64-byte offset from the start of the data. Note that in very rare
55// | pathological cases, more than 63 space characters will be needed. This is because adding static
56// | padding also increases the length of the commit object, which is used in the git commit header
57// | above. As a result, adding one additional padding character could increase the alignment by 2,
58// | e.g. if the length increases from 999 to 1000.
59// |
60// | - NOTE: The length of all the data up to this point is a multiple of 64 bytes, and the length
61// | of the all the data after this point is also a multiple of 64 bytes. For reasons that will
62// | be explained in the declaration of `PartiallyHashedCommit`, the 64-byte blocks preceding
63// | this point are called "static blocks", and the 64-byte blocks following this point are called
64// | "dynamic blocks".
65// |
66// | * 48 bytes of "dynamic padding", consisting of some combination of ASCII space and tab characters.
67// | This is the only part of the commit data that actually varies across hash invocations. The ultimate
68// | goal is to find a dynamic padding arrangement that produces the desired hash. A 48-byte length was
69// | chosen with the goal of only needing a single dynamic block for non-GPG-signed commits.
70// | * The rest of the original commit object (from the "padding insertion point" onwards). For
71// | non-GPG-signed commits, this will typically just be a single newline. For GPG-signed commits, this
72// | will contain the commit message.
73// |--- SHA1/SHA256 FINALIZATION PADDING (specified as part of the SHA1/SHA256 algorithm) ---
74// | * The byte 0x80
75// | * Up to 63 null bytes (0x0), such that the point after the null bytes is at an offset of 56 (mod 64) bytes
76// | from the start of the data
77// | * The bit-length of everything before the "finalization padding" section, represented as a big-endian
78// | 64-bit integer
79#[derive(Debug, PartialEq, Clone)]
80struct ProcessedCommit {
81 /// The data, as specified in the comment above. The length will always be a multiple of 64 bytes.
82 data: Box<[u8]>,
83 /// The location of the git commit, as an index range into `data`
84 commit_range: Range<usize>,
85 /// The number of 64-byte static blocks in the data
86 num_static_blocks: usize,
87}
88
89/// A view of an underlying `ProcessedCommit`, with cached hash state.
90///
91/// SHA1 and SHA256 work as follows:
92///
93/// * First, a 20-byte or 32-byte "state vector" is initialized to a constant.
94/// * Next, each each 64-byte block in the input is processed in sequence. The state vector after
95/// processing a block is a convoluted, deteriministic function of (a) the state vector before
96/// processing the block, and (b) the contents of the block. Processing blocks is the main performance
97/// bottleneck of lucky-commit.
98/// * The "hash" of some data is just the contents of the state vector after processing all of
99/// the data (with finalization padding added to the end of the data, as described in the comment
100/// about the `ProcessedCommit` format).
101///
102/// So there's a big optimization we can do here -- we have to compute a bunch of hashes, but the
103/// only part of the data that we're changing between runs is the dynamic padding, which is very close
104/// to the end of the data. The state vector after processing all of the blocks before the dynamic
105/// padding (the "static blocks") doesn't depend at all on the contents of the dynamic padding -- it's
106/// effectively a constant for any given `ProcessedCommit`. The purpose of `PartiallyHashedCommit` is
107/// to cache that state vector, and only reprocess the "dynamic blocks" on each change to the dynamic
108/// padding. This drastically reduces the number of blocks that need to be processed, resulting in a
109/// ~5x end-to-end performance improvement for an average-sized commit.
110#[derive(Debug)]
111struct PartiallyHashedCommit<'a, H: GitHashFn> {
112 intermediate_state: H::State,
113 dynamic_blocks: &'a mut [H::Block],
114}
115
116/// Defines a spec for a desired commit hash.
117#[derive(Debug, PartialEq, Clone)]
118pub struct HashSpec<H: GitHashFn> {
119 /// The data in the desired hash, as split into big-endian four-byte chunks.
120 /// All bits that are unspecified (e.g. bits corresponding to the end of the hash, when only a prefix is being matched)
121 /// are set to zero.
122 data: H::State,
123 /// Mask containing bits set to 1 if the bit at that position is specified, and 0 otherwise.
124 mask: H::State,
125 // For example, the sha1 hash prefix "deadbeef123" corresponds to the
126 // following spec:
127 // HashSpec { data: [0xdeadbeef, 0x12300000, 0, 0, 0], mask: [0xffffffff, 0xfff00000, 0, 0, 0] }
128}
129
130/// An error that results from parsing an invalid HashSpec
131#[non_exhaustive]
132#[derive(PartialEq, Eq)]
133pub enum ParseHashSpecErr {
134 /// The input string is longer than a hash with the specified algorithm
135 TooLong,
136 /// The input string contains characters which are neither hex characters nor '_'.
137 InvalidCharacter(char),
138}
139
140/// A git commit
141#[derive(PartialEq, Eq)]
142pub struct GitCommit<H: GitHashFn> {
143 /// The commit data, represented in git's object format
144 object: Vec<u8>,
145
146 /// The hash of the commit
147 hash: H::State,
148}
149
150/// A hash function used by git. This is a sealed trait implemented by `Sha1` and `Sha256`.
151/// The fields and methods on this trait are subject to change. Consumers should pretend that
152/// the types implementing the trait are opaque.
153pub trait GitHashFn: private::Sealed + Debug + Send + Clone + Eq + 'static {
154 /// The type of the output and intermediate state of this hash function.
155 /// For sha1 and sha256, this is [u32; N] for some N. Ideally this trait would just
156 /// have an associated const for the length of the state vector, and then
157 /// `State` would be defined as `[u32; N]`, but this isn't possible due
158 /// to <https://github.com/rust-lang/rust/issues/60551>.
159 type State: AsRef<[u32]> + AsMut<[u32]> + Clone + Copy + Debug + Default + Eq + Send;
160
161 /// The initial value of the state vector for the given algorithm
162 const INITIAL_STATE: Self::State;
163
164 /// The datatype representing a block for this algorithm. This must be layout-equivalent
165 /// to [u8; 64], although the nominal type that gets used might be different on a
166 /// per-library basis due to const generic limitations.
167 type Block: AsRef<[u8]> + AsMut<[u8]> + Copy + Debug;
168
169 /// Processes a set of blocks using the given algorithm
170 fn compress(state: &mut Self::State, blocks: &[Self::Block]);
171
172 #[cfg(feature = "opencl")]
173 /// Source code of an OpenCL shader kernel finding hash matches for the given
174 /// algorithm. The kernel should have a function `scatter_padding_and_find_match`, which
175 /// accepts the following parameters:
176 /// 1. A pointer to the `data` in the desired hash spec (pointing to the appropriate
177 /// number of bytes for the given hash algorithm)
178 /// 1. A pointer to the `mask` of the desired hash spec
179 /// 1. The "base padding specifier" for the current run, which determines which padding will
180 /// be attempted. The padding specifier used by any given thread is equal to the base
181 /// specifier plus that thread's ID.
182 /// 1. A pointer to the intermediate state after all static blocks have been hashed
183 /// 1. A pointer to the dynamic blocks, encoded as big-endian 32-bit integers
184 /// 1. The number of dynamic blocks that are present
185 /// 1. A writeable pointer where the shader should write a thread ID if it finds an appropriate
186 /// match.
187 const KERNEL: &'static str;
188}
189
190/// The hash type used for Sha1 git repositories (the default at the time of writing)
191/// This type is uninhabited, and is only intended to be used as a type parameter.
192#[derive(Debug, Clone, Eq, PartialEq)]
193pub enum Sha1 {}
194impl GitHashFn for Sha1 {
195 type State = [u32; 5];
196
197 const INITIAL_STATE: Self::State = [0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0];
198 type Block = sha1::digest::core_api::Block<sha1::Sha1Core>;
199
200 fn compress(state: &mut Self::State, blocks: &[Self::Block]) {
201 sha1::compress(state, blocks)
202 }
203
204 #[cfg(feature = "opencl")]
205 const KERNEL: &'static str = include_str!("sha1_matcher.cl");
206}
207
208/// The hash type used for Sha256 git repositories.
209/// This type is uninhabited, and is only intended to be used as a type parameter.
210#[derive(Debug, Clone, Eq, PartialEq)]
211pub enum Sha256 {}
212impl GitHashFn for Sha256 {
213 type State = [u32; 8];
214
215 const INITIAL_STATE: Self::State = [
216 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab,
217 0x5be0cd19,
218 ];
219 type Block = sha2::digest::core_api::Block<sha2::Sha256>;
220
221 fn compress(state: &mut Self::State, blocks: &[Self::Block]) {
222 sha2::compress256(state, blocks)
223 }
224
225 #[cfg(feature = "opencl")]
226 const KERNEL: &'static str = include_str!("sha256_matcher.cl");
227}
228
229mod private {
230 pub trait Sealed {}
231 impl Sealed for super::Sha1 {}
232 impl Sealed for super::Sha256 {}
233}
234
235impl<H: GitHashFn> HashSearchWorker<H> {
236 /// Creates a worker for a specific commit and hash spec, with an initial
237 /// search space of 2**48.
238 pub fn new(current_commit: &[u8], hash_spec: HashSpec<H>) -> Self {
239 Self {
240 processed_commit: ProcessedCommit::new(current_commit),
241 hash_spec,
242 search_space: 0..(1 << 48),
243 }
244 }
245
246 /// Caps a worker's search space to approximately the given size.
247 pub fn with_capped_search_space(mut self, workload: u64) -> Self {
248 self.search_space = self.search_space.start
249 ..Ord::min(self.search_space.end, self.search_space.start + workload);
250 self
251 }
252
253 /// Splits this worker into `divisor` new workers for the same commit and
254 /// desired hash, with the search space split roughly equally.
255 /// A worker's search space is an approximation to help with effecient threading. There is
256 /// no guarantee that the resulting workers have perfectly disjoint search spaces, so in theory
257 /// multiple workers could both find the same hash match despite having "split" the space.
258 fn split_search_space(self, divisor: u64) -> impl Iterator<Item = Self> {
259 let amount_per_worker = (self.search_space.end - self.search_space.start) / divisor;
260 (0..divisor).map(move |index| {
261 let range_start = index * amount_per_worker + self.search_space.start;
262 let range_end = if index < divisor - 1 {
263 range_start + amount_per_worker
264 } else {
265 // In case the work can't be divided perfectly, just give all the slack to the last
266 // worker. Typically, `amount_per_worker` will be many orders of magnitude larger
267 // than `divisor`, so having a few extra units of work is immaterial.
268 self.search_space.end
269 };
270 Self {
271 processed_commit: self.processed_commit.clone(),
272 hash_spec: self.hash_spec.clone(),
273 search_space: range_start..range_end,
274 }
275 })
276 }
277
278 /// Invokes the worker. The worker will return a git commit matching the hash,
279 /// if it finds one. Otherwise, it will return None after exhausing its entire search space.
280 pub fn search(self) -> Option<GitCommit<H>> {
281 #[cfg(feature = "opencl")]
282 if Self::gpus_available() {
283 return self.search_with_gpu().unwrap();
284 }
285
286 self.search_with_cpus()
287 }
288
289 #[cfg(feature = "opencl")]
290 fn gpus_available() -> bool {
291 Platform::list().iter().any(|platform| {
292 platform.name().map_or_else(
293 |_| {
294 eprintln!("Platform {:?} is not okay.", platform);
295 false
296 },
297 |_| {
298 TypeFlags(DeviceType::GPU)
299 .to_device_list(Some(*platform))
300 .map_or_else(
301 |e| {
302 eprintln!(
303 "Failed to get GPU devices for platform {:?}: {}",
304 platform, e
305 );
306 false
307 },
308 |devices| !devices.is_empty(),
309 )
310 },
311 )
312 })
313 }
314
315 #[allow(clippy::needless_collect)]
316 fn search_with_cpus(self) -> Option<GitCommit<H>> {
317 let thread_count = num_cpus::get_physical();
318 let lame_duck_cancel_signal = Arc::new(AtomicBool::new(false));
319 let (shared_sender, receiver) = mpsc::channel();
320
321 let _handles = self
322 .split_search_space(thread_count as u64)
323 .map(|worker| {
324 let result_sender = shared_sender.clone();
325 let worker_cancel_signal = Arc::clone(&lame_duck_cancel_signal);
326
327 thread::spawn(move || {
328 let _ = result_sender
329 .send(worker.search_with_cpu_single_threaded(worker_cancel_signal));
330 })
331 })
332 .collect::<Vec<JoinHandle<()>>>();
333
334 for _ in 0..thread_count {
335 if let Some(result) = receiver.recv().unwrap() {
336 lame_duck_cancel_signal.store(true, Ordering::Relaxed);
337
338 // Lame-duck threads should halt shortly after any thread finds a match. However,
339 // we don't want to actually wait for them to halt when running in production, especially
340 // since the process will usually terminate shortly afterwards anyway. So the waiting
341 // and panic detection is debug/test-only
342 #[cfg(debug_assertions)]
343 _handles
344 .into_iter()
345 .map(JoinHandle::join)
346 .collect::<Result<Vec<_>, _>>()
347 .unwrap();
348
349 return Some(result);
350 }
351 }
352
353 None
354 }
355
356 #[inline(never)]
357 fn search_with_cpu_single_threaded(
358 self,
359 lame_duck_cancel_signal: Arc<AtomicBool>,
360 ) -> Option<GitCommit<H>> {
361 let HashSearchWorker {
362 search_space,
363 hash_spec,
364 mut processed_commit,
365 ..
366 } = self;
367 let mut partially_hashed_commit = processed_commit.as_partially_hashed_commit::<H>();
368
369 let lame_duck_check_interval = Ord::min(search_space.end - search_space.start, 1 << 20);
370 for base_padding_specifier in search_space.step_by(lame_duck_check_interval as usize) {
371 for index_in_interval in 0..lame_duck_check_interval {
372 partially_hashed_commit.scatter_padding(base_padding_specifier + index_in_interval);
373 if hash_spec.matches(&partially_hashed_commit.current_hash()) {
374 return Some(GitCommit::new(processed_commit.commit()));
375 }
376 }
377
378 if lame_duck_cancel_signal.load(Ordering::Relaxed) {
379 break;
380 }
381 }
382
383 None
384 }
385
386 #[cfg(feature = "opencl")]
387 fn search_with_gpu(self) -> ocl::Result<Option<GitCommit<H>>> {
388 let HashSearchWorker {
389 search_space,
390 hash_spec,
391 mut processed_commit,
392 ..
393 } = self;
394 let mut partially_hashed_commit = processed_commit.as_partially_hashed_commit::<H>();
395
396 let num_threads = *[
397 hash_spec.estimated_attempts_needed().saturating_mul(4),
398 search_space.end - search_space.start,
399 1 << 22,
400 ]
401 .iter()
402 .min()
403 .unwrap() as usize;
404
405 assert!(num_threads < u32::MAX as usize);
406
407 let devices = Platform::list()
408 .iter()
409 .find_map(|platform| {
410 platform.name().ok().and_then(|_| {
411 TypeFlags(DeviceType::GPU)
412 .to_device_list(Some(*platform))
413 .ok()
414 .and_then(|devices| devices.get(0).cloned())
415 })
416 })
417 .ok_or_else(|| ocl::Error::from("No GPU devices found."))?;
418 let context = Context::builder().devices(devices).build()?;
419 let queue = Queue::new(&context, devices, None)?;
420
421 let mut successful_match_receiver_host_handle = [u32::MAX];
422 let successful_match_receiver = Buffer::builder()
423 .queue(queue.clone())
424 .len(1)
425 .flags(MemFlags::READ_WRITE)
426 .copy_host_slice(&successful_match_receiver_host_handle)
427 .build()?;
428
429 const BASE_PADDING_SPECIFIER_ARG: &str = "base_padding_specifier";
430 let kernel = Kernel::builder()
431 .name("scatter_padding_and_find_match")
432 .program(
433 &Program::builder()
434 .src(H::KERNEL)
435 .cmplr_opt("-Werror")
436 .build(&context)?,
437 )
438 .arg(
439 &Buffer::builder()
440 .queue(queue.clone())
441 .len(hash_spec.data.as_ref().len())
442 .flags(MemFlags::READ_ONLY)
443 .copy_host_slice(hash_spec.data.as_ref())
444 .build()?,
445 )
446 .arg(
447 &Buffer::builder()
448 .queue(queue.clone())
449 .len(hash_spec.mask.as_ref().len())
450 .flags(MemFlags::READ_ONLY)
451 .copy_host_slice(hash_spec.mask.as_ref())
452 .build()?,
453 )
454 .arg(
455 &Buffer::builder()
456 .queue(queue.clone())
457 .len(partially_hashed_commit.intermediate_state.as_ref().len())
458 .flags(MemFlags::READ_ONLY)
459 .copy_host_slice(partially_hashed_commit.intermediate_state.as_ref())
460 .build()?,
461 )
462 .arg_named(BASE_PADDING_SPECIFIER_ARG, 0u64) // filled in later
463 .arg(
464 &Buffer::builder()
465 .queue(queue.clone())
466 .len(partially_hashed_commit.dynamic_blocks.len())
467 .flags(MemFlags::READ_ONLY)
468 .copy_host_slice(
469 &partially_hashed_commit
470 .dynamic_blocks
471 .iter()
472 .map(|&block| encode_into_opencl_vector::<H>(block))
473 .collect::<Vec<_>>(),
474 )
475 .build()?,
476 )
477 .arg(partially_hashed_commit.dynamic_blocks.len() as u64)
478 .arg(&successful_match_receiver)
479 .queue(queue)
480 .global_work_size(num_threads)
481 .build()?;
482
483 for base_padding_specifier in search_space.step_by(num_threads) {
484 kernel.set_arg(BASE_PADDING_SPECIFIER_ARG, base_padding_specifier)?;
485
486 // SAFETY: The OpenCL scripts are optimistically assumed to have no memory safety issues
487 unsafe {
488 kernel.enq()?;
489 }
490
491 successful_match_receiver
492 .read(&mut successful_match_receiver_host_handle[..])
493 .enq()?;
494
495 if successful_match_receiver_host_handle[0] != u32::MAX {
496 let successful_padding_specifier =
497 base_padding_specifier + (successful_match_receiver_host_handle[0] as u64);
498 partially_hashed_commit.scatter_padding(successful_padding_specifier);
499
500 assert!(
501 hash_spec.matches(&partially_hashed_commit.current_hash()),
502 "\
503 A GPU search reported a commit with a successful match, but when that \
504 commit was hashed in postprocessing, it didn't match the desired spec. \
505 This is a bug. The most likely explanation is that the two implementations of \
506 `scatter_padding` in Rust and OpenCL (or the implementations of SHA1/SHA256) have diverged \
507 from each other.\n\npartial commit:\n\t{:?}\ndesired hash spec:\n\t{:?}\ncommit hash \
508 produced during postprocessing:{:?}\n\tpadding specifier: {}",
509 partially_hashed_commit,
510 hash_spec,
511 partially_hashed_commit.current_hash(),
512 successful_padding_specifier,
513 );
514
515 return Ok(Some(GitCommit::new(processed_commit.commit())));
516 }
517 }
518
519 Ok(None)
520 }
521}
522
523impl ProcessedCommit {
524 const DYNAMIC_PADDING_LENGTH: usize = 48;
525
526 /// See the comment above the definition of `ProcessedCommit` for details on how
527 /// the data layout.
528 fn new(original_commit: &[u8]) -> Self {
529 let padding_insertion_point = Self::get_padding_insertion_point(original_commit);
530
531 // If the commit message already has spaces or tabs where we're putting padding, the most
532 // likely explanation is that the user has run lucky-commit on this commit before. To prevent
533 // commits from repeatedly growing after lucky-commit is run on them, omit the old padding
534 // rather than piling onto it.
535 let replaceable_padding_size = original_commit[padding_insertion_point..]
536 .iter()
537 .take_while(|&&byte| byte == b' ' || byte == b'\t')
538 .count();
539
540 // Use enough static padding to pad to a multiple of 64
541 let static_padding_length = Self::compute_static_padding_length(
542 padding_insertion_point,
543 original_commit.len() - replaceable_padding_size + Self::DYNAMIC_PADDING_LENGTH,
544 );
545
546 let commit_length = original_commit.len() - replaceable_padding_size
547 + static_padding_length
548 + Self::DYNAMIC_PADDING_LENGTH;
549
550 // Git commit header
551 let mut data = format!("commit {}\0", commit_length).into_bytes();
552
553 let commit_range = data.len()..(data.len() + commit_length);
554
555 // First part of commit
556 data.extend(&original_commit[..padding_insertion_point]);
557
558 // Static padding
559 data.resize(data.len() + static_padding_length, b' ');
560
561 assert_eq!(data.len() % 64, 0);
562 let num_static_blocks = data.len() / 64;
563
564 // Dynamic padding, initialized to tabs for now
565 data.resize(data.len() + Self::DYNAMIC_PADDING_LENGTH, b'\t');
566
567 // Second part of commit
568 data.extend(&original_commit[padding_insertion_point + replaceable_padding_size..]);
569
570 assert_eq!(data.len(), commit_range.end);
571
572 // SHA finalization padding
573 data.extend(sha_finalization_padding(data.len()));
574 assert_eq!(data.len() % 64, 0);
575
576 Self {
577 data: data.into_boxed_slice(),
578 commit_range,
579 num_static_blocks,
580 }
581 }
582
583 /// Finds the index at which static and dynamic padding should be inserted into a commit.
584 ///
585 /// If the commit has a GPG signature (detected by the presence of "-----END PGP SIGNATURE-----"
586 /// after a line that starts with "gpgsig "), then add the padding whitespace immediately after
587 /// the text "-----END PGP SIGNATURE-----".
588 /// Otherwise, add the padding whitespace right before the end of the commit message.
589 ///
590 /// To save time hashing, we want the padding to be as close to the end of the commit
591 /// as possible. However, if a signature is present, modifying the commit message would make
592 /// the signature invalid.
593 fn get_padding_insertion_point(commit: &[u8]) -> usize {
594 // Check if the commit has a signature header before the start of the commit message
595 let insertion_point_plus_preexisting_padding = (0..commit.len())
596 .take_while(|&i| !commit[i..].starts_with(b"\n\n"))
597 .find(|&i| {
598 commit[i..].starts_with(b"\ngpgsig ")
599 || commit[i..].starts_with(b"\ngpgsig-sha256 ")
600 })
601 .map(|i| i + 1)
602 // If so, put the padding right at the end of that header
603 .and_then(|signature_header_start_index| {
604 (signature_header_start_index..commit.len())
605 .find(|&i| commit[i..].starts_with(b"\n") && !commit[i..].starts_with(b"\n "))
606 })
607 .unwrap_or(commit.len());
608
609 return insertion_point_plus_preexisting_padding
610 - commit[..insertion_point_plus_preexisting_padding]
611 .iter()
612 .rev()
613 .take_while(|&&byte| byte == b' ' || byte == b'\t' || byte == b'\n')
614 .count();
615 }
616
617 /// Returns the smallest nonnegative integer `static_padding_length` such that:
618 /// static_padding_length
619 /// + commit_length_before_static_padding
620 /// + 8
621 /// + (number of digits in the base10 representation of
622 /// commit_length_excluding_static_padding + static_padding_length)
623 /// is a multiple of 64.
624 ///
625 /// The 8 comes from the length of the word `commit`, plus a space and a null character, in
626 /// git's commit hashing format.
627 fn compute_static_padding_length(
628 commit_length_before_static_padding: usize,
629 commit_length_excluding_static_padding: usize,
630 ) -> usize {
631 let compute_alignment = |padding_len: usize| {
632 (format!(
633 "commit {}\0",
634 commit_length_excluding_static_padding + padding_len
635 )
636 .len()
637 + commit_length_before_static_padding
638 + padding_len)
639 % 64
640 };
641 let prefix_length_estimate = format!("commit {}\0", commit_length_excluding_static_padding)
642 .len()
643 + commit_length_before_static_padding;
644 let initial_padding_length_guess = (64 - prefix_length_estimate % 64) % 64;
645
646 let static_padding_length = if compute_alignment(initial_padding_length_guess) == 0 {
647 initial_padding_length_guess
648 } else if compute_alignment(initial_padding_length_guess - 1) == 0 {
649 initial_padding_length_guess - 1
650 } else {
651 initial_padding_length_guess + 63
652 };
653
654 assert_eq!(compute_alignment(static_padding_length), 0);
655 debug_assert!((0..static_padding_length).all(|len| compute_alignment(len) != 0));
656
657 static_padding_length
658 }
659
660 fn commit(&self) -> &[u8] {
661 &self.data[self.commit_range.clone()]
662 }
663
664 fn as_partially_hashed_commit<H: GitHashFn>(&mut self) -> PartiallyHashedCommit<H> {
665 let (static_blocks, dynamic_blocks) =
666 as_chunks_mut::<H>(&mut self.data[..]).split_at_mut(self.num_static_blocks);
667
668 let mut intermediate_state = H::INITIAL_STATE;
669 H::compress(&mut intermediate_state, static_blocks);
670
671 PartiallyHashedCommit {
672 intermediate_state,
673 dynamic_blocks,
674 }
675 }
676}
677
678impl<'a, H: GitHashFn> PartiallyHashedCommit<'a, H> {
679 #[inline(always)]
680 fn dynamic_padding_mut(&mut self) -> &mut [u8] {
681 &mut self.dynamic_blocks[0].as_mut()[..48]
682 }
683
684 // This should be kept in sync with the OpenCL `arrange_padding_block` implementation.
685 #[inline(always)]
686 fn scatter_padding(&mut self, padding_specifier: u64) {
687 // The 256 unique strings of length 8 which contain only ' ' and '\t'.
688 // These are computed at compile-time to allow them to be copied quickly.
689 static PADDING_CHUNKS: [[u8; 8]; 256] = {
690 let mut padding_chunks = [[0; 8]; 256];
691 let mut i = 0;
692 while i < 256 {
693 let mut j = 0;
694 while j < 8 {
695 padding_chunks[i][j] = if i & (0x80 >> j) == 0 { b' ' } else { b'\t' };
696 j += 1;
697 }
698 i += 1;
699 }
700 padding_chunks
701 };
702
703 self.dynamic_padding_mut()
704 .chunks_exact_mut(8)
705 .zip(padding_specifier.to_le_bytes().iter())
706 .for_each(|(padding_chunk, &padding_specifier_byte)| {
707 // An padding specifier is represented by an integer in the range [0, 2 ** 48).
708 // The 48-byte dynamic padding string is mapped from the 48-bit specifier such that
709 // each byte of padding is a [space/tab] if the corresponding little-endian bit of
710 // the specifier is a [0/1], respectively.
711 padding_chunk.copy_from_slice(&PADDING_CHUNKS[padding_specifier_byte as usize]);
712 })
713 }
714
715 #[inline(always)]
716 fn current_hash(&self) -> H::State {
717 let mut hash = self.intermediate_state;
718 H::compress(&mut hash, self.dynamic_blocks);
719 hash
720 }
721}
722
723impl<H: GitHashFn> HashSpec<H> {
724 #[inline(always)]
725 fn matches(&self, hash: &H::State) -> bool {
726 hash.as_ref()
727 .iter()
728 .zip(self.mask.as_ref().iter())
729 .map(|(&hash_word, &mask_word)| hash_word & mask_word)
730 .zip(self.data.as_ref().iter())
731 .all(|(masked_hash_word, &hash_spec_word)| masked_hash_word == hash_spec_word)
732 }
733
734 #[cfg(feature = "opencl")]
735 fn estimated_attempts_needed(&self) -> u64 {
736 2u64.saturating_pow(
737 self.mask
738 .as_ref()
739 .iter()
740 .map(|word| word.count_ones())
741 .sum(),
742 )
743 }
744}
745
746impl<H: GitHashFn> FromStr for HashSpec<H> {
747 type Err = ParseHashSpecErr;
748 /// Parses a HashSpec from a string. The string must only contain hex characters (0-9, a-f, A-F), indicating the hex
749 /// value that the hash should have at a given position, or `_`, indicating that the hash can have any value at the given
750 /// position. All positions in the hash beyond the length of the string are treated as unspecified (equivalent to if the
751 /// string was right-padded with `_`).
752 fn from_str(prefix_string: &str) -> Result<Self, Self::Err> {
753 let max_hex_character_length = mem::size_of::<H::State>() * 2;
754 if prefix_string.chars().count() > max_hex_character_length {
755 return Err(ParseHashSpecErr::TooLong);
756 }
757
758 let mut parsed_hash_spec = HashSpec::<H> {
759 // Zero-initialize the data and mask
760 data: H::State::default(),
761 mask: H::State::default(),
762 };
763 prefix_string
764 .chars()
765 // Pad the input string out to the length of a hash
766 .chain(iter::repeat('_'))
767 .take(max_hex_character_length)
768 // Split it into 8-hex-character chunks
769 .collect::<Vec<_>>()
770 .chunks(8)
771 // Associate each 8-hex-character chunk with corresponding 32-bit word of the hash spec
772 .zip(parsed_hash_spec.data.as_mut())
773 .zip(parsed_hash_spec.mask.as_mut())
774 .try_for_each(|((hash_spec_chunk, data_word), mask_word)| {
775 hash_spec_chunk
776 .iter()
777 .zip((0..32).step_by(4).rev())
778 .try_for_each(|(&hash_spec_character, slot_bit_offset)| {
779 // Parse each hex character of the input string and write it to the appropriate slots of the hash spec.
780 if let Some(hex_character_value) = hash_spec_character.to_digit(16) {
781 *data_word |= hex_character_value << slot_bit_offset;
782 *mask_word |= 0xf << slot_bit_offset;
783 } else if hash_spec_character != '_' {
784 // The '_' character in a hash spec is allowed as an "any value is allowed here" placeholder
785 // (corresponds to a 0 in both the data slot and the mask slot). All other non-hex characters are
786 // disallowed.
787 return Err(ParseHashSpecErr::InvalidCharacter(hash_spec_character));
788 }
789 Ok(())
790 })
791 })?;
792
793 Ok(parsed_hash_spec)
794 }
795}
796
797impl<H: GitHashFn> Default for HashSpec<H> {
798 fn default() -> Self {
799 "0000000".parse().unwrap()
800 }
801}
802
803impl Error for ParseHashSpecErr {}
804impl Display for ParseHashSpecErr {
805 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
806 match *self {
807 Self::TooLong => write!(f, "hash spec can't be longer than an actual hash"),
808 Self::InvalidCharacter(c) => write!(f, "hash spec contains invalid character '{}' (only hex characters and underscores are allowed)", c),
809 }
810 }
811}
812impl Debug for ParseHashSpecErr {
813 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
814 <Self as Display>::fmt(self, f)
815 }
816}
817
818impl<H: GitHashFn> GitCommit<H> {
819 /// Constructs a GitCommit from the given commit data. The data is assumed to be in
820 /// git's object format, but this is not technically required.
821 pub fn new(commit: &[u8]) -> Self {
822 Self {
823 object: commit.to_vec(),
824 hash: {
825 let mut state = H::INITIAL_STATE;
826 let commit_header = format!("commit {}\0", commit.len()).into_bytes();
827 let commit_data_length = commit_header.len() + commit.len();
828
829 H::compress(
830 &mut state,
831 as_chunks_mut::<H>(
832 commit_header
833 .into_iter()
834 .chain(commit.iter().cloned())
835 .chain(sha_finalization_padding(commit_data_length))
836 .collect::<Vec<_>>()
837 .as_mut(),
838 ),
839 );
840 state
841 },
842 }
843 }
844
845 /// The git object data for this commit
846 pub fn object(&self) -> &[u8] {
847 &self.object
848 }
849
850 /// The hash of this commit, as a hex string
851 pub fn hex_hash(&self) -> String {
852 self.hash
853 .as_ref()
854 .iter()
855 .map(|word| format!("{:08x}", word))
856 .collect()
857 }
858}
859
860impl<H: GitHashFn> Debug for GitCommit<H> {
861 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
862 write!(
863 f,
864 "GitCommit {{ object: {:?}, hash: {:?} }}",
865 String::from_utf8_lossy(&self.object),
866 self.hex_hash()
867 )
868 }
869}
870
871#[cfg(feature = "opencl")]
872/// Reinterpret a block with 64 8-bit integers as an OpenCL vector with 16 32-bit big-endian integers
873fn encode_into_opencl_vector<H: GitHashFn>(data: H::Block) -> Uint16 {
874 let words: [u32; 16] = data
875 .as_ref()
876 .chunks(4)
877 .map(|chunk| u32::from_be_bytes(chunk.try_into().unwrap()))
878 .collect::<Vec<_>>()
879 .try_into()
880 .unwrap();
881
882 words.into()
883}
884
885// This is a modified implementation of std::slice::as_chunks_mut. It's copied because the
886// standard library function is not yet stable. As a local safety invariant, `Block` is expected
887// to be layout-identical to `[u8; 64]`. (It's a generic parameter because the `GenericArray` subdependency
888// could technically end up being at different versions between the sha1 and sha2 crates, which would cause
889// compile errors if the sha1 version of `GenericArray` gets passed to sha2 methods.
890fn as_chunks_mut<H: GitHashFn>(slice: &mut [u8]) -> &mut [H::Block] {
891 assert_eq!(mem::size_of::<H::Block>(), 64);
892 assert_eq!(mem::align_of::<H::Block>(), mem::align_of::<u8>());
893 assert_eq!(slice.len() % mem::size_of::<H::Block>(), 0);
894 // SAFETY:
895 // * All of the bytes in the slice are initialized, and the alignment of u8 and [u8; 64]
896 // are the same.
897 // * The slice length is a multiple of 64, and so the slice's pointer points to
898 // the same number of elements as the resulting pointer.
899 // * Since `slice` is mutable, its values aren't accessible anywhere else during its lifetime.
900 // * Since the length of the new slice is smaller, it can't overflow beyond isize::MAX.
901 unsafe {
902 std::slice::from_raw_parts_mut(
903 slice.as_mut_ptr().cast(),
904 slice.len() / mem::size_of::<H::Block>(),
905 )
906 }
907}
908
909// Finalization padding that gets added to the end of data being hashed with sha1 or sha256
910// (the padding happens to be the same for both)
911fn sha_finalization_padding(data_length: usize) -> impl IntoIterator<Item = u8> {
912 iter::once(0x80)
913 .chain(iter::repeat(0).take((55 - data_length as isize).rem_euclid(64) as usize))
914 .chain(<[u8; 8]>::into_iter((data_length as u64 * 8).to_be_bytes()))
915}
916
917#[cfg(test)]
918mod tests;