git_internal/internal/pack/
cache_object.rs

1use std::fs::OpenOptions;
2use std::io::Write;
3use std::path::{Path, PathBuf};
4use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
5use std::{fs, io};
6use std::{ops::Deref, sync::Arc};
7
8use lru_mem::{HeapSize, MemSize};
9use serde::{Deserialize, Serialize};
10use threadpool::ThreadPool;
11
12use crate::internal::metadata::{EntryMeta, MetaAttached};
13use crate::internal::pack::entry::Entry;
14use crate::internal::pack::utils;
15use crate::{hash::SHA1, internal::object::types::ObjectType};
16
17// /// record heap-size of all CacheObjects, used for memory limit.
18// static CACHE_OBJS_MEM_SIZE: AtomicUsize = AtomicUsize::new(0);
19
20/// file load&store trait
21pub trait FileLoadStore: Serialize + for<'a> Deserialize<'a> {
22    fn f_load(path: &Path) -> Result<Self, io::Error>;
23    fn f_save(&self, path: &Path) -> Result<(), io::Error>;
24}
25
26// trait alias, so that impl FileLoadStore == impl Serialize + Deserialize
27impl<T: Serialize + for<'a> Deserialize<'a>> FileLoadStore for T {
28    fn f_load(path: &Path) -> Result<T, io::Error> {
29        let data = fs::read(path)?;
30        let obj: T = bincode::serde::decode_from_slice(&data, bincode::config::standard())
31            .map_err(io::Error::other)?
32            .0;
33        Ok(obj)
34    }
35    fn f_save(&self, path: &Path) -> Result<(), io::Error> {
36        if path.exists() {
37            return Ok(());
38        }
39        let data = bincode::serde::encode_to_vec(self, bincode::config::standard()).unwrap();
40        let path = path.with_extension("temp");
41        {
42            let mut file = OpenOptions::new()
43                .write(true)
44                .create_new(true)
45                .open(path.clone())?;
46            file.write_all(&data)?;
47        }
48        let final_path = path.with_extension("");
49        fs::rename(&path, final_path.clone())?;
50        Ok(())
51    }
52}
53
/// Metadata of a cache object: either a fully resolved base object or one of the
/// delta kinds that still has to be expanded.
#[derive(PartialEq, Eq, Clone, Debug, Serialize, Deserialize)]
pub(crate) enum CacheObjectInfo {
    /// The object is one of the four basic types:
    /// [`ObjectType::Blob`], [`ObjectType::Tree`], [`ObjectType::Commit`], or [`ObjectType::Tag`].
    /// The metadata contains the [`ObjectType`] and the [`SHA1`] hash of the object.
    BaseObject(ObjectType, SHA1),
    /// The object is an offset delta. The first field is the offset delta value
    /// and the second is the size of the expanded object (previously `delta_final_size`).
    OffsetDelta(usize, usize),
    /// Similar to [`CacheObjectInfo::OffsetDelta`], but the delta algorithm is `zstd`.
    OffsetZstdelta(usize, usize),
    /// The object is a hash delta. The first field is the [`SHA1`] hash it refers to
    /// and the second is the size of the expanded object (previously `delta_final_size`).
    HashDelta(SHA1, usize),
}
70
71impl CacheObjectInfo {
72    /// Get the [`ObjectType`] of the object.
73    pub(crate) fn object_type(&self) -> ObjectType {
74        match self {
75            CacheObjectInfo::BaseObject(obj_type, _) => *obj_type,
76            CacheObjectInfo::OffsetDelta(_, _) => ObjectType::OffsetDelta,
77            CacheObjectInfo::OffsetZstdelta(_, _) => ObjectType::OffsetZstdelta,
78            CacheObjectInfo::HashDelta(_, _) => ObjectType::HashDelta,
79        }
80    }
81}
82
/// A decompressed pack object held in the cache, together with the bookkeeping
/// needed for memory accounting.
#[derive(Debug, Serialize, Deserialize)]
pub struct CacheObject {
    /// Whether this is a base object or a delta, plus the related metadata.
    pub(crate) info: CacheObjectInfo,
    /// Offset of the object; exported as `pack_offset` by `to_entry_metadata`.
    pub offset: usize,
    /// The decompressed bytes of the object.
    pub data_decompressed: Vec<u8>,
    /// Shared counter recording the mem-size of all `CacheObject`s of a Pack.
    /// `None` means this object takes no part in the accounting.
    pub mem_recorder: Option<Arc<AtomicUsize>>, // record mem-size of all CacheObjects of a Pack
    /// Exported as `is_delta` by `to_entry_metadata`.
    pub is_delta_in_pack: bool,
}
91
92impl Clone for CacheObject {
93    fn clone(&self) -> Self {
94        let obj = CacheObject {
95            info: self.info.clone(),
96            offset: self.offset,
97            data_decompressed: self.data_decompressed.clone(),
98            mem_recorder: self.mem_recorder.clone(),
99            is_delta_in_pack: self.is_delta_in_pack,
100        };
101        obj.record_mem_size();
102        obj
103    }
104}
105
106// ! used by lru_mem to calculate the size of the object, limit the memory usage.
107// ! the implementation of HeapSize is not accurate, only calculate the size of the data_decompress
108// Note that: mem_size == value_size + heap_size, and we only need to impl HeapSize because value_size is known
109impl HeapSize for CacheObject {
110    /// If a [`CacheObject`] is [`ObjectType::HashDelta`] or [`ObjectType::OffsetDelta`],
111    /// it will expand to another [`CacheObject`] of other types. To prevent potential OOM,
112    /// we record the size of the expanded object as well as that of the object itself.
113    ///
114    /// Base objects, *i.e.*, [`ObjectType::Blob`], [`ObjectType::Tree`], [`ObjectType::Commit`],
115    /// and [`ObjectType::Tag`], will not be expanded, so the heap-size of the object is the same
116    /// as the size of the data.
117    ///
118    /// See [Comment in PR #755](https://github.com/web3infra-foundation/mega/pull/755#issuecomment-2543100481) for more details.
119    fn heap_size(&self) -> usize {
120        match &self.info {
121            CacheObjectInfo::BaseObject(_, _) => self.data_decompressed.heap_size(),
122            CacheObjectInfo::OffsetDelta(_, delta_final_size)
123            | CacheObjectInfo::OffsetZstdelta(_, delta_final_size)
124            | CacheObjectInfo::HashDelta(_, delta_final_size) => {
125                // To those who are concerned about why these two values are added,
126                // let's consider the lifetime of two `CacheObject`s, say `delta_obj`
127                // and `final_obj` in the function `Pack::rebuild_delta`.
128                //
129                // `delta_obj` is dropped only after `Pack::rebuild_delta` returns,
130                // but the space for `final_obj` is allocated in that function.
131                //
132                // Therefore, during the execution of `Pack::rebuild_delta`, both `delta_obj`
133                // and `final_obj` coexist. The maximum memory usage is the sum of the memory
134                // usage of `delta_obj` and `final_obj`.
135                self.data_decompressed.heap_size() + delta_final_size
136            }
137        }
138    }
139}
140
impl Drop for CacheObject {
    /// Subtract this object's `mem_size()` from the attached recorder, if any.
    ///
    /// The amount subtracted here only matches what `record_mem_size` added when
    /// the heap size did not change in between, so the heap size must stay
    /// constant during the object's life cycle.
    // NOTE(review): the subtraction happens whenever a recorder is set, whether or
    // not `record_mem_size` was ever called on this object.
    fn drop(&mut self) {
        // Original note: (&*self).heap_size() != self.heap_size()
        // NOTE(review): `self` is `&mut CacheObject` here; the explicit `(*self)`
        // presumably makes sure the size is computed for the `CacheObject` itself
        // rather than for the reference - confirm before simplifying this call.
        if let Some(mem_recorder) = &self.mem_recorder {
            mem_recorder.fetch_sub((*self).mem_size(), Ordering::Release);
        }
    }
}
151
/// Mem-size recorder for a type.
///
/// Intended usage: keep a shared counter, add the object's mem-size to it after
/// construction, and subtract it again in `drop()`.
///
/// Because the same size must be added and subtracted, variable-size fields of
/// the object should NOT be modified after recording. Alternatives would be to
/// store the initially recorded size inside the object, or to update the counter
/// on modification (neither is implemented).
pub trait MemSizeRecorder: MemSize {
    /// Add this object's mem-size to its recorder.
    fn record_mem_size(&self);
    /// Attach the shared counter that this object reports to.
    fn set_mem_recorder(&mut self, mem_size: Arc<AtomicUsize>);
    // fn get_mem_size() -> usize;
}
163
164impl MemSizeRecorder for CacheObject {
165    /// record the mem-size of this `CacheObj` in a `static` `var`
166    /// <br> since that, DO NOT modify `CacheObj` after recording
167    fn record_mem_size(&self) {
168        if let Some(mem_recorder) = &self.mem_recorder {
169            mem_recorder.fetch_add(self.mem_size(), Ordering::Release);
170        }
171    }
172
173    fn set_mem_recorder(&mut self, mem_recorder: Arc<AtomicUsize>) {
174        self.mem_recorder = Some(mem_recorder);
175    }
176
177    // fn get_mem_size() -> usize {
178    //     CACHE_OBJS_MEM_SIZE.load(Ordering::Acquire)
179    // }
180}
181
182impl CacheObject {
183    /// Create a new CacheObject which is neither [`ObjectType::OffsetDelta`] nor [`ObjectType::HashDelta`].
184    pub fn new_for_undeltified(obj_type: ObjectType, data: Vec<u8>, offset: usize) -> Self {
185        let hash = utils::calculate_object_hash(obj_type, &data);
186        CacheObject {
187            info: CacheObjectInfo::BaseObject(obj_type, hash),
188            offset,
189            data_decompressed: data,
190            mem_recorder: None,
191            is_delta_in_pack: false,
192        }
193    }
194
195    /// Get the [`ObjectType`] of the object.
196    pub fn object_type(&self) -> ObjectType {
197        self.info.object_type()
198    }
199
200    /// Get the [`SHA1`] hash of the object.
201    ///
202    /// If the object is a delta object, return [`None`].
203    pub fn base_object_hash(&self) -> Option<SHA1> {
204        match &self.info {
205            CacheObjectInfo::BaseObject(_, hash) => Some(*hash),
206            _ => None,
207        }
208    }
209
210    /// Get the offset delta of the object.
211    ///
212    /// If the object is not an offset delta, return [`None`].
213    pub fn offset_delta(&self) -> Option<usize> {
214        match &self.info {
215            CacheObjectInfo::OffsetDelta(offset, _) => Some(*offset),
216            _ => None,
217        }
218    }
219
220    /// Get the hash delta of the object.
221    ///
222    /// If the object is not a hash delta, return [`None`].
223    pub fn hash_delta(&self) -> Option<SHA1> {
224        match &self.info {
225            CacheObjectInfo::HashDelta(hash, _) => Some(*hash),
226            _ => None,
227        }
228    }
229
230    /// transform the CacheObject to Entry
231    pub fn to_entry(&self) -> Entry {
232        match self.info {
233            CacheObjectInfo::BaseObject(obj_type, hash) => Entry {
234                obj_type,
235                data: self.data_decompressed.clone(),
236                hash,
237                chain_len: 0,
238            },
239            _ => {
240                unreachable!("delta object should not persist!")
241            }
242        }
243    }
244
245    pub fn to_entry_metadata(&self) -> MetaAttached<Entry, EntryMeta> {
246        match self.info {
247            CacheObjectInfo::BaseObject(obj_type, hash) => {
248                let entry = Entry {
249                    obj_type,
250                    data: self.data_decompressed.clone(),
251                    hash,
252                    chain_len: 0,
253                };
254                let meta = EntryMeta {
255                    // pack_id:Some(pack_id),
256                    pack_offset: Some(self.offset),
257                    is_delta: Some(self.is_delta_in_pack),
258                    ..Default::default()
259                };
260                MetaAttached { inner: entry, meta }
261            }
262
263            _ => {
264                unreachable!("delta object should not persist!")
265            }
266        }
267    }
268}
269
270/// trait alias for simple use
271pub trait ArcWrapperBounds:
272    HeapSize + Serialize + for<'a> Deserialize<'a> + Send + Sync + 'static
273{
274}
275// You must impl `Alias Trait` for all the `T` satisfying Constraints
276// Or, `T` will not satisfy `Alias Trait` even if it satisfies the Original traits
277impl<T: HeapSize + Serialize + for<'a> Deserialize<'a> + Send + Sync + 'static> ArcWrapperBounds
278    for T
279{
280}
281
/// Wrapper around `Arc<T>` that allows the third-party trait `HeapSize` to be
/// implemented for a shared value stored in an `LruCache`.
///
/// Because the cache holds an `Arc`, it cannot know whether ejecting an entry
/// actually frees the referenced content, so the memory usage it tracks is not exact.
///
/// When the wrapper is dropped with a `store_path` set and the completion flag
/// still `false`, the wrapped value is saved to that path.
pub struct ArcWrapper<T: ArcWrapperBounds> {
    /// The shared value.
    pub data: Arc<T>,
    /// Shared flag; once it reads `true`, dropping the wrapper no longer saves the value.
    complete_signal: Arc<AtomicBool>,
    /// Optional thread pool used to perform the save in the background on drop.
    pool: Option<Arc<ThreadPool>>,
    /// Where to store the value on drop; `None` disables storing.
    pub store_path: Option<PathBuf>, // path to store when drop
}
291impl<T: ArcWrapperBounds> ArcWrapper<T> {
292    /// Create a new ArcWrapper
293    pub fn new(data: Arc<T>, share_flag: Arc<AtomicBool>, pool: Option<Arc<ThreadPool>>) -> Self {
294        ArcWrapper {
295            data,
296            complete_signal: share_flag,
297            pool,
298            store_path: None,
299        }
300    }
301    pub fn set_store_path(&mut self, path: PathBuf) {
302        self.store_path = Some(path);
303    }
304}
305
impl<T: ArcWrapperBounds> HeapSize for ArcWrapper<T> {
    /// Heap size reported for this wrapper: whatever `heap_size()` yields when
    /// called on the `data` field.
    // NOTE(review): presumably this resolves through the `Arc` to `T`'s own
    // `heap_size` (the unit tests expect the wrapper of a `CacheObject` to report
    // the same value as the object) - confirm before changing the call form.
    fn heap_size(&self) -> usize {
        self.data.heap_size()
    }
}
311
312impl<T: ArcWrapperBounds> Clone for ArcWrapper<T> {
313    /// clone won't clone the store_path
314    fn clone(&self) -> Self {
315        ArcWrapper {
316            data: self.data.clone(),
317            complete_signal: self.complete_signal.clone(),
318            pool: self.pool.clone(),
319            store_path: None,
320        }
321    }
322}
323
324impl<T: ArcWrapperBounds> Deref for ArcWrapper<T> {
325    type Target = Arc<T>;
326    fn deref(&self) -> &Self::Target {
327        &self.data
328    }
329}
330impl<T: ArcWrapperBounds> Drop for ArcWrapper<T> {
331    // `drop` will be called in `lru_cache.insert()` when cache full & eject the LRU
332    // `lru_cache.insert()` is protected by Mutex
333    fn drop(&mut self) {
334        if !self.complete_signal.load(Ordering::Acquire)
335            && let Some(path) = &self.store_path
336        {
337            match &self.pool {
338                Some(pool) => {
339                    let data_copy = self.data.clone();
340                    let path_copy = path.clone();
341                    let complete_signal = self.complete_signal.clone();
342                    // block entire process, wait for IO, Control Memory
343                    // queue size will influence the Memory usage
344                    while pool.queued_count() > 2000 {
345                        std::thread::yield_now();
346                    }
347                    pool.execute(move || {
348                        if !complete_signal.load(Ordering::Acquire) {
349                            let res = data_copy.f_save(&path_copy);
350                            if let Err(e) = res {
351                                println!("[f_save] {path_copy:?} error: {e:?}");
352                            }
353                        }
354                    });
355                }
356                None => {
357                    let res = self.data.f_save(path);
358                    if let Err(e) = res {
359                        println!("[f_save] {path:?} error: {e:?}");
360                    }
361                }
362            }
363        }
364    }
365}
#[cfg(test)]
mod test {
    use std::{fs, sync::Mutex};

    use lru_mem::LruCache;

    use super::*;

    /// The recorder must hold the object's mem-size after recording and return
    /// to zero once the object is dropped.
    #[test]
    fn test_heap_size_record() {
        let mut obj = CacheObject {
            info: CacheObjectInfo::BaseObject(ObjectType::Blob, SHA1::default()),
            offset: 0,
            data_decompressed: vec![0; 1024],
            mem_recorder: None,
            is_delta_in_pack: false,
        };
        let mem = Arc::new(AtomicUsize::default());
        assert_eq!(mem.load(Ordering::Relaxed), 0);
        obj.set_mem_recorder(mem.clone());
        obj.record_mem_size();
        assert_eq!(mem.load(Ordering::Relaxed), obj.mem_size());
        drop(obj);
        assert_eq!(mem.load(Ordering::Relaxed), 0);
    }

    /// A base object and an `ArcWrapper` around a clone of it must report the
    /// same heap size.
    #[test]
    fn test_cache_object_with_same_size() {
        let a = CacheObject {
            info: CacheObjectInfo::BaseObject(ObjectType::Blob, SHA1::default()),
            offset: 0,
            data_decompressed: vec![0; 1024],
            mem_recorder: None,
            is_delta_in_pack: false,
        };
        assert!(a.heap_size() == 1024);

        // let b = ArcWrapper(Arc::new(a.clone()));
        let b = ArcWrapper::new(Arc::new(a.clone()), Arc::new(AtomicBool::new(false)), None);
        assert!(b.heap_size() == 1024);
    }

    /// Inserting a second object that does not fit next to the first one must
    /// eject the first one from the LRU cache.
    #[test]
    fn test_cache_object_with_lru() {
        let mut cache = LruCache::new(2048);

        let hash_a = SHA1::default();
        let hash_b = SHA1::new(b"b"); // whatever different hash
        let a = CacheObject {
            info: CacheObjectInfo::BaseObject(ObjectType::Blob, hash_a),
            offset: 0,
            data_decompressed: vec![0; 1024],
            mem_recorder: None,
            is_delta_in_pack: false,
        };
        println!("a.heap_size() = {}", a.heap_size());

        let b = CacheObject {
            info: CacheObjectInfo::BaseObject(ObjectType::Blob, hash_b),
            offset: 0,
            data_decompressed: vec![0; (1024.0 * 1.5) as usize],
            mem_recorder: None,
            is_delta_in_pack: false,
        };
        {
            let r = cache.insert(
                hash_a.to_string(),
                ArcWrapper::new(Arc::new(a.clone()), Arc::new(AtomicBool::new(true)), None),
            );
            assert!(r.is_ok())
        }
        {
            // `try_insert` refuses to eject, so it must fail with `WouldEjectLru`.
            let r = cache.try_insert(
                hash_b.to_string(),
                ArcWrapper::new(Arc::new(b.clone()), Arc::new(AtomicBool::new(true)), None),
            );
            assert!(r.is_err());
            if let Err(lru_mem::TryInsertError::WouldEjectLru { .. }) = r {
                // Matched the expected error; nothing else to do.
            } else {
                panic!("Expected WouldEjectLru error");
            }
            // Insert b under a different key, so that a gets ejected.
            let r = cache.insert(
                hash_b.to_string(),
                ArcWrapper::new(Arc::new(b.clone()), Arc::new(AtomicBool::new(true)), None),
            );
            assert!(r.is_ok());
        }
        {
            // a should be ejected
            let r = cache.get(&hash_a.to_string());
            assert!(r.is_none());
        }
    }

    /// Helper value whose reported heap size is its field `a` and which prints
    /// a line when dropped.
    #[derive(Serialize, Deserialize)]
    struct Test {
        a: usize,
    }
    impl Drop for Test {
        fn drop(&mut self) {
            println!("drop Test");
        }
    }
    impl HeapSize for Test {
        fn heap_size(&self) -> usize {
            self.a
        }
    }
    /// Exercises ejection while a clone of an entry is still held outside the
    /// cache; the drop order is only observable through the printed output.
    #[test]
    fn test_lru_drop() {
        println!("insert a");
        let cache = LruCache::new(2048);
        let cache = Arc::new(Mutex::new(cache));
        {
            let mut c = cache.as_ref().lock().unwrap();
            let _ = c.insert(
                "a",
                ArcWrapper::new(
                    Arc::new(Test { a: 1024 }),
                    Arc::new(AtomicBool::new(true)),
                    None,
                ),
            );
        }
        println!("insert b, a should be ejected");
        {
            let mut c = cache.as_ref().lock().unwrap();
            let _ = c.insert(
                "b",
                ArcWrapper::new(
                    Arc::new(Test { a: 1200 }),
                    Arc::new(AtomicBool::new(true)),
                    None,
                ),
            );
        }
        // Keep a clone of b alive outside the cache.
        let b = {
            let mut c = cache.as_ref().lock().unwrap();
            c.get("b").cloned()
        };
        println!("insert c, b should not be ejected");
        {
            let mut c = cache.as_ref().lock().unwrap();
            let _ = c.insert(
                "c",
                ArcWrapper::new(
                    Arc::new(Test { a: 1200 }),
                    Arc::new(AtomicBool::new(true)),
                    None,
                ),
            );
        }
        println!("user b: {}", b.as_ref().unwrap().a);
        println!("test over, enject all");
    }

    /// A `CacheObject` must survive a bincode encode/decode round trip.
    #[test]
    fn test_cache_object_serialize() {
        let a = CacheObject {
            info: CacheObjectInfo::BaseObject(ObjectType::Blob, SHA1::default()),
            offset: 0,
            data_decompressed: vec![0; 1024],
            mem_recorder: None,
            is_delta_in_pack: false,
        };
        let s = bincode::serde::encode_to_vec(&a, bincode::config::standard()).unwrap();
        let b: CacheObject = bincode::serde::decode_from_slice(&s, bincode::config::standard())
            .unwrap()
            .0;
        assert_eq!(a.info, b.info);
        assert_eq!(a.data_decompressed, b.data_decompressed);
        assert_eq!(a.offset, b.offset);
    }

    /// Dropping a wrapper with a store path (and an unset completion flag) must
    /// create the file at that path.
    #[test]
    fn test_arc_wrapper_drop_store() {
        let mut path = PathBuf::from(".cache_temp/test_arc_wrapper_drop_store");
        fs::create_dir_all(&path).unwrap();
        path.push("test_obj");
        let mut a = ArcWrapper::new(Arc::new(1024), Arc::new(AtomicBool::new(false)), None);
        a.set_store_path(path.clone());
        drop(a);

        assert!(path.exists());
        // Clean up the directory that contains the stored file.
        path.pop();
        fs::remove_dir_all(path).unwrap();
    }

    #[test]
    /// Check that the wrapper stores its data when the LRU cache ejects it.
    fn test_arc_wrapper_with_lru() {
        let mut cache = LruCache::new(1500);
        let path = PathBuf::from(".cache_temp/test_arc_wrapper_with_lru");
        let _ = fs::remove_dir_all(&path);
        fs::create_dir_all(&path).unwrap();
        let shared_flag = Arc::new(AtomicBool::new(false));

        // insert a, a not ejected
        let a_path = path.join("a");
        {
            let mut a = ArcWrapper::new(Arc::new(Test { a: 1024 }), shared_flag.clone(), None);
            a.set_store_path(a_path.clone());
            let b = ArcWrapper::new(Arc::new(1024), shared_flag.clone(), None);
            assert!(b.store_path.is_none());

            println!("insert a with heap size: {:?}", a.heap_size());
            let rt = cache.insert("a", a);
            if let Err(e) = rt {
                panic!("{}", format!("insert a failed: {:?}", e.to_string()));
            }
            println!("after insert a, cache used = {}", cache.current_size());
        }
        assert!(!a_path.exists());

        let b_path = path.join("b");
        // insert b, a should be ejected
        {
            let mut b = ArcWrapper::new(Arc::new(Test { a: 996 }), shared_flag.clone(), None);
            b.set_store_path(b_path.clone());
            let rt = cache.insert("b", b);
            // NOTE(review): the message below says "insert a" although this inserts "b".
            if let Err(e) = rt {
                panic!("{}", format!("insert a failed: {:?}", e.to_string()));
            }
            println!("after insert b, cache used = {}", cache.current_size());
        }
        // a was ejected and therefore stored; b is still cached, so not stored yet.
        assert!(a_path.exists());
        assert!(!b_path.exists());
        // Set the completion flag so that dropping the cache does not store b.
        shared_flag.store(true, Ordering::Release);
        fs::remove_dir_all(path).unwrap();
        // should pass even if b's path does not exist
    }
}
597        // should pass even b's path not exists
598    }
599}