hg_parser/
lib.rs

1#![doc = include_str!("../README.md")]
2use std::{
3    collections::{BTreeMap, HashMap, HashSet},
4    fs::File,
5    io::{BufRead, BufReader, Read},
6    ops::Deref,
7    path::{Path, PathBuf},
8    sync::{Arc, Mutex},
9};
10
11use lru_cache::LruCache;
12use ordered_parallel_iterator::OrderedParallelIterator;
13use rayon::prelude::*;
14
15mod cache;
16mod changeset;
17mod error;
18mod manifest;
19mod parser;
20mod path;
21mod revisionlog;
22mod types;
23
24use cache::{Cachable, Cache};
25use manifest::Manifest;
26use path::{fncache_fsencode, simple_fsencode, MPath, MPathElement};
27use revisionlog::RevisionLog;
28use types::{MercurialTag, NodeHash, RepositoryRequire};
29
30pub use changeset::*;
31pub use error::ErrorKind;
32pub use manifest::{FileType, ManifestEntry, ManifestEntryDetails};
33pub use types::{Revision, RevisionRange};
34
/// Options controlling how a Mercurial repository is opened.
#[derive(Debug, Clone, Default)]
pub struct MercurialRepositoryOptions {
    /// Treat unknown requirements listed in `.hg/requires` as acceptable instead of
    /// failing to open the repository.
    pub ignore_unknown_requirements: bool,
}
41
#[derive(Debug)]
/// Mercurial repository. Top-level structure for access to changesets and tags.
pub struct MercurialRepository {
    /// Repository root as passed to `open` (the `.hg` directory is looked up inside it).
    root_path: PathBuf,
    /// Revision log opened from `.hg/store/00changelog.i`.
    changelog: RevisionLog,
    /// Revision log opened from `.hg/store/00manifest.i`.
    manifest: RevisionLog,
    /// Requirements parsed from `.hg/requires`; they select the store path encoding.
    requires: HashSet<RepositoryRequire>,
}
50
51impl MercurialRepository {
52    /// Opens `MercurialRepository` at `root_path`.
53    pub fn open<P: AsRef<Path>>(root_path: P) -> Result<MercurialRepository, ErrorKind> {
54        Self::open_with_options(root_path, Default::default())
55    }
56
57    /// Opens `MercurialRepository` at `root_path` with options.
58    pub fn open_with_options<P: AsRef<Path>>(
59        root_path: P,
60        options: MercurialRepositoryOptions,
61    ) -> Result<MercurialRepository, ErrorKind> {
62        let base = root_path.as_ref().join(".hg");
63
64        let requires = MercurialRepository::load_requires(&base, &options)?;
65
66        let store = base.join("store");
67
68        let changelog = RevisionLog::init(store.join("00changelog.i"), None)?;
69        let manifest = RevisionLog::init(store.join("00manifest.i"), None)?;
70
71        Ok(MercurialRepository {
72            root_path: root_path.as_ref().into(),
73            changelog,
74            manifest,
75            requires,
76        })
77    }
78
79    /// Last `Revision` in revision log.
80    pub fn last_rev(&self) -> Revision {
81        self.changelog.last_rev()
82    }
83
84    /// Changeset iterator other all revisions in log.
85    pub fn iter(&self) -> ChangesetIter {
86        self.into_iter()
87    }
88
89    /// Changeset header iterator other all revisions in log.
90    pub fn header_iter(&self) -> ChangesetHeaderIter {
91        self.range_header_iter(Revision::from(0).range_to(self.last_rev()))
92    }
93
94    /// Changeset iterator other range of revisions in log.
95    pub fn range_iter<RR: Into<RevisionRange>>(&self, revisions_range: RR) -> ChangesetIter {
96        ChangesetIter {
97            repository: self,
98            revisions_range: revisions_range.into(),
99            heads: Mutex::new(LruCache::new(1 << 4)),
100            files: Mutex::new(LruCache::new(1 << 12)),
101            cache: Cache::new(1 << 13),
102        }
103    }
104
105    /// Changeset header iterator other range of revisions in log.
106    pub fn range_header_iter<RR: Into<RevisionRange>>(
107        &self,
108        revisions_range: RR,
109    ) -> ChangesetHeaderIter {
110        ChangesetHeaderIter {
111            repository: self,
112            revisions_range: revisions_range.into(),
113            cache: Cache::new(1 << 13),
114        }
115    }
116
117    /// List tags in repository. Tags read from `.hg/cache/tags2-visible` or `.hgtags`.
118    pub fn tags(&self) -> Result<BTreeMap<Revision, MercurialTag>, ErrorKind> {
119        let mut tags_path = self
120            .root_path
121            .join(".hg")
122            .join("cache")
123            .join("tags2-visible");
124        if !tags_path.exists() {
125            tags_path = self.root_path.join(".hgtags");
126        }
127        let file = File::open(tags_path)?;
128
129        let mut names = HashMap::new();
130        for line in BufReader::new(file).lines() {
131            let tag: Result<MercurialTag, _> = line.unwrap().parse();
132            if let Ok(tag) = tag {
133                if let Some(rev) = self.changelog.info_revision_by_node(&tag.node).cloned() {
134                    names.insert(tag.name.clone(), (rev, tag));
135                } else {
136                    names.remove(&tag.name);
137                }
138            }
139        }
140        Ok(names.into_values().collect())
141    }
142
143    pub(crate) fn get_manifest(&self, revision: Revision, cache: &Cache) -> Manifest {
144        let data = self.changelog.get_revision(revision, cache).unwrap();
145        let mut lines = data.splitn(2, |&x| x == b'\n');
146        let manifestid: NodeHash = lines
147            .next()
148            .and_then(|x| std::str::from_utf8(x).ok())
149            .and_then(|x| x.parse().ok())
150            .unwrap();
151        self.manifest
152            .get_entry_by_nodeid(&manifestid)
153            .and_then(|x| self.manifest.get_revision_from_entry(x, cache).ok())
154            .map(Manifest::from)
155            .unwrap()
156    }
157
158    fn load_requires<P: AsRef<Path>>(
159        path: P,
160        options: &MercurialRepositoryOptions,
161    ) -> Result<HashSet<RepositoryRequire>, ErrorKind> {
162        let requires_path = path.as_ref().join("requires");
163        let file = File::open(requires_path)?;
164        let lines = BufReader::new(file).lines().map_while(Result::ok);
165        if options.ignore_unknown_requirements {
166            lines
167                .map(|x| match x.parse() {
168                    Err(ErrorKind::UnknownRequirement(r)) => Ok(r),
169                    other => other,
170                })
171                .collect()
172        } else {
173            Ok(lines
174                .map(|x| x.parse().expect("could not parse requirement"))
175                .collect())
176        }
177    }
178
179    fn fsencode_path(&self, elements: &[MPathElement]) -> PathBuf {
180        // Mercurial has a complicated logic of path encoding.
181        // Code below matches core Mercurial logic from the commit
182        // 75013952d8d9608f73cd45f68405fbd6ec112bf2 from file mercurial/store.py from the function
183        // store(). The only caveat is that basicstore is not yet implemented
184        if self.requires.contains(&RepositoryRequire::Store) {
185            if self.requires.contains(&RepositoryRequire::FnCache) {
186                let dotencode = self.requires.contains(&RepositoryRequire::DotEncode);
187                fncache_fsencode(elements, dotencode)
188            } else {
189                simple_fsencode(elements)
190            }
191        } else {
192            unimplemented!();
193        }
194    }
195
196    fn changeset_header(&self, cache: &Cache, revision: Revision) -> Option<ChangesetHeader> {
197        self.changelog.get_entry_by_revision(revision).map(|entry| {
198            let data = self
199                .changelog
200                .get_revision_from_entry(entry, cache)
201                .unwrap_or_else(|_| {
202                    panic!(
203                        "cannot get revision {:?} from changelog of {:?}",
204                        revision, &self.root_path
205                    )
206                });
207            ChangesetHeader::from_entry_bytes(entry, &data).unwrap()
208        })
209    }
210
211    fn changeset(
212        &self,
213        heads: &Mutex<LruCache<Revision, Arc<Manifest>>>,
214        files_cache: &Mutex<LruCache<Vec<u8>, Arc<RevisionLog>>>,
215        cache: &Cache,
216        revision: Revision,
217    ) -> Option<Changeset> {
218        if let Some(entry) = self.changelog.get_entry_by_revision(revision) {
219            // we have entry - need to build revision and put it to heads
220
221            let path = &self.root_path;
222            let data = self
223                .changelog
224                .get_revision_from_entry(entry, cache)
225                .unwrap_or_else(|_| {
226                    panic!(
227                        "cannot get revision {:?} from changelog of {:?}",
228                        revision, path
229                    )
230                });
231            let changeset_header = ChangesetHeader::from_entry_bytes(entry, &data).unwrap();
232            if let Some(manifest_entry) = self
233                .manifest
234                .get_entry_by_nodeid(&changeset_header.manifestid)
235                .or_else(|| self.manifest.get_entry_by_revision(revision))
236            {
237                let data = self
238                    .manifest
239                    .get_revision_from_entry(manifest_entry, cache)
240                    .unwrap_or_else(|_| {
241                        panic!(
242                            "cannot get revision {:?} from manifest of {:?}",
243                            revision, path
244                        )
245                    });
246                let manifest = Manifest::from(data);
247
248                let mut files = Vec::with_capacity(manifest.files.len() * 2);
249                let files = if let (Some(p1), Some(p2)) = (changeset_header.p1, changeset_header.p2)
250                {
251                    let mut heads = heads.lock().unwrap();
252                    if !heads.contains_key(&p1) {
253                        heads.insert(p1, Arc::new(self.get_manifest(p1, cache)));
254                    }
255                    if !heads.contains_key(&p2) {
256                        heads.insert(p2, Arc::new(self.get_manifest(p2, cache)));
257                    }
258
259                    let p1 = heads.get_mut(&p1).cloned().unwrap();
260                    let p2 = heads.get_mut(&p2).cloned().unwrap();
261
262                    split_dict(&manifest, &p1, &mut files);
263                    split_dict(&manifest, &p2, &mut files);
264
265                    files.sort();
266                    files.dedup();
267
268                    &files
269                } else {
270                    &changeset_header.files
271                };
272
273                let files: Vec<_> = files
274                    .par_iter()
275                    .map(|file| {
276                        let file = file.as_slice();
277                        let manifest_entry = manifest.files.get(file);
278                        let data = manifest_entry.and_then(|manifest_entry| {
279                            Self::file_revlog(self, files_cache, file)
280                                .get_revision_by_nodeid(&manifest_entry.id, cache)
281                        });
282
283                        ChangesetFile {
284                            path: file.into(),
285                            data,
286                            manifest_entry: manifest_entry.cloned(),
287                        }
288                    })
289                    .collect();
290                heads
291                    .lock()
292                    .as_mut()
293                    .map(|x| {
294                        changeset_header.p1.map(|h1| x.remove(&h1));
295                        changeset_header.p2.map(|h2| x.remove(&h2));
296                        x.insert(revision, Arc::new(manifest));
297                    })
298                    .unwrap();
299                let changeset = Changeset {
300                    revision,
301                    header: changeset_header,
302                    files,
303                };
304                Some(changeset)
305            } else {
306                None
307            }
308        } else {
309            // revision does not exist - stop iterator
310            None
311        }
312    }
313
314    fn file_revlog(
315        repository: &MercurialRepository,
316        files: &Mutex<LruCache<Vec<u8>, Arc<RevisionLog>>>,
317        file: &[u8],
318    ) -> Arc<RevisionLog> {
319        let mut file_revlog = files.lock().unwrap().get_mut(file).cloned();
320
321        if file_revlog.is_none() {
322            let filerevlog = Arc::new(Self::init_file_revlog(repository, file));
323            files
324                .lock()
325                .unwrap()
326                .insert(file.into(), filerevlog.clone());
327            assert!(files.lock().unwrap().get_mut(file).is_some());
328            file_revlog = Some(filerevlog);
329        }
330
331        file_revlog.unwrap()
332    }
333
334    fn init_file_revlog(repository: &MercurialRepository, file: &[u8]) -> RevisionLog {
335        let root_path = &repository.root_path;
336        let path = MPath::from(file);
337        let path = MPath::new("data")
338            .unwrap()
339            .join(MPath::iter_opt(Some(&path)));
340
341        let mut elements: Vec<MPathElement> = path.into_iter().collect();
342        let mut basename = elements.pop().unwrap();
343
344        let index_path = {
345            let mut basename = basename.clone();
346            basename.extend(b".i");
347            elements.push(basename);
348            repository.fsencode_path(&elements)
349        };
350        elements.pop();
351
352        let data_path = {
353            basename.extend(b".d");
354            elements.push(basename);
355            repository.fsencode_path(&elements)
356        };
357
358        let store = root_path.join(".hg").join("store");
359        match RevisionLog::init(store.join(index_path), Some(store.join(data_path))) {
360            Err(ErrorKind::InvalidPath(info)) => Err(ErrorKind::InvalidPath(format!(
361                "Cannot load revision log for {:?}: {}",
362                std::str::from_utf8(file),
363                info
364            ))),
365            other => other,
366        }
367        .unwrap()
368    }
369}
370
371impl<'a> IntoIterator for &'a MercurialRepository {
372    type Item = Changeset;
373    type IntoIter = ChangesetIter<'a>;
374
375    fn into_iter(self) -> Self::IntoIter {
376        self.range_iter(Revision::from(0).range_to(self.last_rev()))
377    }
378}
379
/// Cached version of `MercurialRepository`.
///
/// Bundles a repository with the caches used while building changesets, so the same caches
/// can be reused across calls.
pub struct CachedMercurialRepository {
    /// The wrapped repository.
    repository: MercurialRepository,
    /// Manifests cached by revision.
    heads: Mutex<LruCache<Revision, Arc<Manifest>>>,
    /// Per-file revision logs cached by file path bytes.
    files: Mutex<LruCache<Vec<u8>, Arc<RevisionLog>>>,
    /// Cache handed to revision log reads.
    cache: Cache,
}
387
388impl From<MercurialRepository> for CachedMercurialRepository {
389    fn from(repository: MercurialRepository) -> Self {
390        Self {
391            repository,
392            heads: Mutex::new(LruCache::new(1 << 4)),
393            files: Mutex::new(LruCache::new(1 << 12)),
394            cache: Cache::new(1 << 13),
395        }
396    }
397}
398
/// Shares instance of `CachedMercurialRepository` between multiple readers.
pub struct SharedMercurialRepository {
    /// Reference-counted repository together with its caches.
    inner: Arc<CachedMercurialRepository>,
}
403
404impl SharedMercurialRepository {
405    pub fn new(repository: MercurialRepository) -> Self {
406        Self {
407            inner: Arc::new(repository.into()),
408        }
409    }
410}
411
412impl Deref for SharedMercurialRepository {
413    type Target = MercurialRepository;
414
415    #[inline]
416    fn deref(&self) -> &MercurialRepository {
417        &self.inner.repository
418    }
419}
420
421impl SharedMercurialRepository {
422    pub fn par_range_iter(
423        &self,
424        revision_range: RevisionRange,
425    ) -> OrderedParallelIterator<Changeset> {
426        let cached_repository = self.inner.clone();
427        let xform_ctor = move || {
428            let cached_repository = cached_repository.clone();
429            move |x: Revision| {
430                let repository = &cached_repository.repository;
431                repository
432                    .changeset(
433                        &cached_repository.heads,
434                        &cached_repository.files,
435                        &cached_repository.cache,
436                        x,
437                    )
438                    .unwrap()
439            }
440        };
441        OrderedParallelIterator::new(move || revision_range, xform_ctor)
442    }
443}
444
/// Iterator over `MercurialRepository` revisions.
pub struct ChangesetIter<'a> {
    /// Repository the changesets are read from.
    repository: &'a MercurialRepository,
    /// Revisions still to be visited.
    revisions_range: RevisionRange,
    /// Manifests cached by revision.
    heads: Mutex<LruCache<Revision, Arc<Manifest>>>,
    /// Per-file revision logs cached by file path bytes.
    files: Mutex<LruCache<Vec<u8>, Arc<RevisionLog>>>,
    /// Cache handed to revision log reads.
    cache: Cache,
}
453
454impl<'a> Iterator for ChangesetIter<'a> {
455    type Item = Changeset;
456
457    fn next(&mut self) -> Option<Self::Item> {
458        self.revisions_range.next().and_then(|revision| {
459            self.repository
460                .changeset(&self.heads, &self.files, &self.cache, revision)
461        })
462    }
463}
464
/// Iterator over changeset headers of `MercurialRepository` revisions.
pub struct ChangesetHeaderIter<'a> {
    /// Repository the headers are read from.
    repository: &'a MercurialRepository,
    /// Revisions still to be visited.
    revisions_range: RevisionRange,
    /// Cache handed to revision log reads.
    cache: Cache,
}
470
471impl<'a> Iterator for ChangesetHeaderIter<'a> {
472    type Item = ChangesetHeader;
473
474    fn next(&mut self) -> Option<Self::Item> {
475        self.revisions_range
476            .next()
477            .and_then(|revision| self.repository.changeset_header(&self.cache, revision))
478    }
479}
480
481fn load_to_vec<P: AsRef<Path>>(path: P) -> Result<Vec<u8>, ErrorKind> {
482    let mut f = match File::open(path.as_ref()) {
483        Ok(f) => f,
484        Err(err) => {
485            return Err(ErrorKind::InvalidPath(format!(
486                "Cannot open {:?}: {:?}",
487                path.as_ref(),
488                err
489            )));
490        }
491    };
492    let mut result = vec![];
493    f.read_to_end(&mut result).unwrap();
494    Ok(result)
495}
496
497/// Extract blob data (raw file content) from internal Mercurial representation.
498/// This representation is by default returned by [ChangesetIter](struct.ChangesetIter.html) iterator.
499/// ```
500/// # use hg_parser::file_content;
501/// let blob_with_meta = b"\x01\nmeta information\x01\nraw body";
502///
503/// let blob = file_content(blob_with_meta);
504///
505/// assert_eq!(blob, b"raw body");
506///
507/// assert_eq!(b"without meta", file_content(b"without meta"));
508/// ```
509pub fn file_content(data: &[u8]) -> &[u8] {
510    let (_, off) = extract_meta(data);
511    &data[off..]
512}
513
/// Byte sequence that opens and closes the metadata section of a file blob.
const META_MARKER: &[u8] = b"\x01\n";
/// Length of `META_MARKER` in bytes.
const META_SZ: usize = 2;

/// Splits the metadata section off a file blob.
///
/// Returns the metadata bytes (without markers) and the offset at which the file body
/// starts. Blobs that do not begin with `META_MARKER` yield empty metadata and offset `0`.
fn extract_meta(file: &[u8]) -> (&[u8], usize) {
    if !file.starts_with(META_MARKER) {
        return (&[], 0);
    }

    let closing_marker = file[META_SZ..]
        .windows(META_SZ)
        .position(|window| window == META_MARKER);

    match closing_marker {
        Some(idx) => {
            // `idx` is relative to the end of the opening marker; the body begins after
            // the closing marker.
            let body_offset = idx + META_SZ * 2;
            (&file[META_SZ..body_offset - META_SZ], body_offset)
        }
        // XXX malformed - unterminated metadata: only the opening marker is skipped.
        None => (&[], META_SZ),
    }
}
539
540fn split_dict(dleft: &Manifest, dright: &Manifest, f: &mut Vec<Vec<u8>>) {
541    for (left, linfo) in &dleft.files {
542        let right = dright.files.get(left);
543        if right.is_none() || right.unwrap() != linfo {
544            f.push(left.clone());
545        }
546    }
547
548    for right in dright.files.keys() {
549        let left = dleft.files.get(right);
550        if left.is_none() {
551            f.push(right.clone());
552        }
553    }
554}