Skip to main content

scrump_format_hprof/
lib.rs

//! Java HPROF heap-dump handler.
//!
//! HPROF format (per the Eclipse Memory Analyzer reference and the
//! hprof_b_spec.h header that ships with the JDK):
//!
//!   File header (variable length, big-endian):
//!     - NUL-terminated ASCII format string, e.g. `JAVA PROFILE 1.0.2`
//!     - `u32 id_size`                       (size of object IDs in bytes)
//!     - `u32 timestamp_hi`
//!     - `u32 timestamp_lo`
//!
//!   Repeated thereafter, one record per iteration (big-endian fields):
//!     - `u8  tag`
//!     - `u32 ts_delta`     (micros since file start)
//!     - `u32 length`
//!     - `u8[length]` body
//!
//! We emit each record body as a separate [`Chunk`], with an
//! appropriate origin label per known tag. Special-case tag 0x01
//! (UTF8 STRING) which has the layout `id (id_size bytes) +
//! utf8 string` — we surface the utf8 portion as
//! `ChunkOrigin::StringTable("hprof.utf8")` so the engine concentrates
//! on what's almost certainly a leak vector.
//!
//! Redaction is byte-level zero-fill at absolute file offsets. The
//! header, every record's tag/ts_delta/length triplet, and the segment
//! structure of HEAP DUMP SEGMENT records remain untouched — only payload
//! bytes the engine flagged get zeroed. JVMs and analyzers tolerate that
//! gracefully (zeroed string data renders as control characters, but
//! the file remains structurally valid).
31
32use std::io::Read;
33use std::path::Path;
34
35use byteorder::{BigEndian, ByteOrder};
36use scrump_core::{
37    apply_hits_in_place, Chunk, ChunkOrigin, Format, Handler, Hit, Result, ScrumpError,
38};
39
/// Leading bytes every supported HPROF file must start with. Only this prefix
/// is matched; whatever follows it in the format string (e.g. ` 1.0.2`) is
/// not inspected.
const MIN_MAGIC_PREFIX: &[u8] = b"JAVA PROFILE";
41
/// Location of one HPROF record's body inside the in-memory file image.
#[derive(Debug, Clone)]
struct RecordRange {
    /// Record type tag (the first byte of the record header).
    tag: u8,
    /// Absolute file offset at which the body starts, i.e. just past the
    /// 9-byte `tag` + `ts_delta` + `length` record header.
    body_offset: u64,
    /// Number of body bytes, as declared by the record header.
    body_len: u64,
    /// Object-ID width taken from the file header. It only matters for
    /// tag 0x01 (UTF8 STRING) records, whose body begins with an `id_size`-byte
    /// string id that is not scannable text.
    id_size: u32,
}
52
53pub struct Hprof {
54    bytes: Vec<u8>,
55    records: Vec<RecordRange>,
56}
57
58impl Hprof {
59    pub fn open_path(path: &Path) -> Result<Self> {
60        let mut f = std::fs::File::open(path)?;
61        let mut bytes = Vec::new();
62        f.read_to_end(&mut bytes)?;
63        Self::from_bytes(bytes)
64    }
65
66    pub fn from_bytes(bytes: Vec<u8>) -> Result<Self> {
67        if !bytes.starts_with(MIN_MAGIC_PREFIX) {
68            return Err(ScrumpError::InvalidFile(
69                "HPROF: missing 'JAVA PROFILE' magic prefix".into(),
70            ));
71        }
72        // Find the NUL terminator of the format string.
73        let nul = bytes
74            .iter()
75            .position(|&b| b == 0)
76            .ok_or_else(|| ScrumpError::InvalidFile("HPROF: missing NUL in header".into()))?;
77        // After NUL: u32 id_size, u32 ts_hi, u32 ts_lo
78        let header_after = nul + 1;
79        if bytes.len() < header_after + 12 {
80            return Err(ScrumpError::InvalidFile(
81                "HPROF: truncated header (need id_size + timestamp)".into(),
82            ));
83        }
84        let id_size = BigEndian::read_u32(&bytes[header_after..header_after + 4]);
85        if !(1..=16).contains(&id_size) {
86            return Err(ScrumpError::InvalidFile(format!(
87                "HPROF: implausible id_size {id_size}"
88            )));
89        }
90        let mut cursor = (header_after + 12) as u64;
91
92        let mut records = Vec::new();
93        while (cursor as usize) + 9 <= bytes.len() {
94            let off = cursor as usize;
95            let tag = bytes[off];
96            let length = BigEndian::read_u32(&bytes[off + 5..off + 9]) as u64;
97            let body_offset = cursor + 9;
98            let body_end = body_offset + length;
99            if (body_end as usize) > bytes.len() {
100                return Err(ScrumpError::InvalidFile(format!(
101                    "HPROF: record at {off:#x} (tag {tag:#x}, length {length}) extends past EOF ({} bytes)",
102                    bytes.len()
103                )));
104            }
105            records.push(RecordRange {
106                tag,
107                body_offset,
108                body_len: length,
109                id_size,
110            });
111            cursor = body_end;
112        }
113
114        Ok(Self { bytes, records })
115    }
116}
117
/// Maps an HPROF record tag to the symbolic name used in chunk origin labels.
///
/// Tags that are not in the table yield `"HPROF_UNKNOWN"`.
fn tag_label(tag: u8) -> &'static str {
    const KNOWN_TAGS: &[(u8, &'static str)] = &[
        (0x01, "HPROF_UTF8"),
        (0x02, "HPROF_LOAD_CLASS"),
        (0x03, "HPROF_UNLOAD_CLASS"),
        (0x04, "HPROF_FRAME"),
        (0x05, "HPROF_TRACE"),
        (0x06, "HPROF_ALLOC_SITES"),
        (0x07, "HPROF_HEAP_SUMMARY"),
        (0x0A, "HPROF_START_THREAD"),
        (0x0B, "HPROF_END_THREAD"),
        (0x0C, "HPROF_HEAP_DUMP"),
        (0x0D, "HPROF_CPU_SAMPLES"),
        (0x0E, "HPROF_CONTROL_SETTINGS"),
        (0x1C, "HPROF_HEAP_DUMP_SEGMENT"),
        (0x2C, "HPROF_HEAP_DUMP_END"),
    ];

    KNOWN_TAGS
        .iter()
        .find(|&&(code, _)| code == tag)
        .map_or("HPROF_UNKNOWN", |&(_, label)| label)
}
137
138impl Format for Hprof {
139    fn name(&self) -> &'static str {
140        "hprof"
141    }
142
143    fn chunks<'a>(&'a self) -> Box<dyn Iterator<Item = Chunk<'a>> + 'a> {
144        let mut out: Vec<Chunk<'a>> = Vec::new();
145        for r in &self.records {
146            if r.body_len == 0 {
147                continue;
148            }
149            let from = r.body_offset as usize;
150            let to = from + r.body_len as usize;
151            if to > self.bytes.len() {
152                continue;
153            }
154            // UTF8 STRING: skip the id at the start, yield the rest as a tight
155            // StringTable chunk; the whole body is also yielded as a generic
156            // chunk for redundancy in case the id_size assumption is off.
157            if r.tag == 0x01 && r.body_len > r.id_size as u64 {
158                let s_from = from + r.id_size as usize;
159                out.push(Chunk {
160                    bytes: &self.bytes[s_from..to],
161                    offset: s_from as u64,
162                    origin: ChunkOrigin::StringTable("hprof.utf8".into()),
163                });
164            }
165            out.push(Chunk {
166                bytes: &self.bytes[from..to],
167                offset: r.body_offset,
168                origin: ChunkOrigin::Section(format!("hprof.{}", tag_label(r.tag))),
169            });
170        }
171        Box::new(out.into_iter())
172    }
173
174    fn apply(&mut self, hits: &[Hit]) -> Result<()> {
175        apply_hits_in_place(&mut self.bytes, hits)
176    }
177
178    fn to_bytes(&self) -> Result<Vec<u8>> {
179        Ok(self.bytes.clone())
180    }
181}
182
183// ---- handler ---------------------------------------------------------------
184
185fn detect(head: &[u8], _path: &Path) -> bool {
186    head.starts_with(MIN_MAGIC_PREFIX)
187}
188
189fn open_path(path: &Path) -> Result<Box<dyn Format>> {
190    Ok(Box::new(Hprof::open_path(path)?))
191}
192
193fn open_bytes(bytes: Vec<u8>, _hint: Option<&Path>) -> Result<Box<dyn Format>> {
194    Ok(Box::new(Hprof::from_bytes(bytes)?))
195}
196
197pub fn handler() -> Handler {
198    Handler {
199        name: "hprof",
200        detect,
201        open_path,
202        open_bytes,
203    }
204}
205
#[cfg(test)]
mod tests {
    use super::*;
    use scrump_core::Replacement;

    /// Appends one record to `buf`: tag, zero `ts_delta`, big-endian body
    /// length, then the body itself.
    fn push_record(buf: &mut Vec<u8>, tag: u8, body: &[u8]) {
        buf.push(tag);
        buf.extend_from_slice(&0u32.to_be_bytes());
        buf.extend_from_slice(&(body.len() as u32).to_be_bytes());
        buf.extend_from_slice(body);
    }

    /// Build a tiny hprof file containing one UTF8 string record followed by
    /// an empty HEAP DUMP END record.
    fn synth_hprof(planted: &str) -> Vec<u8> {
        let mut file = b"JAVA PROFILE 1.0.2\0".to_vec();
        // id_size = 8, then ts_hi = 0 and ts_lo = 0.
        for word in [8u32, 0, 0] {
            file.extend_from_slice(&word.to_be_bytes());
        }

        // UTF8 STRING (tag 0x01): 8-byte string id followed by the text.
        let mut utf8_body = 42u64.to_be_bytes().to_vec();
        utf8_body.extend_from_slice(planted.as_bytes());
        push_record(&mut file, 0x01, &utf8_body);

        // Heap dump end record (tag 0x2C, empty).
        push_record(&mut file, 0x2C, &[]);
        file
    }

    #[test]
    fn detect_recognises_magic() {
        let hprof_head: &[u8] = b"JAVA PROFILE 1.0.2\0xxx";
        let other_head: &[u8] = b"random";
        assert!(detect(hprof_head, Path::new("/x/a.hprof")));
        assert!(!detect(other_head, Path::new("/x/a")));
    }

    #[test]
    fn parses_synthetic_hprof() {
        let token = "ghp_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        let needle = token.as_bytes();
        let image = synth_hprof(token);
        let original_len = image.len();
        let mut dump = Hprof::from_bytes(image).unwrap();

        // The UTF8 record must surface as a StringTable chunk (the empty
        // HEAP_DUMP_END record yields no chunk at all).
        let saw_string = dump
            .chunks()
            .any(|c| matches!(&c.origin, ChunkOrigin::StringTable(s) if s == "hprof.utf8"));
        assert!(saw_string);

        // Locate the planted token and zero-fill exactly that span.
        let token_at = dump
            .bytes
            .windows(needle.len())
            .position(|w| w == needle)
            .unwrap() as u64;
        let hit = Hit {
            offset: token_at,
            len: token.len(),
            rule_id: "x".into(),
            verified: None,
            replacement: Replacement::ZeroFill,
            origin: ChunkOrigin::StringTable("hprof.utf8".into()),
        };
        dump.apply(&[hit]).unwrap();

        // Redaction keeps the size and the magic, and removes the token.
        let redacted = dump.to_bytes().unwrap();
        assert_eq!(redacted.len(), original_len);
        assert!(!redacted.windows(needle.len()).any(|w| w == needle));
        assert!(redacted.starts_with(MIN_MAGIC_PREFIX));
    }
}