palmdoc_compression/
palmdoc.rs

1use crate::{
2    hashtable::HashTable,
3    window::{Window, MAX_MATCH_LEN},
4};
5use thiserror::Error;
6
7pub fn compress(data: &[u8]) -> Vec<u8> {
8    let mut out = Vec::with_capacity(data.len());
9
10    let mut window = Window::new();
11    let mut table = HashTable::new();
12
13    let mut offset = 0;
14    while offset < data.len() {
15        let remainder = &data[offset..];
16        if remainder.len() > 3 {
17            let hash = table.hash(&remainder[..3]);
18            table.insert(hash, window.position as u16);
19            if let Some((distance, length)) = table.reference(hash, remainder, &window, offset) {
20                // todo: this matches Calibre behavior where it doesn't encode length distance pairs that are close to the beginning or end of the data, but is this an actual PalmDoc limitation?
21                if MAX_MATCH_LEN < offset && offset < data.len() - MAX_MATCH_LEN {
22                    let m = distance as u16;
23                    let code = 0x8000 + ((m << 3) & 0x3ff8) + ((length as u16) - 3);
24                    out.extend(&code.to_be_bytes());
25
26                    for _ in 0..length {
27                        if offset + 3 < data.len() {
28                            let hash = table.hash(&data[offset..offset + 3]);
29                            table.insert(hash, window.position as u16);
30                        }
31                        window.push(data[offset]);
32                        offset += 1;
33                    }
34
35                    continue;
36                }
37            }
38        }
39
40        // Single byte encoding or special cases handling
41        let byte = data[offset];
42        offset += 1;
43        window.push(byte);
44
45        if byte == b' ' && offset + 1 < data.len() && (0x40..0x80).contains(&data[offset]) {
46            out.push(data[offset] ^ 0x80);
47
48            if offset + 3 < data.len() {
49                table.insert(table.hash(&data[offset..offset + 3]), offset as u16);
50            }
51
52            window.push(data[offset]);
53            offset += 1;
54            continue;
55        }
56
57        if byte == 0 || (byte > 8 && byte < 0x80) {
58            out.push(byte);
59        } else {
60            let mut j = offset;
61            let mut binseq = Vec::with_capacity(8);
62            binseq.push(byte);
63
64            while j < data.len() && binseq.len() < 8 {
65                let ch = data[j];
66                if ch == 0 || (ch > 8 && ch < 0x80) {
67                    break;
68                }
69
70                binseq.push(ch);
71
72                if j + 3 < data.len() {
73                    table.insert(table.hash(&data[j..j + 3]), j as u16);
74                }
75                window.push(ch);
76
77                j += 1;
78            }
79
80            out.extend(&(binseq.len() as u8).to_be_bytes());
81            out.extend(&binseq);
82            offset += binseq.len() - 1;
83        }
84    }
85
86    out
87}
88
89#[derive(Error, Debug)]
90pub enum DecompressError {
91    #[error("offset to LZ77 bits is outside of the data")]
92    OffsetOutsideData,
93    #[error("LZ77 decompression offset is invalid")]
94    InvalidOffset,
95    #[error("not enough data")]
96    NotEnoughData,
97}
98
99pub fn decompress(data: &[u8]) -> Result<Vec<u8>, DecompressError> {
100    // Adapted from https://metacpan.org/release/AZED/EBook-Tools-0.3.3/source/lib/EBook/Tools/PalmDoc.pm
101    let len = data.len();
102    let mut offset = 0;
103    let mut uncompressed = Vec::new();
104
105    while offset < len {
106        let byte = data[offset];
107        offset += 1;
108
109        if (1..=8).contains(&byte) {
110            // Next bytes are literals
111            if offset + byte as usize > len {
112                return Err(DecompressError::NotEnoughData);
113            }
114
115            uncompressed.extend_from_slice(&data[offset..(offset + byte as usize)]);
116            offset += byte as usize;
117        } else if byte < 128 {
118            // Values from 0x09 through 0x7f are literal
119            uncompressed.push(byte);
120        } else if byte >= 192 {
121            // 0xc0 - 0xff are single characters (XOR 0x80) preceded by a space
122            uncompressed.push(b' ');
123            uncompressed.push(byte ^ 0x80);
124        } else if offset < len {
125            // Data is LZ77-compressed
126            offset += 1;
127
128            if offset > len {
129                return Err(DecompressError::OffsetOutsideData);
130            }
131
132            let mut lz77 = u16::from_be_bytes([data[offset - 2], data[offset - 1]]);
133
134            // Leftmost two bits are ID bits and need to be dropped
135            lz77 &= 0x3fff;
136
137            // Length is rightmost 3 bits + 3
138            let lz77length = ((lz77 & 0x0007) as usize) + 3;
139
140            // Remaining 11 bits are offset
141            let lz77offset = (lz77 >> 3) as usize;
142
143            // 0-run
144            if lz77offset == 0 {
145                let empty = vec![0u8; lz77length];
146                uncompressed.extend_from_slice(&empty);
147                continue;
148            }
149
150            // Getting text from the offset
151            let mut textlength = uncompressed.len();
152            for _ in 0..lz77length {
153                if let Some(textpos) = textlength.checked_sub(lz77offset) {
154                    uncompressed.push(uncompressed[textpos]);
155                    textlength += 1;
156                } else {
157                    return Err(DecompressError::InvalidOffset);
158                }
159            }
160        }
161    }
162
163    Ok(uncompressed)
164}
165
166#[cfg(test)]
167mod tests {
168    use lipsum::lipsum;
169    use pretty_assertions::assert_eq;
170
171    use super::*;
172
173    fn get_calibre_testcases() -> Vec<(Vec<u8>, Vec<u8>)> {
174        // Test cases taken from Calibre
175        // (input, compressed_result)
176        return vec![
177            (
178                hex::decode("616263030405066d73").unwrap(),
179                hex::decode("61626304030405066d73").unwrap(),
180            ),
181            (
182                hex::decode("612062206320fe6420").unwrap(),
183                hex::decode("61e2e32001fe6420").unwrap(),
184            ),
185            (
186                hex::decode("303132333435363738396178797a326278797a3263646667666f39697579657268")
187                    .unwrap(),
188                hex::decode("303132333435363738396178797a3262802963646667666f39697579657268")
189                    .unwrap(),
190            ),
191            (
192              hex::decode("30313233343536373839617364303132333435363738396173647c79797a7a7878666668686a6a6b6b").unwrap(),
193              hex::decode("30313233343536373839617364806f80687c79797a7a7878666668686a6a6b6b").unwrap()
194            ),
195            (
196              hex::decode("6369657761636e6171206569753734332072373837712030772520203b207361206664ef0c6664786f73616320776f636a702061636f6965636f776569206f77616963206a6f63696f7761706a636976636a706f69766a706f726569766a706f617663613b207039617738373433793672373425245e245e253820").unwrap(),
197              hex::decode("6369657761636e6171e56975373433f2373837712030772520203bf361e66401ef0c6664786f736163f76f636a70e1636f6965636f776569ef77616963ea6f63698050706a63697681086f697680287265803a617663613bf03961773882f0793672373425245e245e253820").unwrap()
198            ),
199            (
200                hex::decode("61626373646661736466616263646173646f66617373").unwrap(),
201                hex::decode("61626373646661736466616263646173646f66617373").unwrap(),
202            ),
203        ];
204    }
205
206    #[test]
207    fn test_compress_palmdoc() {
208        for (input, expected) in get_calibre_testcases() {
209            let compressed = compress(&input);
210            assert_eq!(compressed, expected);
211        }
212    }
213
214    #[test]
215    fn test_decompress_palmdoc() {
216        for (expected, compressed) in get_calibre_testcases() {
217            let decompressed = decompress(&compressed).unwrap();
218            assert_eq!(decompressed, expected);
219        }
220    }
221
222    #[test]
223    fn test_roundtrip() {
224        let input = lipsum(4096);
225        let input = input.as_bytes()[..4096].to_vec();
226
227        let compressed = compress(&input);
228        let decompressed = decompress(&compressed).unwrap();
229        assert_eq!(input, decompressed);
230    }
231
232    #[test]
233    fn test_null_run_length() {
234        let compressed = hex::decode("61726973f46ff6697302c2ad6974cd61646502c2ad6d6f6902c2ad73656c6c65c7656f7267653bf4686174e174e86572e86f7573658050e3616d65f57002c2ad6f6ec26f6e6102c2ad70617274652cf7686fe16c82206fe56e02c2ad6a6f796564821865e681206d81f8e16302c2ad747265737303e2809980b2766f72732ce16e81418340696ee884d870811802c2ad656e6365ce61706f6c6583506861839070656e82996f81e06c6c817082706fef6e65ef66834c696e7486986e67e669748722776869636885b9776173f3756284c065637483a38099746884c17481fa64756384c873ed657202c2ad63792ed480a86c61829074880873873883f068696d859585586d616702c2ad88216e696d83c881507988af74658393738af071756584d96c79f2807070616964e279e4656174682e3c2f703e0a0909093c70e36c6173733d2263616c69627265313722e169643d22374b344755223e848173746f02c2ad727985ea76658048897085117984aa88a285f96573884b2ce58a607083a063698ce984088774706f8968890065726580827285f076616c89316402c2ad648e1985728da06f8729697a87f08b696180d06f74688320723b8854893964696573ec6f6f6b813888b1886989e16564870f870f870f870e56223e03e2809c436861728a596e672103e2809df389514183906e61d002c3a1766c6f7680627769746883e8878a71756987908fb1676c6191b187146c6988c9968070726980b87373844f844f844f844d4830844f844f92488a4a902182ef82ef9640737469636b844b81116e658bb0648110954c9a31776f726be173e995886ff48ce17489486679980b81b98e1f739be88af86661813063813092b974699868977c905d90108dd893029509833866726f6de780f084618188897269748807880788078806319353768398636f6d96309be2832984318ea397fb73696c83b884086169a0e88569736d6993308c627261748198667593b382029a1b87d0636f8731875975a1d0627574ea75877886b06e8f778f76a2e3686164eb657088a897e87463940882e0e5799ff86e8882796f7583c06d90b08179a438616c8e18889b2cee88788d0885daa6a2a003616c8d9d6f6f95f897e19921865876858868a8c99dd58a328349619ea86202c3a98f908338808068758d78726986db9a628ec96375652ed06965729b0885e184c889f06781337374a1008650884a9e40a1308ef5833f833b80406f89088091629e19971a9018706f779c21a475a55f9fa089b88d589e2885d392a7a0b98410a3e0899784c8a7c273a621706c652d6d696e883881386584a06781c16e970a898965788121ab4889b3ad826584f2a0b094792ea808860877a2c28af685216c9a1174a2c98b8583068b9c80a16e8631758250a57b8fe1ae1983e177687990f7a0676482c195236f76a37f97379737973748329f3297496d65616eb028726506e2808ae280a68b578b578b52b5014584d16f70976ba9bb6768b3589c9c706588208a182ca1998e2701a9b3fb617987da2e2083c849b77881b8ab336eab3191a173619e80666f72ab4a83bc97439fee6c696b65d2758169696103e2809462618110802a6963a2d994899cd9a9c993486286b0019480408da086689850bf2a6689ab90c790c3b107815995a086b89a788c52698872686192d9bb098528bb216fba9c81a26d90c29fb96e89e58a6f8a6f8a6a80588a6d961b69b4706f756ca2206176b49ba9706c64abf18e978e978e978e95338e92429960686f778e819701882167950a86c0844e3f8d019ce48cd96285986785e16e8d01837f837f837f837d34223e4182c46d8fb0a13994f794f763616d65f57087192cb9028a0373ada2a8a98b9b846b885073ba0987f14999f081608c008679af836f648fde6e84e09f206dab302ed481879e6c668f798691a350a0346368616ea3f98b69b2a17375a8e982486f6693209cc17369a7a9848980926390aaa73875be8094d19feaa6ea739458a347a34568952862b8c98ec8616c8bd968696d9db8aeb0a74d93b267a72b776fa388656e8b878b878b878b86358efa49e16dabc185798689a687934972b3f1955980e87993fd836087a263b45974758f98ba0d8db8b6996574a3386d6f80c886d970b74aa3f3815566658550bc19699cc081bf81b8a0a0a50b4998e1af5399fa87819e496692ea87a1ba12658b188a9891ba819c6e6f74f9951881da699170b232696e6b83f58f4ea2ba9ed2686589b789b789b789b636223e4e83006c65b0e1abc2a4b788b29834b592619c619627bc079eea89098e3c88518fad85706b65657081216df580f8649ef1831073a2d176a5c4bf30726f75a9608143a142b96bbd2892f881a063698180636c86d63c3f786d6cbc387293693d22312e30228fb8636f6486383d225554462d38223f3e0a3c6874813881586e8818687474703a2f2f7777772e77332e6f72672f313939392f78812122835881d1656e2d5553223e0a20203ca729804980587469746c653e49563c2f805380b36d657461e882a82d65717569763d22436f6eb618742d5479706522887980713d22746578742f83293b9b39727365743d75746685082f83426c8e7172656c3d227374796c65ad20657422f4822081dc63737322e87265663d226bbbf06c653a666c6f773a303030313f6d90a03d81368288827f827f827f827f827f827f32827f827e2f883d3c626f64a14092779270926438494c323089a88cb873a0c8b451646174612d70ab4088098040742d312d3122e98170882070b2c880892d3422f26f6c84d0646f632d80cc89d0832b657075622d85c92d8bb2706167838f4c3231838a093c2f839c80703c2f852185aa8b708ff134858f97fb36821732bf308e996834995f816266697273748470696c64817732b2104a7573951ab7b09e09746894da76b972959972a3d28618ad34647261b6809efa6f6f6d3ad07269b7499a29647202c3a979c26f6c6b02c3b3aae902c2ad6b699a5b6ca9218778f081727373ad98e8aec9629c982ec8b4b36197e9aab16ebcd0736fa0d0b68883506d619a78a0e0adf884a0756da070699ad02caab386d86d2c874065617263b86066659c11a7b0732ec57602c2ad6572009b888001840287").unwrap();
235
236        let expected_data = hex::decode("6172697320746f20766973c2ad6974204d616465c2ad6d6f69c2ad73656c6c652047656f7267653b20746861742061742068657220686f7573652068652063616d65207570c2ad6f6e20426f6e61c2ad70617274652c2077686f20616cc2ad736f20656ec2ad6a6f79656420746865206661c2ad6d6f7573206163c2ad7472657373e28099206661c2ad766f72732c20616e64207468617420696e206869732070726573c2ad656e6365204e61706f6c656f6e20686170c2ad70656e656420746f2066616c6c20696ec2ad746f206f6e65206f6620746865206661696e74c2ad696e67206669747320746f2077686963682068652077617320737562c2ad6a6563742c20616e642077617320746875732061742074686520647563e2809973206d6572c2ad63792e20546865206c6174c2ad746572207370617265642068696d2c20616e642074686973206d6167c2ad6e61c2ad6e696dc2ad69c2ad747920426f6e61c2ad706172746520737562c2ad7365c2ad7175656e74c2ad6c79207265c2ad706169642062792064656174682e3c2f703e0a0909093c7020636c6173733d2263616c69627265313722206169643d22374b344755223e5468652073746fc2ad72792077617320766572792070726574c2ad747920616e6420696ec2ad746572c2ad657374c2ad696e672c206573c2ad7065c2ad6369616cc2ad6c792061742074686520706f696e7420776865726520746865207269c2ad76616c7320737564c2ad64656ec2ad6c7920726563c2ad6f67c2ad6e697a6564206f6e6520616ec2ad6f7468c2ad65723b20616e6420746865206c6164696573206c6f6f6b6564206167c2ad69c2ad746174c2ad65642e3c2f703e0a0909093c7020636c6173733d2263616c69627265313722206169643d22374b344756223ee2809c436861726dc2ad696e6721e2809d207361696420416ec2ad6e612050c3a1766c6f76c2ad6e61207769746820616e20696ec2ad71756972c2ad696e6720676c616e636520617420746865206c6974c2ad746c65207072696e636573732e3c2f703e0a0909093c7020636c6173733d2263616c69627265313722206169643d22374b344830223ee2809c436861726dc2ad696e6721e2809d2077686973c2ad706572656420746865206c6974c2ad746c65207072696e636573732c20737469636bc2ad696e6720746865206e6565c2ad646c6520696ec2ad746f2068657220776f726b20617320696620746f20746573c2ad7469c2ad667920746861742074686520696ec2ad746572c2ad65737420616e6420666173c2ad6369c2ad6e61c2ad74696f6e206f66207468652073746fc2ad727920707265c2ad76656e74c2ad6564206865722066726f6d20676fc2ad696e67206f6e20776974682069742e3c2f703e0a0909093c7020636c6173733d2263616c69627265313722206169643d22374b344831223e546865207669c2ad636f6d7465206170c2ad707265c2ad6369c2ad6174c2ad656420746869732073696c656e742070726169736520616e6420736d696cc2ad696e67206772617465c2ad66756cc2ad6c7920707265c2ad706172656420746f20636f6ec2ad74696ec2ad75652c20627574206a757374207468656e20416ec2ad6e612050c3a1766c6f76c2ad6e612c2077686f20686164206b6570742061207761746368c2ad66756c20657965206f6e2074686520796f756e67206d616e2077686f20736f20616c61726d6564206865722c206e6fc2ad74696365642074686174206865207761732074616c6bc2ad696e6720746f6f206c6f7564c2ad6c7920616e64207665c2ad6865c2ad6d656e74c2ad6c79207769746820746865206162c2ad62c3a92c20736f2073686520687572c2ad7269656420746f2074686520726573c2ad6375652e2050696572726520686164206d616ec2ad6167656420746f207374617274206120636f6ec2ad766572c2ad7361c2ad74696f6e207769746820746865206162c2ad62c3a92061626f7574207468652062616cc2ad616e6365206f6620706f77c2ad65722c20616e6420746865206c6174c2ad7465722c206576c2ad69c2ad64656e74c2ad6c7920696ec2ad746572c2ad657374c2ad65642062792074686520796f756e67206d616ee28099732073696dc2ad706c652d6d696e64c2ad6564206561c2ad676572c2ad6e6573732c20776173206578c2ad706c61696ec2ad696e67206869732070657420746865c2ad6fc2ad72792e20426f746820776572652074616c6bc2ad696e6720616e64206c6973c2ad74656ec2ad696e6720746f6f206561c2ad676572c2ad6c7920616e6420746f6f206e6174c2ad75c2ad72616cc2ad6c792c207768696368207761732077687920416ec2ad6e612050c3a1766c6f76c2ad6e6120646973c2ad6170c2ad70726f7665642e3c2f703e0a0909093c7020636c6173733d2263616c69627265313722206169643d22374b344832223ee2809c546865206d65616e7320617265e2808ae280a6207468652062616cc2ad616e6365206f6620706f77c2ad657220696e204575c2ad726f706520616e642074686520726967687473206f66207468652070656fc2ad706c652ce2809d20746865206162c2ad62c3a92077617320736179c2ad696e672e20e2809c4974206973206f6ec2ad6c79206e6563c2ad6573c2ad7361727920666f72206f6e6520706f77c2ad6572c2ad66756c206e61c2ad74696f6e206c696b6520527573c2ad736961e28094626172c2ad626172c2ad696320617320736865206973207361696420746f206265e28094746f20706c61636520686572c2ad73656c6620646973c2ad696ec2ad746572c2ad657374c2ad6564c2ad6c79206174207468652068656164206f6620616e20616cc2ad6c69616e636520686176c2ad696e6720666f7220697473206f62c2ad6a65637420746865206d61696ec2ad7465c2ad6e616e6365206f66207468652062616cc2ad616e6365206f6620706f77c2ad6572206f66204575c2ad726f70652c20616e6420697420776f756c6420736176652074686520776f726c6421e2809d3c2f703e0a0909093c7020636c6173733d2263616c69627265313722206169643d22374b344833223ee2809c42757420686f772061726520796f7520746f2067657420746861742062616cc2ad616e63653fe2809d2050696572726520776173206265c2ad67696ec2ad6e696e672e3c2f703e0a0909093c7020636c6173733d2263616c69627265313722206169643d22374b344834223e41742074686174206d6fc2ad6d656e7420416ec2ad6e612050c3a1766c6f76c2ad6e612063616d6520757020616e642c206c6f6f6bc2ad696e67207365c2ad76657265c2ad6c79206174205069657272652c2061736b656420746865204974616cc2ad69616e20686f772068652073746f6f6420527573c2ad7369616e20636c69c2ad6d6174652e20546865204974616cc2ad69616ee2809973206661636520696ec2ad7374616e74c2ad6c79206368616e67656420616e64206173c2ad73756d656420616e206f66c2ad66656ec2ad73697665c2ad6c79206166c2ad66656374c2ad65642c20737567c2ad617279206578c2ad70726573c2ad73696f6e2c206576c2ad69c2ad64656e74c2ad6c79206861c2ad626974c2ad75c2ad616c20746f2068696d207768656e20636f6ec2ad76657273c2ad696e67207769746820776f6dc2ad656e2e3c2f703e0a0909093c7020636c6173733d2263616c69627265313722206169643d22374b344835223ee2809c4920616d20736f20656ec2ad6368616e74c2ad656420627920746865206272696cc2ad6c69616ec2ad6379206f66207468652077697420616e642063756cc2ad74757265206f662074686520736fc2ad6369c2ad6574792c206d6f7265206573c2ad7065c2ad6369616cc2ad6c79206f66207468652066656dc2ad69c2ad6e696e6520736fc2ad6369c2ad6574792c20696e20776869636820492068617665206861642074686520686f6ec2ad6f72206f66206265c2ad696e67207265c2ad6365697665642c207468617420492068617665206e6f7420796574206861642074696d6520746f207468696e6b206f662074686520636c69c2ad6d6174652ce2809d20736169642068652e3c2f703e0a0909093c7020636c6173733d2263616c69627265313722206169643d22374b344836223e4e6f74206c6574c2ad74696e6720746865206162c2ad62c3a920616e6420506965727265206573c2ad636170652c20416ec2ad6e612050c3a1766c6f76c2ad6e612c20746865206d6f726520636f6ec2ad7665c2ad6e69656e74c2ad6c7920746f206b656570207468656d20756ec2ad646572206f62c2ad736572c2ad7661c2ad74696f6e2c2062726f75676874207468656d20696ec2ad746f20746865206c617267c2ad657220636972c2ad636c652e3c2f703e0a09093c3f786d6c2076657273696f6e3d22312e302220656e636f64696e673d225554462d38223f3e0a3c68746d6c20786d6c6e733d22687474703a2f2f7777772e77332e6f72672f313939392f7868746d6c22206c616e673d22656e2d5553223e0a20203c686561643e0a202020203c7469746c653e49563c2f7469746c653e0a202020203c6d65746120687474702d65717569763d22436f6e74656e742d547970652220636f6e74656e743d22746578742f68746d6c3b20636861727365743d7574662d38222f3e0a20203c6c696e6b2072656c3d227374796c6573686565742220747970653d22746578742f6373732220687265663d226b696e646c653a666c6f773a303030313f6d696d653d746578742f637373222f3e0a3c6c696e6b2072656c3d227374796c6573686565742220747970653d22746578742f6373732220687265663d226b696e646c653a666c6f773a303030323f6d696d653d746578742f637373222f3e0a3c2f686561643e0a20203c626f647920636c6173733d2263616c6962726522206169643d2238494c3230223e0a09093c73656374696f6e20646174612d706172656e743d22706172742d312d31222069643d22636861707465722d312d312d342220726f6c653d22646f632d636861707465722220636c6173733d22657075622d747970652d7469746c657061676522206169643d2238494c3231223e0a0909093c2f73656374696f6e3e0a093c2f626f64793e0a3c2f68746d6c3e0a3c683420636c6173733d2263616c69627265313622206169643d2238494c3232223e49563c2f68343e0a0909093c7020636c6173733d2266697273742d6368696c6422206169643d2238494c3233223e4a757374207468656e20616ec2ad6f7468c2ad657220766973c2ad69c2ad746f7220656ec2ad7465726564207468652064726177c2ad696e6720726f6f6d3a205072696e636520416ec2ad6472c3a97920426f6c6bc3b36ec2ad73c2ad6b692c20746865206c6974c2ad746c65207072696e63657373e2809920687573c2ad62616e642e20486520776173206120766572792068616e64c2ad736f6d6520796f756e67206d616e2c206f66206d656469c2ad756d206865696768742c2077697468206669726d2c20636c65617263757420666561c2ad74757265732e204576c2ad657200617267000000006c65207072").unwrap();
237
238        let decompressed = decompress(&compressed).unwrap();
239
240        assert_eq!(decompressed, expected_data);
241    }
242}