rusty_axml/chunks/
string_pool.rs

1#![allow(dead_code)]
2
3//! String pool
4//!
//! The string pool is the set of strings used in the AXML files. All
//! of these strings can then be referenced by the chunks. This reduces
//! the size of the binary XML, as strings no longer need to be
//! duplicated.
9
10use crate::{
11    chunks::{
12        chunk_header::ChunkHeader,
13        chunk_types::ChunkType,
14    },
15    errors::AxmlError
16};
17
18use std::io::{
19    Read,
20    Cursor,
21};
22
23use byteorder::{
24    LittleEndian,
25    ReadBytesExt
26};
27
/// String pool structure
///
/// The data of the string pool is an array of `u32` that provides the
/// indices in the pool. The pool itself is located at `strings_start`
/// offset. Each item of the pool is composed of:
///      - the string length (16 bits, more details below)
///      - the string (in UTF-16 format)
///      - a terminator (`0x0000`)
///
/// The length is 16 bits long, but the system only uses 15 bits,
/// which means that the maximum length of a string is 32,767
/// characters. If a string has more than 32,767 characters, the high
/// bit of the length is set and the 15 remaining bits represent the
/// high word of the total length. In this case, the length will be
/// immediately followed by another 16 bits which represent the low
/// end of the string length. This means the format allows for string
/// lengths up to 2,147,483,647 characters.
///
/// When the `is_utf8` flag is set, each item is instead parsed as:
///      - a character count (one byte, or two bytes when the high bit
///        of the first byte is set)
///      - a byte length (same one-or-two byte encoding)
///      - the UTF-8 bytes of the string
///      - a terminator (`0x00`)
///
/// If `style_count` is not zero, then immediately following the array
/// of indices into the string table is another array of indices into
/// a style table starting at `styles_start`. Each entry in the style
/// table is an array of `string_pool_span` structures.
#[derive(Debug)]
pub struct StringPool {
    /// Chunk header
    header: ChunkHeader,

    /// Number of strings in this pool (that is, number of `u32`
    /// indices that follow in the data)
    string_count: u32,

    /// Number of style span arrays in the pool (that is, number
    /// of `u32` indices that follow the string indices)
    style_count: u32,

    /// Flags. There are two possible flags:
    ///     - `is_sorted` (bit 0 of the flags word): if set, the string
    ///       pool is sorted by UTF-16 string values
    ///     - `is_utf8` (bit 8 of the flags word): if set, the string
    ///       pool is encoded in UTF-8 and not UTF-16
    is_sorted: bool,
    is_utf8: bool,

    /// Offset from the header to the string data
    strings_start: u32,

    /// Offset from the header to the style data
    styles_start: u32,

    /// List of strings offsets, in the order they appear in the chunk
    strings_offsets: Vec<u32>,

    /// List of styles offsets, in the order they appear in the chunk
    styles_offsets: Vec<u32>,

    /// The strings from the pool
    ///
    /// This is a copy of the caller-supplied global string list taken
    /// once the strings of this pool have been appended to it.
    strings: Vec<String>,

    /// The styles from the pool (one span is read per style offset)
    styles: Vec<StringPoolSpan>,
}
89
90impl StringPool {
91    /// Parse the string pool from the raw data
92    pub fn from_buff(axml_buff: &mut Cursor<Vec<u8>>,
93                 global_strings: &mut Vec<String>) -> Result<Self, AxmlError> {
94
95        // Go back 2 bytes, to account from the block type
96        let initial_offset = axml_buff.position() - 2;
97        axml_buff.set_position(initial_offset);
98        let initial_offset = initial_offset as u32;
99
100        // Parse chunk header
101        let header = ChunkHeader::from_buff(axml_buff, ChunkType::ResStringPoolType)?;
102
103        // Get remaining members
104        let string_count = axml_buff.read_u32::<LittleEndian>()?;
105        let style_count = axml_buff.read_u32::<LittleEndian>()?;
106        let flags = axml_buff.read_u32::<LittleEndian>()?;
107        let is_sorted = (flags & (1<<0)) != 0;
108        let is_utf8 = (flags & (1<<8)) != 0;
109        let strings_start = axml_buff.read_u32::<LittleEndian>()?;
110        let styles_start = axml_buff.read_u32::<LittleEndian>()?;
111
112        // Get strings offsets
113        let mut strings_offsets = Vec::new();
114        for _ in 0..string_count {
115            let offset = axml_buff.read_u32::<LittleEndian>()?;
116            strings_offsets.push(offset);
117        }
118
119        // Get styles offsets
120        let mut styles_offsets = Vec::new();
121        for _ in 0..style_count {
122            let offset = axml_buff.read_u32::<LittleEndian>()?;
123            styles_offsets.push(offset);
124        }
125
126        // Strings
127        for offset in strings_offsets.iter() {
128            // let current_start = (strings_start + offset + 8) as u64;
129            let current_start = (initial_offset + strings_start + offset) as u64;
130            axml_buff.set_position(current_start);
131
132            // let char_count: u16; // This will be handled per branch
133            let decoded_string: String;
134
135            if is_utf8 {
136                // Read UTF-8 character count (spec calls this "UTF-16 length")
137                // This is the number of characters, not bytes.
138                let mut first_byte_char_count = axml_buff.read_u8()? as u16;
139
140                // Mark as unused for now, but read it as per spec.
141                let _utf8_char_count = if (first_byte_char_count & 0x80) != 0 {
142                    first_byte_char_count &= 0x7F; // Mask out the high bit
143                    (first_byte_char_count << 8) | (axml_buff.read_u8()? as u16)
144                } else {
145                    first_byte_char_count
146                };
147
148                // Read UTF-8 byte length (spec calls this "UTF-8 length")
149                let mut first_byte_byte_len = axml_buff.read_u8()? as u16;
150                // Renamed from _encoded_size
151                let byte_len: u16 = if (first_byte_byte_len & 0x80) != 0 {
152                    first_byte_byte_len &= 0x7F; // Mask out the high bit
153                    (first_byte_byte_len << 8) | (axml_buff.read_u8()? as u16)
154                } else {
155                    first_byte_byte_len
156                };
157
158                // Use byte_len to read the string data
159                let mut str_buff = Vec::with_capacity(byte_len as usize);
160                let mut chunk = axml_buff.take(byte_len as u64);
161
162                chunk.read_to_end(&mut str_buff)?;
163                decoded_string = String::from_utf8(str_buff)?;
164                axml_buff.read_u8()?; // Consume UTF-8 null terminator (0x00)
165                                      // TODO: According to AOSP comments for resources.arsc, UTF-8 strings might have 2 or 4 byte terminators.
166                                      // This needs verification if parsing resources.arsc strings yields errors or incorrect cursor positions.
167            } else { // UTF-16
168                let char_count = axml_buff.read_u16::<LittleEndian>()?; // UTF-16 length in characters
169                let actual_decoded_string = if char_count > 0 {
170                    let mut str_chars = Vec::with_capacity(char_count as usize);
171                    for _ in 0..char_count {
172                        // It's possible for a read error here if char_count is erroneously large.
173                        // Consider adding error handling or further validation if issues arise.
174                        str_chars.push(axml_buff.read_u16::<LittleEndian>()?);
175                    }
176                    std::char::decode_utf16(str_chars.into_iter())
177                                         .collect::<Result<String, _>>()?
178                } else {
179                    String::new()
180                };
181                axml_buff.read_u16::<LittleEndian>()?; // Consume UTF-16 null terminator (0x0000)
182                decoded_string = actual_decoded_string;
183            }
184
185            // All strings, including empty ones, are added to the pool.
186            global_strings.push(decoded_string);
187        }
188
189        let mut styles = Vec::new();
190        for offset in styles_offsets.iter() {
191            let current_start = (initial_offset + strings_start + offset) as u64;
192            axml_buff.set_position(current_start);
193
194            let string_pool_ref = StringPoolRef {
195                index: axml_buff.read_u32::<LittleEndian>()?
196            };
197            let first_char = axml_buff.read_u32::<LittleEndian>()?;
198            let last_char = axml_buff.read_u32::<LittleEndian>()?;
199
200            styles.push(StringPoolSpan {
201                name: string_pool_ref,
202                first_char,
203                last_char
204            });
205        }
206
207        Ok(StringPool {
208            header,
209            string_count,
210            style_count,
211            is_sorted,
212            is_utf8,
213            strings_start,
214            styles_start,
215            strings_offsets,
216            styles_offsets,
217            strings: global_strings.to_vec(),
218            styles
219        })
220    }
221}
222
/// Reference to a string in a string pool.
///
/// When parsing a style span, the reference is read as one little-endian
/// `u32`.
#[derive(Debug)]
struct StringPoolRef {
    /// Index into the string pool table (uint32_t-offset from the indices
    /// immediately after ResStringPool_header) at which to find the location
    /// of the string data in the pool.
    index: u32,
}
231
/// String pool span
///
/// This structure defines a span of style information associated
/// with a string in the pool. When parsed, a span is read as three
/// consecutive little-endian `u32` values: the name reference, then
/// `first_char`, then `last_char`.
#[derive(Debug)]
struct StringPoolSpan {
    /// Name of the span
    ///
    /// This is the name of the XML tag that defined it.
    /// There is a special value END (0xFFFFFFFF) that indicates the
    /// end of an array of spans.
    name: StringPoolRef,

    /// The first of the characters in the string that this span applies to
    first_char: u32,

    /// The last of the characters in the string that this span applies to
    last_char: u32,
}
251
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{ Cursor, Write };
    use byteorder::{LittleEndian, WriteBytesExt};

    /// Flags word with only the "sorted" bit set
    const FLAG_SORTED: u32 = 1;

    /// Flags word with only the "UTF-8" bit set
    const FLAG_UTF8: u32 = 256;

    /// Append each value as a little-endian `u32`
    fn put_u32s(bytes: &mut Vec<u8>, values: &[u32]) {
        for &value in values {
            bytes.write_u32::<LittleEndian>(value).unwrap();
        }
    }

    /// Build the fixed part of a string pool chunk without any style:
    /// the chunk header (type, header size of 8, chunk size) followed by
    /// the five `u32` fields of the string pool header.
    fn pool_prefix(chunk_size: u32,
                   string_count: u32,
                   flags: u32,
                   strings_start: u32,
                   styles_start: u32) -> Vec<u8> {
        let mut bytes = Vec::new();

        // Chunk header
        bytes.write_u16::<LittleEndian>(0x0001).unwrap(); // ChunkType::ResStringPoolType
        bytes.write_u16::<LittleEndian>(8).unwrap();      // Chunk header size
        bytes.write_u32::<LittleEndian>(chunk_size).unwrap();

        // String pool header (style_count is always 0 in these tests)
        put_u32s(&mut bytes, &[string_count, 0, flags, strings_start, styles_start]);

        bytes
    }

    /// Append one UTF-16 pool item: length, code units, null terminator
    fn put_utf16_item(bytes: &mut Vec<u8>, text: &str) {
        let units: Vec<u16> = text.encode_utf16().collect();

        bytes.write_u16::<LittleEndian>(units.len() as u16).unwrap();
        for unit in units {
            bytes.write_u16::<LittleEndian>(unit).unwrap();
        }
        bytes.write_u16::<LittleEndian>(0x0000).unwrap();
    }

    /// Append a UTF-8 length prefix using the two-byte form (high bit of
    /// the first byte set)
    fn put_two_byte_len(bytes: &mut Vec<u8>, len: u16) {
        bytes.write_u8(0x80 | ((len >> 8) & 0x7F) as u8).unwrap();
        bytes.write_u8((len & 0xFF) as u8).unwrap();
    }

    /// Sorted UTF-16 pool holding the two strings "Hello" and "World"
    fn hello_world_pool() -> Vec<u8> {
        // strings_start = 8 (chunk header) + 20 (pool header) + 8 (offsets)
        let mut bytes = pool_prefix(128, 2, FLAG_SORTED, 36, 20);

        // The first item takes 2 + 5 * 2 + 2 = 14 bytes
        put_u32s(&mut bytes, &[0, 14]);

        put_utf16_item(&mut bytes, "Hello");
        put_utf16_item(&mut bytes, "World");

        bytes
    }

    /// Run `from_buff` the way its caller does: the chunk type has
    /// already been read when the function is called.
    fn parse_pool(bytes: Vec<u8>) -> StringPool {
        let mut cursor = Cursor::new(bytes);
        cursor.read_u16::<LittleEndian>().unwrap();

        let mut global_strings = Vec::new();
        StringPool::from_buff(&mut cursor, &mut global_strings).unwrap()
    }

    #[test]
    fn test_string_pool_parse_utf16() {
        let string_pool = parse_pool(hello_world_pool());

        // Both strings must come out in the order of the offsets table
        assert_eq!(string_pool.strings.len(), 2);
        assert_eq!(string_pool.strings[0], "Hello");
        assert_eq!(string_pool.strings[1], "World");
    }

    #[test]
    fn test_string_pool_flags() {
        let string_pool = parse_pool(hello_world_pool());

        // Only the "sorted" bit is set in the fixture
        assert!(string_pool.is_sorted);
        assert!(!string_pool.is_utf8);
    }

    #[test]
    fn test_empty_pool() {
        // No string, no style, no flag
        let bytes = pool_prefix(128, 0, 0, 32, 20);

        let string_pool = parse_pool(bytes);

        assert_eq!(string_pool.strings.len(), 0);
    }

    #[test]
    fn test_utf8_string_parsing() {
        // strings_start = 8 (chunk header) + 20 (pool header) + 4 (offset)
        let mut bytes = pool_prefix(128, 1, FLAG_UTF8, 32, 20);
        put_u32s(&mut bytes, &[0]);                // Offset of the string

        bytes.write_u8(0x05).unwrap();             // Character count
        bytes.write_u8(0x05).unwrap();             // Length in bytes
        bytes.write_all(b"Hello").unwrap();        // UTF-8 string data
        bytes.write_u8(0x00).unwrap();             // Null terminator

        let string_pool = parse_pool(bytes);

        assert_eq!(string_pool.strings.len(), 1);
        assert_eq!(string_pool.strings[0], "Hello");
    }

    #[test]
    fn test_long_utf8_string_parsing() {
        let long_string = "A".repeat(150);
        let char_count = 150u16; // 0x96, does not fit on 7 bits
        let byte_len = 150u16;

        let string_count: u32 = 1;

        // Sizes of the different parts of the chunk
        let pool_fields_size: u32 = 5 * 4;
        let offsets_size: u32 = string_count * 4;
        // Two 2-byte length prefixes, the string itself, the terminator
        let item_size: u32 = 2 + 2 + u32::from(byte_len) + 1;

        // 20 + 4 + 155 = 179 bytes following the 8-byte chunk header
        let chunk_size = pool_fields_size + offsets_size + item_size;
        // The string data starts right after the offsets table: 8 + 20 + 4 = 32
        let strings_start = 8 + pool_fields_size + offsets_size;

        // No style in this pool, `styles_start` is left to 0
        let mut bytes = pool_prefix(chunk_size, string_count, FLAG_UTF8, strings_start, 0);

        // The only string sits at the very beginning of the string data
        put_u32s(&mut bytes, &[0]);

        put_two_byte_len(&mut bytes, char_count);
        put_two_byte_len(&mut bytes, byte_len);
        bytes.write_all(long_string.as_bytes()).unwrap();
        bytes.write_u8(0x00).unwrap();             // Null terminator

        let string_pool = parse_pool(bytes);

        assert_eq!(string_pool.strings.len(), 1);
        assert_eq!(string_pool.strings[0], long_string);
        assert_eq!(string_pool.strings[0].len(), 150);
        assert_eq!(string_pool.string_count, 1);
        assert!(string_pool.is_utf8);
    }
}
473}