rusty_axml/chunks/string_pool.rs
1#![allow(dead_code)]
2
//! String pool
//!
//! The string pool is the set of strings used in the AXML files. Any
//! of these strings can then be referenced by the chunks. This reduces
//! the size of the binary XML, since strings no longer have to be
//! duplicated.
9
10use crate::{
11 chunks::{
12 chunk_header::ChunkHeader,
13 chunk_types::ChunkType,
14 },
15 errors::AxmlError
16};
17
18use std::io::{
19 Read,
20 Cursor,
21};
22
23use byteorder::{
24 LittleEndian,
25 ReadBytesExt
26};
27
28/// String pool structure
29///
30/// The data of the string pool is an array of `u32` that provides the
31/// indices in the pool. The pool itself is located at `strings_start`
32/// offset. Each item of the pool is composed of:
33/// - the string length (16 bits, more details below)
34/// - the string (in UTF-16 format)
35/// - a terminator (`0x0000`)
36///
37/// The length is 16 bits long, but the system only uses 15 bits,
38/// which means that the maximum length of a string is 32,676
39/// characters. If a string has more than 32767 characters, the high
40/// bit of the length is set and the 15 remaining bits represent the
41/// high word of the total length. In this case, the length will be
42/// immediately followed by another 16 bits which represent the low
43/// end of the string length. This means the format allows for string
44/// lengths up to 2,147,483,648 characters.
45///
46/// If `style_count` is not zero, then immediately following the array
47/// of indices into the string table is another array of indices into
48/// a style table starting at `styles_start`. Each entry in the style
49/// table is an array of `string_pool_span` structures.
50#[derive(Debug)]
51pub struct StringPool {
52 /// Chunk header
53 header: ChunkHeader,
54
55 /// Number of strings in this pool (that is, number of `u32`
56 /// indices that follow in the data)
57 string_count: u32,
58
59 /// Number of style span arrays in the pool (that is, number
60 /// of `u32` indices follow the string indices)
61 style_count: u32,
62
63 /// Flags. There are two possible flags:
64 /// - `is_sorted`: if set, the string pool is sorted by
65 /// UTF-16 string values
66 /// - `is_utf8`: if set, the string pool is encoded in
67 /// UTF-8 and not UTF-16
68 is_sorted: bool,
69 is_utf8: bool,
70
71 /// Offset from the header to the string data
72 strings_start: u32,
73
74 /// Offset from the header to the style data
75 styles_start: u32,
76
77 /// List of strings offsets
78 strings_offsets: Vec<u32>,
79
80 /// List of styles offsets
81 styles_offsets: Vec<u32>,
82
83 /// The strings from the pool
84 strings: Vec<String>,
85
86 /// The styles from the pool
87 styles: Vec<StringPoolSpan>,
88}
89
90impl StringPool {
91 /// Parse the string pool from the raw data
92 pub fn from_buff(axml_buff: &mut Cursor<Vec<u8>>,
93 global_strings: &mut Vec<String>) -> Result<Self, AxmlError> {
94
95 // Go back 2 bytes, to account from the block type
96 let initial_offset = axml_buff.position() - 2;
97 axml_buff.set_position(initial_offset);
98 let initial_offset = initial_offset as u32;
99
100 // Parse chunk header
101 let header = ChunkHeader::from_buff(axml_buff, ChunkType::ResStringPoolType)?;
102
103 // Get remaining members
104 let string_count = axml_buff.read_u32::<LittleEndian>()?;
105 let style_count = axml_buff.read_u32::<LittleEndian>()?;
106 let flags = axml_buff.read_u32::<LittleEndian>()?;
107 let is_sorted = (flags & (1<<0)) != 0;
108 let is_utf8 = (flags & (1<<8)) != 0;
109 let strings_start = axml_buff.read_u32::<LittleEndian>()?;
110 let styles_start = axml_buff.read_u32::<LittleEndian>()?;
111
112 // Get strings offsets
113 let mut strings_offsets = Vec::new();
114 for _ in 0..string_count {
115 let offset = axml_buff.read_u32::<LittleEndian>()?;
116 strings_offsets.push(offset);
117 }
118
119 // Get styles offsets
120 let mut styles_offsets = Vec::new();
121 for _ in 0..style_count {
122 let offset = axml_buff.read_u32::<LittleEndian>()?;
123 styles_offsets.push(offset);
124 }
125
126 // Strings
127 for offset in strings_offsets.iter() {
128 // let current_start = (strings_start + offset + 8) as u64;
129 let current_start = (initial_offset + strings_start + offset) as u64;
130 axml_buff.set_position(current_start);
131
132 // let char_count: u16; // This will be handled per branch
133 let decoded_string: String;
134
135 if is_utf8 {
136 // Read UTF-8 character count (spec calls this "UTF-16 length")
137 // This is the number of characters, not bytes.
138 let mut first_byte_char_count = axml_buff.read_u8()? as u16;
139
140 // Mark as unused for now, but read it as per spec.
141 let _utf8_char_count = if (first_byte_char_count & 0x80) != 0 {
142 first_byte_char_count &= 0x7F; // Mask out the high bit
143 (first_byte_char_count << 8) | (axml_buff.read_u8()? as u16)
144 } else {
145 first_byte_char_count
146 };
147
148 // Read UTF-8 byte length (spec calls this "UTF-8 length")
149 let mut first_byte_byte_len = axml_buff.read_u8()? as u16;
150 // Renamed from _encoded_size
151 let byte_len: u16 = if (first_byte_byte_len & 0x80) != 0 {
152 first_byte_byte_len &= 0x7F; // Mask out the high bit
153 (first_byte_byte_len << 8) | (axml_buff.read_u8()? as u16)
154 } else {
155 first_byte_byte_len
156 };
157
158 // Use byte_len to read the string data
159 let mut str_buff = Vec::with_capacity(byte_len as usize);
160 let mut chunk = axml_buff.take(byte_len as u64);
161
162 chunk.read_to_end(&mut str_buff)?;
163 decoded_string = String::from_utf8(str_buff)?;
164 axml_buff.read_u8()?; // Consume UTF-8 null terminator (0x00)
165 // TODO: According to AOSP comments for resources.arsc, UTF-8 strings might have 2 or 4 byte terminators.
166 // This needs verification if parsing resources.arsc strings yields errors or incorrect cursor positions.
167 } else { // UTF-16
168 let char_count = axml_buff.read_u16::<LittleEndian>()?; // UTF-16 length in characters
169 let actual_decoded_string = if char_count > 0 {
170 let mut str_chars = Vec::with_capacity(char_count as usize);
171 for _ in 0..char_count {
172 // It's possible for a read error here if char_count is erroneously large.
173 // Consider adding error handling or further validation if issues arise.
174 str_chars.push(axml_buff.read_u16::<LittleEndian>()?);
175 }
176 std::char::decode_utf16(str_chars.into_iter())
177 .collect::<Result<String, _>>()?
178 } else {
179 String::new()
180 };
181 axml_buff.read_u16::<LittleEndian>()?; // Consume UTF-16 null terminator (0x0000)
182 decoded_string = actual_decoded_string;
183 }
184
185 // All strings, including empty ones, are added to the pool.
186 global_strings.push(decoded_string);
187 }
188
189 let mut styles = Vec::new();
190 for offset in styles_offsets.iter() {
191 let current_start = (initial_offset + strings_start + offset) as u64;
192 axml_buff.set_position(current_start);
193
194 let string_pool_ref = StringPoolRef {
195 index: axml_buff.read_u32::<LittleEndian>()?
196 };
197 let first_char = axml_buff.read_u32::<LittleEndian>()?;
198 let last_char = axml_buff.read_u32::<LittleEndian>()?;
199
200 styles.push(StringPoolSpan {
201 name: string_pool_ref,
202 first_char,
203 last_char
204 });
205 }
206
207 Ok(StringPool {
208 header,
209 string_count,
210 style_count,
211 is_sorted,
212 is_utf8,
213 strings_start,
214 styles_start,
215 strings_offsets,
216 styles_offsets,
217 strings: global_strings.to_vec(),
218 styles
219 })
220 }
221}
222
/// Handle designating one string of a string pool.
#[derive(Debug)]
struct StringPoolRef {
    /// Position in the string pool table at which the location of the
    /// string data in the pool can be found. It is expressed as a
    /// `uint32_t`-offset from the indices that immediately follow
    /// `ResStringPool_header`.
    index: u32,
}
231
232/// String pool span
233///
234/// This structure defines a span of style information associated
235/// with a string in the pool.
236#[derive(Debug)]
237struct StringPoolSpan {
238 /// Name of the span
239 ///
240 /// This is the name of the XML tag that defined it.
241 /// There is a special value END (0xFFFFFFFF) that indicates the
242 /// end of an array of spans.
243 name: StringPoolRef,
244
245 /// The first of the characters in the string that this span applies to
246 first_char: u32,
247
248 /// The last of the characters in the string that this span applies to
249 last_char: u32,
250}
251
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{ Cursor, Write };
    use byteorder::{LittleEndian, WriteBytesExt};

    /// Chunk type written at the very start of every test buffer
    /// (`ChunkType::ResStringPoolType`).
    const POOL_CHUNK_TYPE: u16 = 0x0001;

    /// `flags` value with only the "sorted" bit (bit 0) set.
    const FLAG_SORTED: u32 = 1;

    /// `flags` value with only the "UTF-8" bit (bit 8) set.
    const FLAG_UTF8: u32 = 256;

    /// Start a test buffer: the 8-byte chunk header (type, header size,
    /// chunk size) followed by the five `u32` fields of the string pool
    /// header. Offsets tables and string data are left to the caller.
    fn pool_prefix(chunk_size: u32,
                   string_count: u32,
                   style_count: u32,
                   flags: u32,
                   strings_start: u32,
                   styles_start: u32) -> Vec<u8> {
        let mut raw = Vec::new();

        raw.write_u16::<LittleEndian>(POOL_CHUNK_TYPE).unwrap();
        raw.write_u16::<LittleEndian>(8).unwrap();
        raw.write_u32::<LittleEndian>(chunk_size).unwrap();

        for &field in &[string_count, style_count, flags, strings_start, styles_start] {
            raw.write_u32::<LittleEndian>(field).unwrap();
        }

        raw
    }

    /// Append one UTF-16 pool entry: length, code units, `0x0000` terminator.
    fn push_utf16(raw: &mut Vec<u8>, text: &str) {
        let units: Vec<u16> = text.encode_utf16().collect();

        raw.write_u16::<LittleEndian>(units.len() as u16).unwrap();
        for unit in units {
            raw.write_u16::<LittleEndian>(unit).unwrap();
        }
        raw.write_u16::<LittleEndian>(0x0000).unwrap();
    }

    /// Append a UTF-8 length field in its two-byte form (high bit of the
    /// first byte set).
    fn push_two_byte_len(raw: &mut Vec<u8>, len: u16) {
        raw.write_u8(0x80 | ((len >> 8) & 0x7F) as u8).unwrap();
        raw.write_u8((len & 0xFF) as u8).unwrap();
    }

    /// Run the parser on a complete test buffer.
    ///
    /// `from_buff` assumes that the chunk type has been read already, so
    /// it is consumed here first.
    fn parse_pool(raw: Vec<u8>) -> StringPool {
        let mut cursor = Cursor::new(raw);
        cursor.read_u16::<LittleEndian>().unwrap();

        let mut global_strings = Vec::new();
        StringPool::from_buff(&mut cursor, &mut global_strings).unwrap()
    }

    /// Sorted UTF-16 pool holding "Hello" and "World".
    ///
    /// The chunk size (128) is a placeholder, it does not reflect the
    /// actual length of the buffer.
    fn hello_world_pool() -> Vec<u8> {
        // strings_start: 8 (chunk header) + 20 (pool fields) + 2 * 4 (offsets) = 36
        let mut raw = pool_prefix(128, 2, 0, FLAG_SORTED, 36, 20);

        // Offsets table; the first entry takes 2 + 5 * 2 + 2 = 14 bytes
        raw.write_u32::<LittleEndian>(0).unwrap();
        raw.write_u32::<LittleEndian>(14).unwrap();

        push_utf16(&mut raw, "Hello");
        push_utf16(&mut raw, "World");

        raw
    }

    #[test]
    fn test_string_pool_parse_utf16() {
        let string_pool = parse_pool(hello_world_pool());

        // Both strings must come out, in pool order
        assert_eq!(string_pool.strings.len(), 2);
        assert_eq!(string_pool.strings[0], "Hello");
        assert_eq!(string_pool.strings[1], "World");
    }

    #[test]
    fn test_string_pool_flags() {
        let string_pool = parse_pool(hello_world_pool());

        // Only the "sorted" bit is set in the buffer
        assert!(string_pool.is_sorted);
        assert!(!string_pool.is_utf8);
    }

    #[test]
    fn test_empty_pool() {
        // Header only: no strings, no styles, no flags
        let raw = pool_prefix(128, 0, 0, 0, 32, 20);

        let string_pool = parse_pool(raw);

        // Nothing to decode, so the pool holds no strings
        assert_eq!(string_pool.strings.len(), 0);
    }

    #[test]
    fn test_utf8_string_parsing() {
        // One UTF-8 string, located right after the single-entry offsets table
        let mut raw = pool_prefix(128, 1, 0, FLAG_UTF8, 32, 20);
        raw.write_u32::<LittleEndian>(0).unwrap(); // offset of the string

        raw.write_u8(0x05).unwrap();       // character count
        raw.write_u8(0x05).unwrap();       // length in bytes
        raw.write_all(b"Hello").unwrap();  // string data
        raw.write_u8(0x00).unwrap();       // null terminator

        let string_pool = parse_pool(raw);

        // The UTF-8 string must be decoded as is
        assert_eq!(string_pool.strings.len(), 1);
        assert_eq!(string_pool.strings[0], "Hello");
    }

    #[test]
    fn test_long_utf8_string_parsing() {
        // 150 does not fit in 7 bits: both length fields use the two-byte form
        let long_string = "A".repeat(150);
        let string_len = 150u16; // 0x96, character count and byte length alike

        // Chunk size: pool fields (20) + offsets table (4)
        //             + entry (2 + 2 + 150 + 1 = 155) = 179
        // strings_start: chunk header (8) + pool fields (20) + offsets table (4) = 32
        // styles_start: 0, there are no styles
        let mut raw = pool_prefix(179, 1, 0, FLAG_UTF8, 32, 0);

        // The string data immediately follows the offsets table, so the
        // only entry sits at offset 0 from strings_start.
        raw.write_u32::<LittleEndian>(0).unwrap();

        push_two_byte_len(&mut raw, string_len); // character count
        push_two_byte_len(&mut raw, string_len); // length in bytes
        raw.write_all(long_string.as_bytes()).unwrap();
        raw.write_u8(0x00).unwrap(); // null terminator

        let string_pool = parse_pool(raw);

        assert_eq!(string_pool.strings.len(), 1);
        assert_eq!(string_pool.strings[0], long_string);
        assert_eq!(string_pool.strings[0].len(), 150);
        assert_eq!(string_pool.string_count, 1);
        assert!(string_pool.is_utf8);
    }
}