swamp_vm/
string.rs

1/*
2 * Copyright (c) Peter Bjorklund. All rights reserved. https://github.com/swamp/swamp
3 * Licensed under the MIT License. See LICENSE in the project root for license information.
4 */
5use crate::memory::{ExecutionMode, Memory};
6use crate::{TrapCode, Vm, get_reg, i16_from_u8s, set_reg};
7use std::{mem::size_of, ptr};
8use swamp_vm_types::{
9    MAX_STRING_LEN, StringIterator, VEC_HEADER_MAGIC_CODE, VEC_HEADER_PAYLOAD_OFFSET, VecHeader,
10};
11
12impl Vm {
13    pub fn get_string_iterator_header_ptr_from_reg(
14        &self,
15        vec_iterator_reg: u8,
16    ) -> *mut StringIterator {
17        self.get_ptr_from_reg(vec_iterator_reg) as *mut StringIterator
18    }
19
20    #[inline]
21    fn get_string(&self, reg: u8) -> &str {
22        let string_header_addr = get_reg!(self, reg);
23        let header_ptr =
24            self.memory()
25                .get_heap_const_ptr(string_header_addr as usize) as *const VecHeader;
26        let header = unsafe { *header_ptr };
27        let byte_count = header.element_count;
28
29        #[cfg(feature = "debug_vm")]
30        if self.debug_operations_enabled {
31            eprintln!(
32                "Memory layout: constants: 0x0-0x{:X}, stack: 0x{:X}-0x{:X}, heap: 0x{:X}-0x{:X}",
33                self.memory().constant_memory_size,
34                self.memory().stack_start,
35                self.memory().stack_offset,
36                self.memory().heap_start,
37                self.memory().heap_alloc_offset
38            );
39        }
40
41        if byte_count != 0 {
42            debug_assert_eq!(
43                header.padding, VEC_HEADER_MAGIC_CODE,
44                "string is corrupt. it is saying it has length {byte_count}, left: {}, right: {VEC_HEADER_MAGIC_CODE}",
45                header.padding
46            );
47            debug_assert!(
48                header.element_count < MAX_STRING_LEN,
49                "string of strange length. it is saying it has length {byte_count}, left: {}, right: {VEC_HEADER_MAGIC_CODE}",
50                header.padding
51            );
52        }
53
54        let runes_ptr = self.memory().get_heap_const_ptr(
55            (string_header_addr as usize) + VEC_HEADER_PAYLOAD_OFFSET.0 as usize,
56        );
57
58        unsafe {
59            let bytes = std::slice::from_raw_parts(runes_ptr, byte_count as usize);
60
61            if byte_count > 0 {
62                let s = std::str::from_utf8(bytes).unwrap_or("INVALID_UTF8");
63                #[cfg(feature = "debug_vm")]
64                if self.debug_operations_enabled {
65                    eprintln!("String content: \"{s}\"");
66                }
67            }
68
69            std::str::from_utf8_unchecked(bytes)
70        }
71    }
72    #[inline]
73    pub fn execute_string_append(&mut self, target_string_reg: u8, string_a: u8, string_b: u8) {
74        #[cfg(feature = "debug_vm")]
75        if self.debug_operations_enabled {
76            eprintln!("=== STRING_APPEND OPERATION ===");
77            eprintln!(
78                "Memory layout: constants: 0x0-0x{:X}, stack: 0x{:X}-0x{:X}, heap: 0x{:X}-0x{:X}",
79                self.memory().constant_memory_size,
80                self.memory().stack_start,
81                self.memory().stack_offset,
82                self.memory().heap_start,
83                self.memory().heap_alloc_offset
84            );
85
86            // Debug: Print register values using the get_reg macro
87            let reg_a_value = get_reg!(self, string_a);
88            let reg_b_value = get_reg!(self, string_b);
89
90            eprintln!("String A register {string_a}: 0x{reg_a_value:X}");
91            eprintln!("String B register {string_b}: 0x{reg_b_value:X}");
92            eprintln!("Target register {target_string_reg}");
93        }
94
95        let str_a = self.get_string(string_a);
96        let str_b = self.get_string(string_b);
97
98        let result = str_a.to_string() + str_b;
99
100        #[cfg(feature = "debug_vm")]
101        if self.debug_operations_enabled {
102            eprintln!(
103                "Concatenated string: \"{}\" (length: {})",
104                result,
105                result.len()
106            );
107        }
108
109        self.create_string(target_string_reg, &result);
110        // Debug: Print final register value
111        let final_reg_value = get_reg!(self, target_string_reg);
112        #[cfg(feature = "debug_vm")]
113        if self.debug_operations_enabled {
114            eprintln!("Final target register value: 0x{final_reg_value:X}");
115        }
116    }
117
118    #[inline]
119    pub fn execute_string_cmp(&mut self, dest_reg: u8, string_a: u8, string_b: u8) {
120        #[cfg(feature = "debug_vm")]
121        if self.debug_operations_enabled {
122            eprintln!("=== STRING_COMPARE OPERATION ===");
123            eprintln!(
124                "Memory layout: constants: 0x0-0x{:X}, stack: 0x{:X}-0x{:X}, heap: 0x{:X}-0x{:X}",
125                self.memory().constant_memory_size,
126                self.memory().stack_start,
127                self.memory().stack_offset,
128                self.memory().heap_start,
129                self.memory().heap_alloc_offset
130            );
131        }
132
133        // Debug: Print register values
134        let reg_a_value = get_reg!(self, string_a);
135        let reg_b_value = get_reg!(self, string_b);
136
137        #[cfg(feature = "debug_vm")]
138        if self.debug_operations_enabled {
139            eprintln!("String A register {string_a}: 0x{reg_a_value:X}");
140            eprintln!("String B register {string_b}: 0x{reg_b_value:X}");
141        }
142
143        let str_a = self.get_string(string_a);
144        let str_b = self.get_string(string_b);
145
146        let result = str_a == str_b;
147
148        #[cfg(feature = "debug_vm")]
149        if self.debug_operations_enabled {
150            eprintln!("String comparison result: {result}");
151        }
152
153        // Store the result
154        set_reg!(self, dest_reg, result as u32);
155    }
156
157    /// Return the same string but with quotes.
158    #[inline]
159    pub fn execute_string_to_string(&mut self, dest_reg: u8, source_string: u8) {
160        #[cfg(feature = "debug_vm")]
161        if self.debug_operations_enabled {
162            eprintln!("=== STRING_TO_STRING OPERATION ===");
163            eprintln!(
164                "Memory layout: constants: 0x0-0x{:X}, stack: 0x{:X}-0x{:X}, heap: 0x{:X}-0x{:X}",
165                self.memory().constant_memory_size,
166                self.memory().stack_start,
167                self.memory().stack_offset,
168                self.memory().heap_start,
169                self.memory().heap_alloc_offset
170            );
171
172            let source_reg_value = get_reg!(self, source_string);
173            eprintln!("Source string register {source_string}: 0x{source_reg_value:X}");
174        }
175
176        let source_str = self.get_string(source_string);
177
178        // Create the formatted string with quotes
179        let mut formatted_string = String::with_capacity(source_str.len() + 2);
180        formatted_string.push('"');
181        formatted_string.push_str(source_str);
182        formatted_string.push('"');
183
184        #[cfg(feature = "debug_vm")]
185        if self.debug_operations_enabled {
186            eprintln!(
187                "Formatted string: \"{}\" (length: {})",
188                formatted_string,
189                formatted_string.len()
190            );
191        }
192
193        self.create_string(dest_reg, &formatted_string);
194
195        let final_reg_value = get_reg!(self, dest_reg);
196
197        #[cfg(feature = "debug_vm")]
198        if self.debug_operations_enabled {
199            eprintln!("Final destination register value: 0x{final_reg_value:X}");
200        }
201    }
202
203    pub fn read_string(&self, heap_addr: u32, heap: &Memory) -> &str {
204        let string_header_ptr = heap.get_heap_const_ptr(heap_addr as usize) as *const VecHeader;
205        let string_header = unsafe { *string_header_ptr };
206
207        #[cfg(feature = "debug_vm")]
208        if self.debug_operations_enabled {
209            eprintln!(
210                "read_string: addr=0x{heap_addr:X}, capacity={}, byte_count={}, padding=0x{:X}",
211                string_header.capacity, string_header.element_count, string_header.padding
212            );
213        }
214
215        let byte_count = string_header.element_count as usize;
216
217        #[cfg(feature = "debug_vm")]
218        if string_header.element_count != 0 {
219            debug_assert_eq!(
220                string_header.padding, VEC_HEADER_MAGIC_CODE,
221                "CORRUPTION DETECTED in read_string: String header at 0x{heap_addr:X} has invalid padding 0x{:X}, should be 0x{VEC_HEADER_MAGIC_CODE:X}",
222                string_header.padding
223            );
224            debug_assert_eq!(
225                string_header.capacity, string_header.element_count,
226                "Corruption. strings should never change"
227            );
228            // TODO: just a hack for now to see if it is plausible.
229            debug_assert!(
230                byte_count < 512,
231                "Strange. string byte_count {byte_count} is unreasonably large"
232            );
233        }
234
235        // String data follows directly after the header
236        let string_data_ptr = unsafe {
237            heap.get_heap_const_ptr(heap_addr as usize + VEC_HEADER_PAYLOAD_OFFSET.0 as usize)
238        };
239
240        unsafe {
241            let bytes = std::slice::from_raw_parts(string_data_ptr, byte_count);
242            match std::str::from_utf8(bytes) {
243                Ok(s) => s,
244                Err(e) => {
245                    panic!("ERROR: Invalid UTF-8 string data at 0x{heap_addr:X}: {e}");
246                    ""
247                }
248            }
249        }
250    }
251
252    /// Strings are immutable, can not be altered after they have been created.
253    /// They can be safely shared and the pointer can be blittable when inside composite types.
254    /// The string data is stored directly after the header in memory.
255    pub(crate) fn create_string(&mut self, dst_reg: u8, string: &str) {
256        let rune_bytes = string.as_bytes();
257        let byte_count = rune_bytes.len();
258        let cap_bytes = if byte_count == 0 { 1 } else { byte_count };
259
260        debug_assert!(
261            byte_count <= MAX_STRING_LEN as usize,
262            "String too large: {byte_count} bytes"
263        );
264
265        // Calculate total size needed: header + string data
266        // We assume that StringHeader is aligned to u32
267        let total_size = size_of::<VecHeader>() + byte_count;
268
269        let header_addr_in_heap = self.memory.heap_allocate_secret(total_size);
270
271        #[cfg(feature = "debug_vm")]
272        match self.memory.execution_mode {
273            ExecutionMode::ConstantEvaluation => {
274                // In constant evaluation, strings should be in heap which is directly after constant area
275                debug_assert!(
276                    header_addr_in_heap >= self.memory.heap_start as u32
277                        && header_addr_in_heap < self.memory.heap_alloc_offset as u32,
278                    "String allocation at 0x{header_addr_in_heap:X} is not in heap during constant evaluation",
279                );
280            }
281            ExecutionMode::NormalExecution => {
282                // In normal execution, strings should be in heap which is after stack
283                debug_assert!(
284                    header_addr_in_heap >= self.memory.heap_start as u32
285                        && header_addr_in_heap < self.memory.heap_alloc_offset as u32,
286                    "String allocation at 0x{header_addr_in_heap:X} is not in heap during normal execution",
287                );
288            }
289        }
290
291        let string_header = VecHeader {
292            capacity: cap_bytes as u16,
293            element_count: byte_count as u16,
294            element_size: 1,
295            padding: VEC_HEADER_MAGIC_CODE,
296        };
297
298        unsafe {
299            let header_ptr =
300                self.memory.get_heap_ptr(header_addr_in_heap as usize) as *mut VecHeader;
301            ptr::write(header_ptr, string_header);
302
303            let string_data_ptr = self
304                .memory
305                .get_heap_ptr(header_addr_in_heap as usize + VEC_HEADER_PAYLOAD_OFFSET.0 as usize);
306            ptr::copy_nonoverlapping(rune_bytes.as_ptr(), string_data_ptr, byte_count);
307        }
308
309        #[cfg(feature = "debug_vm")]
310        if self.debug_operations_enabled {
311            eprintln!(
312                "Creating string: '{string}', header at 0x{header_addr_in_heap:X}, capacity={byte_count}, byte_count={byte_count}, padding=0x{VEC_HEADER_MAGIC_CODE:X}"
313            );
314        }
315
316        set_reg!(self, dst_reg, header_addr_in_heap);
317    }
318
319    #[inline]
320    pub fn execute_string_iter_init(
321        &mut self,
322        target_string_iterator_header_reg: u8,
323        string_header_reg: u8,
324    ) {
325        let string_header_addr = get_reg!(self, string_header_reg);
326
327        // Check that vec header is correct
328        let string_header_ptr = self
329            .memory
330            .get_heap_const_ptr(string_header_addr as usize)
331            .cast::<VecHeader>();
332        let string_header = unsafe { &*string_header_ptr };
333
334        if string_header.padding != VEC_HEADER_MAGIC_CODE {
335            return self.internal_trap(TrapCode::MemoryCorruption);
336        }
337        if string_header.capacity == 0 {
338            return self.internal_trap(TrapCode::VecNeverInitialized);
339        }
340
341        #[cfg(feature = "debug_vm")]
342        if self.debug_operations_enabled {
343            let iter_addr = get_reg!(self, target_string_iterator_header_reg);
344            eprintln!(
345                "string_iter_init: iter_addr: {iter_addr:04X} string_header_addr:{string_header_addr:04X} element_size: {}",
346                string_header.element_size
347            );
348        }
349        let string_iterator = StringIterator {
350            string_heap_ptr: string_header_addr,
351            byte_index: 0,
352            index: 0,
353        };
354
355        let string_iterator_mut_ptr =
356            self.get_ptr_from_reg(target_string_iterator_header_reg) as *mut StringIterator;
357
358        unsafe {
359            ptr::write(string_iterator_mut_ptr, string_iterator);
360        }
361    }
362
363    #[inline]
364    pub fn execute_string_iter_next(
365        &mut self,
366        string_iterator_header_reg: u8,
367        target_variable: u8,
368        branch_offset_lower: u8,
369        branch_offset_upper: u8,
370    ) {
371        let string_iterator =
372            self.get_string_iterator_header_ptr_from_reg(string_iterator_header_reg);
373
374        unsafe {
375            let string_header_addr = (*string_iterator).string_heap_ptr;
376            let string_header_ptr = self
377                .memory
378                .get_heap_const_ptr(string_header_addr as usize)
379                .cast::<VecHeader>();
380
381            let string_header_raw_ptr = self.memory.get_heap_const_ptr(string_header_addr as usize);
382
383            let string_header = &*string_header_ptr;
384            if string_header.padding != VEC_HEADER_MAGIC_CODE {
385                return self.internal_trap(TrapCode::MemoryCorruption);
386            }
387
388            #[cfg(feature = "debug_vm")]
389            if self.debug_operations_enabled {
390                let iter_addr = get_reg!(self, string_iterator_header_reg);
391                let index = (*string_iterator).byte_index;
392                eprintln!(
393                    "string_iter_next: iter_addr: {iter_addr:04X} addr:{string_header_addr:04X} index:{index} len: {}, capacity: {}",
394                    string_header.element_count, string_header.capacity
395                );
396            }
397
398            // Check if we've reached the end
399            if (*string_iterator).byte_index >= string_header.element_count {
400                // Jump to the provided address if we're done
401                let branch_offset = i16_from_u8s!(branch_offset_lower, branch_offset_upper);
402
403                #[cfg(feature = "debug_vm")]
404                {
405                    if self.debug_operations_enabled {
406                        eprintln!("string_iter_next complete. jumping with offset {branch_offset}");
407                    }
408                }
409
410                self.pc = (self.pc as i32 + branch_offset as i32) as usize;
411
412                return;
413            }
414
415            let current_byte_index = (*string_iterator).byte_index as usize;
416            let remaining_byte_count = (string_header.element_count as usize) - current_byte_index;
417            let payload_ptr = string_header_raw_ptr.add(VEC_HEADER_PAYLOAD_OFFSET.0 as usize);
418
419            let remaining_bytes = std::slice::from_raw_parts(
420                payload_ptr.add(current_byte_index),
421                remaining_byte_count,
422            );
423
424            match std::str::from_utf8(remaining_bytes) {
425                Ok(valid_str) => {
426                    if let Some(c) = valid_str.chars().next() {
427                        // Place the decoded character (a Char - u32) into the target register
428                        // Advance the iterator by the actual byte width of the character
429                        let advancement = c.len_utf8() as u16;
430                        (*string_iterator).byte_index += advancement;
431
432                        let raw_u32 = c as u32;
433                        set_reg!(self, target_variable, raw_u32);
434                    } else {
435                        self.internal_trap(TrapCode::InvalidUtf8Sequence);
436                    }
437                }
438                Err(_) => {
439                    // The string data in the VM memory is corrupted/invalid
440                    self.internal_trap(TrapCode::InvalidUtf8Sequence);
441                }
442            }
443        }
444    }
445
446    #[inline]
447    pub fn execute_string_iter_next_pair(
448        &mut self,
449        string_iterator_header_reg: u8,
450        target_key_reg: u8,
451        target_value_reg: u8,
452        branch_offset_lower: u8,
453        branch_offset_upper: u8,
454    ) {
455        let string_iterator =
456            self.get_string_iterator_header_ptr_from_reg(string_iterator_header_reg);
457
458        unsafe {
459            let string_header_addr = (*string_iterator).string_heap_ptr;
460            let string_header_ptr = self
461                .memory
462                .get_heap_const_ptr(string_header_addr as usize)
463                .cast::<VecHeader>();
464
465            let string_header_raw_ptr = self.memory.get_heap_const_ptr(string_header_addr as usize);
466
467            let string_header = &*string_header_ptr;
468            if string_header.padding != VEC_HEADER_MAGIC_CODE {
469                return self.internal_trap(TrapCode::MemoryCorruption);
470            }
471
472            #[cfg(feature = "debug_vm")]
473            if self.debug_operations_enabled {
474                let iter_addr = get_reg!(self, string_iterator_header_reg);
475                let index = (*string_iterator).byte_index;
476                eprintln!(
477                    "string_iter_next: iter_addr: {iter_addr:04X} addr:{string_header_addr:04X} index:{index} len: {}, capacity: {}",
478                    string_header.element_count, string_header.capacity
479                );
480            }
481
482            // Check if we've reached the end
483            if (*string_iterator).byte_index >= string_header.element_count {
484                // Jump to the provided address if we're done
485                let branch_offset = i16_from_u8s!(branch_offset_lower, branch_offset_upper);
486
487                #[cfg(feature = "debug_vm")]
488                {
489                    if self.debug_operations_enabled {
490                        eprintln!("string_iter_next complete. jumping with offset {branch_offset}");
491                    }
492                }
493
494                self.pc = (self.pc as i32 + branch_offset as i32) as usize;
495
496                return;
497            }
498
499            let current_byte_index = (*string_iterator).byte_index as usize;
500            let remaining_byte_count = (string_header.element_count as usize) - current_byte_index;
501            let payload_ptr = string_header_raw_ptr.add(VEC_HEADER_PAYLOAD_OFFSET.0 as usize);
502
503            let remaining_bytes = std::slice::from_raw_parts(
504                payload_ptr.add(current_byte_index),
505                remaining_byte_count,
506            );
507
508            match std::str::from_utf8(remaining_bytes) {
509                Ok(valid_str) => {
510                    if let Some(c) = valid_str.chars().next() {
511                        // Place the decoded character (a Char - u32) into the target register
512                        // Advance the iterator by the actual byte width of the character
513                        let advancement = c.len_utf8() as u16;
514                        (*string_iterator).byte_index += advancement;
515
516                        let raw_u32 = c as u32;
517                        eprintln!(
518                            "raw: {raw_u32} advancement {advancement} -> r{target_value_reg}"
519                        );
520                        set_reg!(self, target_key_reg, (*string_iterator).index);
521                        set_reg!(self, target_value_reg, raw_u32);
522
523                        (*string_iterator).index += 1;
524                    } else {
525                        self.internal_trap(TrapCode::InvalidUtf8Sequence);
526                    }
527                }
528                Err(_) => {
529                    // The string data in the VM memory is corrupted/invalid
530                    self.internal_trap(TrapCode::InvalidUtf8Sequence);
531                }
532            }
533        }
534    }
535}