swamp_vm/
string.rs

1/*
2 * Copyright (c) Peter Bjorklund. All rights reserved. https://github.com/swamp/swamp
3 * Licensed under the MIT License. See LICENSE in the project root for license information.
4 */
5use crate::memory::{ExecutionMode, Memory};
6use crate::{get_reg, i16_from_u8s, set_reg, TrapCode, Vm};
7use std::{mem::size_of, ptr};
8use swamp_vm_types::{
9    StringIterator, VecHeader, MAX_STRING_LEN, VEC_HEADER_MAGIC_CODE, VEC_HEADER_PAYLOAD_OFFSET,
10};
11
12impl Vm {
13    pub fn get_string_iterator_header_ptr_from_reg(
14        &self,
15        vec_iterator_reg: u8,
16    ) -> *mut StringIterator {
17        self.get_ptr_from_reg(vec_iterator_reg) as *mut StringIterator
18    }
19
20    #[inline]
21    fn get_string(&self, reg: u8) -> &str {
22        let string_header_addr = get_reg!(self, reg);
23        let header_ptr =
24            self.memory()
25                .get_heap_const_ptr(string_header_addr as usize) as *const VecHeader;
26        let header = unsafe { *header_ptr };
27        let byte_count = header.element_count;
28
29        #[cfg(feature = "debug_vm")]
30        if self.debug_operations_enabled {
31            eprintln!(
32                "Memory layout: constants: 0x0-0x{:X}, stack: 0x{:X}-0x{:X}, heap: 0x{:X}-0x{:X}",
33                self.memory().constant_memory_size,
34                self.memory().stack_start,
35                self.memory().stack_offset,
36                self.memory().heap_start,
37                self.memory().heap_alloc_offset
38            );
39        }
40
41        if byte_count != 0 {
42            debug_assert_eq!(
43                header.padding, VEC_HEADER_MAGIC_CODE,
44                "string is corrupt. it is saying it has length {byte_count}, left: {}, right: {VEC_HEADER_MAGIC_CODE}",
45                header.padding
46            );
47            debug_assert!(
48                header.element_count < MAX_STRING_LEN,
49                "string of strange length. it is saying it has length {byte_count}, left: {}, right: {VEC_HEADER_MAGIC_CODE}",
50                header.padding
51            );
52        }
53
54        let runes_ptr = self.memory().get_heap_const_ptr(
55            (string_header_addr as usize) + VEC_HEADER_PAYLOAD_OFFSET.0 as usize,
56        );
57
58        unsafe {
59            let bytes = std::slice::from_raw_parts(runes_ptr, byte_count as usize);
60
61            if byte_count > 0 {
62                let s = std::str::from_utf8(bytes).unwrap_or("INVALID_UTF8");
63                #[cfg(feature = "debug_vm")]
64                if self.debug_operations_enabled {
65                    eprintln!("String content: \"{s}\"");
66                }
67            }
68
69            std::str::from_utf8_unchecked(bytes)
70        }
71    }
72    #[inline]
73    pub fn execute_string_append(&mut self, target_string_reg: u8, string_a: u8, string_b: u8) {
74        #[cfg(feature = "debug_vm")]
75        if self.debug_operations_enabled {
76            eprintln!("=== STRING_APPEND OPERATION ===");
77            eprintln!(
78                "Memory layout: constants: 0x0-0x{:X}, stack: 0x{:X}-0x{:X}, heap: 0x{:X}-0x{:X}",
79                self.memory().constant_memory_size,
80                self.memory().stack_start,
81                self.memory().stack_offset,
82                self.memory().heap_start,
83                self.memory().heap_alloc_offset
84            );
85
86            // Debug: Print register values using the get_reg macro
87            let reg_a_value = get_reg!(self, string_a);
88            let reg_b_value = get_reg!(self, string_b);
89
90            eprintln!("String A register {string_a}: 0x{reg_a_value:X}");
91            eprintln!("String B register {string_b}: 0x{reg_b_value:X}");
92            eprintln!("Target register {target_string_reg}");
93        }
94
95        let str_a = self.get_string(string_a);
96        let str_b = self.get_string(string_b);
97
98        let result = str_a.to_string() + str_b;
99
100        #[cfg(feature = "debug_vm")]
101        if self.debug_operations_enabled {
102            eprintln!(
103                "Concatenated string: \"{}\" (length: {})",
104                result,
105                result.len()
106            );
107        }
108
109        self.create_string(target_string_reg, &result);
110        // Debug: Print final register value
111        let final_reg_value = get_reg!(self, target_string_reg);
112        #[cfg(feature = "debug_vm")]
113        if self.debug_operations_enabled {
114            eprintln!("Final target register value: 0x{final_reg_value:X}");
115        }
116    }
117
118    #[inline]
119    pub fn execute_string_repeat(&mut self, target_string_reg: u8, string_a: u8, repeat_reg: u8) {
120        #[cfg(feature = "debug_vm")]
121        if self.debug_operations_enabled {
122            eprintln!("=== STRING_REPEAT OPERATION ===");
123            eprintln!(
124                "Memory layout: constants: 0x0-0x{:X}, stack: 0x{:X}-0x{:X}, heap: 0x{:X}-0x{:X}",
125                self.memory().constant_memory_size,
126                self.memory().stack_start,
127                self.memory().stack_offset,
128                self.memory().heap_start,
129                self.memory().heap_alloc_offset
130            );
131
132            // Debug: Print register values
133            let reg_a_value = get_reg!(self, string_a);
134            let repeat_value = get_reg!(self, repeat_reg);
135            eprintln!("String A register {string_a}: 0x{reg_a_value:X}");
136            eprintln!("Repeat count register {repeat_reg}: {}", repeat_value);
137            eprintln!("Target register {target_string_reg}");
138        }
139
140        // Load the input string
141        let str_a = self.get_string(string_a);
142
143        let count = get_reg!(self, repeat_reg) as usize;
144
145        // Perform the repeat
146        let result = str_a.repeat(count);
147
148        #[cfg(feature = "debug_vm")]
149        if self.debug_operations_enabled {
150            eprintln!(
151                "Repeated string: \"{}\" (length: {}, repeated {} times)",
152                result,
153                result.len(),
154                count
155            );
156        }
157
158        // Store the result back into the target register
159        self.create_string(target_string_reg, &result);
160
161        #[cfg(feature = "debug_vm")]
162        if self.debug_operations_enabled {
163            let final_reg_value = get_reg!(self, target_string_reg);
164            eprintln!("Final target register value: 0x{final_reg_value:X}");
165        }
166    }
167
168
169    #[inline]
170    pub fn execute_string_cmp(&mut self, dest_reg: u8, string_a: u8, string_b: u8) {
171        #[cfg(feature = "debug_vm")]
172        if self.debug_operations_enabled {
173            eprintln!("=== STRING_COMPARE OPERATION ===");
174            eprintln!(
175                "Memory layout: constants: 0x0-0x{:X}, stack: 0x{:X}-0x{:X}, heap: 0x{:X}-0x{:X}",
176                self.memory().constant_memory_size,
177                self.memory().stack_start,
178                self.memory().stack_offset,
179                self.memory().heap_start,
180                self.memory().heap_alloc_offset
181            );
182        }
183
184        // Debug: Print register values
185        let reg_a_value = get_reg!(self, string_a);
186        let reg_b_value = get_reg!(self, string_b);
187
188        #[cfg(feature = "debug_vm")]
189        if self.debug_operations_enabled {
190            eprintln!("String A register {string_a}: 0x{reg_a_value:X}");
191            eprintln!("String B register {string_b}: 0x{reg_b_value:X}");
192        }
193
194        let str_a = self.get_string(string_a);
195        let str_b = self.get_string(string_b);
196
197        let result = str_a == str_b;
198
199        #[cfg(feature = "debug_vm")]
200        if self.debug_operations_enabled {
201            eprintln!("String comparison result: {result}");
202        }
203
204        // Store the result
205        set_reg!(self, dest_reg, result as u32);
206    }
207
208    /// Return the same string but with quotes.
209    #[inline]
210    pub fn execute_string_to_string(&mut self, dest_reg: u8, source_string: u8) {
211        #[cfg(feature = "debug_vm")]
212        if self.debug_operations_enabled {
213            eprintln!("=== STRING_TO_STRING OPERATION ===");
214            eprintln!(
215                "Memory layout: constants: 0x0-0x{:X}, stack: 0x{:X}-0x{:X}, heap: 0x{:X}-0x{:X}",
216                self.memory().constant_memory_size,
217                self.memory().stack_start,
218                self.memory().stack_offset,
219                self.memory().heap_start,
220                self.memory().heap_alloc_offset
221            );
222
223            let source_reg_value = get_reg!(self, source_string);
224            eprintln!("Source string register {source_string}: 0x{source_reg_value:X}");
225        }
226
227        let source_str = self.get_string(source_string);
228
229        // Create the formatted string with quotes
230        let mut formatted_string = String::with_capacity(source_str.len() + 2);
231        formatted_string.push('"');
232        formatted_string.push_str(source_str);
233        formatted_string.push('"');
234
235        #[cfg(feature = "debug_vm")]
236        if self.debug_operations_enabled {
237            eprintln!(
238                "Formatted string: \"{}\" (length: {})",
239                formatted_string,
240                formatted_string.len()
241            );
242        }
243
244        self.create_string(dest_reg, &formatted_string);
245
246        let final_reg_value = get_reg!(self, dest_reg);
247
248        #[cfg(feature = "debug_vm")]
249        if self.debug_operations_enabled {
250            eprintln!("Final destination register value: 0x{final_reg_value:X}");
251        }
252    }
253
254    pub fn read_string(&self, heap_addr: u32, heap: &Memory) -> &str {
255        let string_header_ptr = heap.get_heap_const_ptr(heap_addr as usize) as *const VecHeader;
256        let string_header = unsafe { *string_header_ptr };
257
258        #[cfg(feature = "debug_vm")]
259        if self.debug_operations_enabled {
260            eprintln!(
261                "read_string: addr=0x{heap_addr:X}, capacity={}, byte_count={}, padding=0x{:X}",
262                string_header.capacity, string_header.element_count, string_header.padding
263            );
264        }
265
266        let byte_count = string_header.element_count as usize;
267
268        #[cfg(feature = "debug_vm")]
269        if string_header.element_count != 0 {
270            debug_assert_eq!(
271                string_header.padding, VEC_HEADER_MAGIC_CODE,
272                "CORRUPTION DETECTED in read_string: String header at 0x{heap_addr:X} has invalid padding 0x{:X}, should be 0x{VEC_HEADER_MAGIC_CODE:X}",
273                string_header.padding
274            );
275            debug_assert_eq!(
276                string_header.capacity, string_header.element_count,
277                "Corruption. strings should never change"
278            );
279            // TODO: just a hack for now to see if it is plausible.
280            debug_assert!(
281                byte_count < 1024,
282                "Strange. string byte_count {byte_count} is unreasonably large"
283            );
284        }
285
286        // String data follows directly after the header
287        let string_data_ptr = unsafe {
288            heap.get_heap_const_ptr(heap_addr as usize + VEC_HEADER_PAYLOAD_OFFSET.0 as usize)
289        };
290
291        unsafe {
292            let bytes = std::slice::from_raw_parts(string_data_ptr, byte_count);
293            match std::str::from_utf8(bytes) {
294                Ok(s) => s,
295                Err(e) => {
296                    panic!("ERROR: Invalid UTF-8 string data at 0x{heap_addr:X}: {e}");
297                    ""
298                }
299            }
300        }
301    }
302
303    /// Strings are immutable, can not be altered after they have been created.
304    /// They can be safely shared and the pointer can be blittable when inside composite types.
305    /// The string data is stored directly after the header in memory.
306    pub(crate) fn create_string(&mut self, dst_reg: u8, string: &str) {
307        let rune_bytes = string.as_bytes();
308        let byte_count = rune_bytes.len();
309        let cap_bytes = if byte_count == 0 { 1 } else { byte_count };
310
311        debug_assert!(
312            byte_count <= MAX_STRING_LEN as usize,
313            "String too large: {byte_count} bytes"
314        );
315
316        // Calculate total size needed: header + string data
317        // We assume that StringHeader is aligned to u32
318        let total_size = size_of::<VecHeader>() + byte_count;
319
320        let header_addr_in_heap = self.memory.heap_allocate_secret(total_size);
321
322        #[cfg(feature = "debug_vm")]
323        match self.memory.execution_mode {
324            ExecutionMode::ConstantEvaluation => {
325                // In constant evaluation, strings should be in heap which is directly after constant area
326                debug_assert!(
327                    header_addr_in_heap >= self.memory.heap_start as u32
328                        && header_addr_in_heap < self.memory.heap_alloc_offset as u32,
329                    "String allocation at 0x{header_addr_in_heap:X} is not in heap during constant evaluation",
330                );
331            }
332            ExecutionMode::NormalExecution => {
333                // In normal execution, strings should be in heap which is after stack
334                debug_assert!(
335                    header_addr_in_heap >= self.memory.heap_start as u32
336                        && header_addr_in_heap < self.memory.heap_alloc_offset as u32,
337                    "String allocation at 0x{header_addr_in_heap:X} is not in heap during normal execution",
338                );
339            }
340        }
341
342        let string_header = VecHeader {
343            capacity: cap_bytes as u16,
344            element_count: byte_count as u16,
345            element_size: 1,
346            padding: VEC_HEADER_MAGIC_CODE,
347        };
348
349        unsafe {
350            let header_ptr =
351                self.memory.get_heap_ptr(header_addr_in_heap as usize) as *mut VecHeader;
352            ptr::write(header_ptr, string_header);
353
354            let string_data_ptr = self
355                .memory
356                .get_heap_ptr(header_addr_in_heap as usize + VEC_HEADER_PAYLOAD_OFFSET.0 as usize);
357            ptr::copy_nonoverlapping(rune_bytes.as_ptr(), string_data_ptr, byte_count);
358        }
359
360        #[cfg(feature = "debug_vm")]
361        if self.debug_operations_enabled {
362            eprintln!(
363                "Creating string: '{string}', header at 0x{header_addr_in_heap:X}, capacity={byte_count}, byte_count={byte_count}, padding=0x{VEC_HEADER_MAGIC_CODE:X}"
364            );
365        }
366
367        set_reg!(self, dst_reg, header_addr_in_heap);
368    }
369
370    #[inline]
371    pub fn execute_string_iter_init(
372        &mut self,
373        target_string_iterator_header_reg: u8,
374        string_header_reg: u8,
375    ) {
376        let string_header_addr = get_reg!(self, string_header_reg);
377
378        // Check that vec header is correct
379        let string_header_ptr = self
380            .memory
381            .get_heap_const_ptr(string_header_addr as usize)
382            .cast::<VecHeader>();
383        let string_header = unsafe { &*string_header_ptr };
384
385        if string_header.padding != VEC_HEADER_MAGIC_CODE {
386            return self.internal_trap(TrapCode::MemoryCorruption);
387        }
388        if string_header.capacity == 0 {
389            return self.internal_trap(TrapCode::VecNeverInitialized);
390        }
391
392        #[cfg(feature = "debug_vm")]
393        if self.debug_operations_enabled {
394            let iter_addr = get_reg!(self, target_string_iterator_header_reg);
395            eprintln!(
396                "string_iter_init: iter_addr: {iter_addr:04X} string_header_addr:{string_header_addr:04X} element_size: {}",
397                string_header.element_size
398            );
399        }
400        let string_iterator = StringIterator {
401            string_heap_ptr: string_header_addr,
402            byte_index: 0,
403            index: 0,
404        };
405
406        let string_iterator_mut_ptr =
407            self.get_ptr_from_reg(target_string_iterator_header_reg) as *mut StringIterator;
408
409        unsafe {
410            ptr::write(string_iterator_mut_ptr, string_iterator);
411        }
412    }
413
414    #[inline]
415    pub fn execute_string_iter_next(
416        &mut self,
417        string_iterator_header_reg: u8,
418        target_variable: u8,
419        branch_offset_lower: u8,
420        branch_offset_upper: u8,
421    ) {
422        let string_iterator =
423            self.get_string_iterator_header_ptr_from_reg(string_iterator_header_reg);
424
425        unsafe {
426            let string_header_addr = (*string_iterator).string_heap_ptr;
427            let string_header_ptr = self
428                .memory
429                .get_heap_const_ptr(string_header_addr as usize)
430                .cast::<VecHeader>();
431
432            let string_header_raw_ptr = self.memory.get_heap_const_ptr(string_header_addr as usize);
433
434            let string_header = &*string_header_ptr;
435            if string_header.padding != VEC_HEADER_MAGIC_CODE {
436                return self.internal_trap(TrapCode::MemoryCorruption);
437            }
438
439            #[cfg(feature = "debug_vm")]
440            if self.debug_operations_enabled {
441                let iter_addr = get_reg!(self, string_iterator_header_reg);
442                let index = (*string_iterator).byte_index;
443                eprintln!(
444                    "string_iter_next: iter_addr: {iter_addr:04X} addr:{string_header_addr:04X} index:{index} len: {}, capacity: {}",
445                    string_header.element_count, string_header.capacity
446                );
447            }
448
449            // Check if we've reached the end
450            if (*string_iterator).byte_index >= string_header.element_count {
451                // Jump to the provided address if we're done
452                let branch_offset = i16_from_u8s!(branch_offset_lower, branch_offset_upper);
453
454                #[cfg(feature = "debug_vm")]
455                {
456                    if self.debug_operations_enabled {
457                        eprintln!("string_iter_next complete. jumping with offset {branch_offset}");
458                    }
459                }
460
461                self.pc = (self.pc as i32 + branch_offset as i32) as usize;
462
463                return;
464            }
465
466            let current_byte_index = (*string_iterator).byte_index as usize;
467            let remaining_byte_count = (string_header.element_count as usize) - current_byte_index;
468            let payload_ptr = string_header_raw_ptr.add(VEC_HEADER_PAYLOAD_OFFSET.0 as usize);
469
470            let remaining_bytes = std::slice::from_raw_parts(
471                payload_ptr.add(current_byte_index),
472                remaining_byte_count,
473            );
474
475            match std::str::from_utf8(remaining_bytes) {
476                Ok(valid_str) => {
477                    if let Some(c) = valid_str.chars().next() {
478                        // Place the decoded character (a Char - u32) into the target register
479                        // Advance the iterator by the actual byte width of the character
480                        let advancement = c.len_utf8() as u16;
481                        (*string_iterator).byte_index += advancement;
482
483                        let raw_u32 = c as u32;
484                        set_reg!(self, target_variable, raw_u32);
485                    } else {
486                        self.internal_trap(TrapCode::InvalidUtf8Sequence);
487                    }
488                }
489                Err(_) => {
490                    // The string data in the VM memory is corrupted/invalid
491                    self.internal_trap(TrapCode::InvalidUtf8Sequence);
492                }
493            }
494        }
495    }
496
497    #[inline]
498    pub fn execute_string_iter_next_pair(
499        &mut self,
500        string_iterator_header_reg: u8,
501        target_key_reg: u8,
502        target_value_reg: u8,
503        branch_offset_lower: u8,
504        branch_offset_upper: u8,
505    ) {
506        let string_iterator =
507            self.get_string_iterator_header_ptr_from_reg(string_iterator_header_reg);
508
509        unsafe {
510            let string_header_addr = (*string_iterator).string_heap_ptr;
511            let string_header_ptr = self
512                .memory
513                .get_heap_const_ptr(string_header_addr as usize)
514                .cast::<VecHeader>();
515
516            let string_header_raw_ptr = self.memory.get_heap_const_ptr(string_header_addr as usize);
517
518            let string_header = &*string_header_ptr;
519            if string_header.padding != VEC_HEADER_MAGIC_CODE {
520                return self.internal_trap(TrapCode::MemoryCorruption);
521            }
522
523            #[cfg(feature = "debug_vm")]
524            if self.debug_operations_enabled {
525                let iter_addr = get_reg!(self, string_iterator_header_reg);
526                let index = (*string_iterator).byte_index;
527                eprintln!(
528                    "string_iter_next: iter_addr: {iter_addr:04X} addr:{string_header_addr:04X} index:{index} len: {}, capacity: {}",
529                    string_header.element_count, string_header.capacity
530                );
531            }
532
533            // Check if we've reached the end
534            if (*string_iterator).byte_index >= string_header.element_count {
535                // Jump to the provided address if we're done
536                let branch_offset = i16_from_u8s!(branch_offset_lower, branch_offset_upper);
537
538                #[cfg(feature = "debug_vm")]
539                {
540                    if self.debug_operations_enabled {
541                        eprintln!("string_iter_next complete. jumping with offset {branch_offset}");
542                    }
543                }
544
545                self.pc = (self.pc as i32 + branch_offset as i32) as usize;
546
547                return;
548            }
549
550            let current_byte_index = (*string_iterator).byte_index as usize;
551            let remaining_byte_count = (string_header.element_count as usize) - current_byte_index;
552            let payload_ptr = string_header_raw_ptr.add(VEC_HEADER_PAYLOAD_OFFSET.0 as usize);
553
554            let remaining_bytes = std::slice::from_raw_parts(
555                payload_ptr.add(current_byte_index),
556                remaining_byte_count,
557            );
558
559            match std::str::from_utf8(remaining_bytes) {
560                Ok(valid_str) => {
561                    if let Some(c) = valid_str.chars().next() {
562                        // Place the decoded character (a Char - u32) into the target register
563                        // Advance the iterator by the actual byte width of the character
564                        let advancement = c.len_utf8() as u16;
565                        (*string_iterator).byte_index += advancement;
566
567                        let raw_u32 = c as u32;
568                        eprintln!(
569                            "raw: {raw_u32} advancement {advancement} -> r{target_value_reg}"
570                        );
571                        set_reg!(self, target_key_reg, (*string_iterator).index);
572                        set_reg!(self, target_value_reg, raw_u32);
573
574                        (*string_iterator).index += 1;
575                    } else {
576                        self.internal_trap(TrapCode::InvalidUtf8Sequence);
577                    }
578                }
579                Err(_) => {
580                    // The string data in the VM memory is corrupted/invalid
581                    self.internal_trap(TrapCode::InvalidUtf8Sequence);
582                }
583            }
584        }
585    }
586}