Skip to main content

mini_bitcoin_script/
tokenizer.rs

1use crate::error::ScriptError;
2use crate::hex::decode_hex;
3use crate::opcode::Opcode;
4use crate::token::Token;
5
6/// Parses raw script bytes into a sequence of tokens.
7///
8/// Walks the byte slice left-to-right, dispatching on each byte:
9/// - `0x01`-`0x4b`: direct push (byte value = data length)
10/// - `0x4c`: OP_PUSHDATA1 (1-byte length prefix)
11/// - `0x4d`: OP_PUSHDATA2 (2-byte little-endian length prefix)
12/// - `0x4e`: OP_PUSHDATA4 (4-byte little-endian length prefix)
13/// - All other bytes: looked up via [`Opcode::from_byte`]
14///
15/// Returns `ScriptError::UnexpectedEndOfScript` if a push-data instruction
16/// extends beyond the end of the byte slice, or
17/// `ScriptError::UnsupportedOpcode` for unrecognized byte values.
18pub fn parse_script(bytes: &[u8]) -> Result<Vec<Token>, ScriptError> {
19    let mut tokens = Vec::new();
20    let mut pos = 0;
21    let len = bytes.len();
22
23    while pos < len {
24        let byte = bytes[pos];
25        pos += 1;
26
27        match byte {
28            // Direct push: byte value is the data length (1-75 bytes)
29            0x01..=0x4b => {
30                let n = byte as usize;
31                if pos + n > len {
32                    return Err(ScriptError::UnexpectedEndOfScript);
33                }
34                tokens.push(Token::PushData(bytes[pos..pos + n].to_vec()));
35                pos += n;
36            }
37
38            // OP_PUSHDATA1: next 1 byte is the length
39            0x4c => {
40                if pos >= len {
41                    return Err(ScriptError::UnexpectedEndOfScript);
42                }
43                let n = bytes[pos] as usize;
44                pos += 1;
45                if pos + n > len {
46                    return Err(ScriptError::UnexpectedEndOfScript);
47                }
48                tokens.push(Token::PushData(bytes[pos..pos + n].to_vec()));
49                pos += n;
50            }
51
52            // OP_PUSHDATA2: next 2 bytes (little-endian) are the length
53            0x4d => {
54                if pos + 2 > len {
55                    return Err(ScriptError::UnexpectedEndOfScript);
56                }
57                let n = u16::from_le_bytes([bytes[pos], bytes[pos + 1]]) as usize;
58                pos += 2;
59                if pos + n > len {
60                    return Err(ScriptError::UnexpectedEndOfScript);
61                }
62                tokens.push(Token::PushData(bytes[pos..pos + n].to_vec()));
63                pos += n;
64            }
65
66            // OP_PUSHDATA4: next 4 bytes (little-endian) are the length
67            0x4e => {
68                if pos + 4 > len {
69                    return Err(ScriptError::UnexpectedEndOfScript);
70                }
71                let n = u32::from_le_bytes([
72                    bytes[pos],
73                    bytes[pos + 1],
74                    bytes[pos + 2],
75                    bytes[pos + 3],
76                ]) as usize;
77                pos += 4;
78                if pos + n > len {
79                    return Err(ScriptError::UnexpectedEndOfScript);
80                }
81                tokens.push(Token::PushData(bytes[pos..pos + n].to_vec()));
82                pos += n;
83            }
84
85            // All other bytes: look up as opcode
86            _ => match Opcode::from_byte(byte) {
87                Some(opcode) => tokens.push(Token::Op(opcode)),
88                None => return Err(ScriptError::UnsupportedOpcode(byte)),
89            },
90        }
91    }
92
93    Ok(tokens)
94}
95
96/// Parses a hex-encoded script string into tokens.
97///
98/// Convenience wrapper that decodes the hex string via [`decode_hex`],
99/// then passes the resulting bytes to [`parse_script`].
100pub fn parse_script_hex(hex: &str) -> Result<Vec<Token>, ScriptError> {
101    let bytes = decode_hex(hex)?;
102    parse_script(&bytes)
103}
104
/// Unit tests for [`parse_script`] and [`parse_script_hex`].
///
/// Covers each push encoding (direct, OP_PUSHDATA1/2/4) in its successful,
/// missing-length-prefix and truncated-payload forms, plus opcode lookup and
/// multi-token scripts.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_script() {
        let tokens = parse_script(&[]).unwrap();
        assert!(tokens.is_empty());
    }

    #[test]
    fn single_opcode() {
        let tokens = parse_script(&[0x76]).unwrap(); // OP_DUP
        assert_eq!(tokens, vec![Token::Op(Opcode::OpDup)]);
    }

    #[test]
    fn op0_parses() {
        let tokens = parse_script(&[0x00]).unwrap();
        assert_eq!(tokens, vec![Token::Op(Opcode::Op0)]);
    }

    #[test]
    fn unsupported_opcode() {
        let err = parse_script(&[0x50]).unwrap_err(); // OP_RESERVED
        assert!(matches!(err, ScriptError::UnsupportedOpcode(0x50)));
    }

    #[test]
    fn direct_push_1_byte() {
        let tokens = parse_script(&[0x01, 0xff]).unwrap();
        assert_eq!(tokens, vec![Token::PushData(vec![0xff])]);
    }

    #[test]
    fn direct_push_3_bytes() {
        let tokens = parse_script(&[0x03, 0xaa, 0xbb, 0xcc]).unwrap();
        assert_eq!(tokens, vec![Token::PushData(vec![0xaa, 0xbb, 0xcc])]);
    }

    #[test]
    fn direct_push_max_75_bytes() {
        // 0x4b is the largest direct-push byte; 0x4c is OP_PUSHDATA1.
        let mut script = vec![0x4b];
        script.extend_from_slice(&[0x11; 75]);
        let tokens = parse_script(&script).unwrap();
        assert_eq!(tokens, vec![Token::PushData(vec![0x11; 75])]);
    }

    #[test]
    fn direct_push_truncated() {
        let err = parse_script(&[0x03, 0xaa, 0xbb]).unwrap_err();
        assert!(matches!(err, ScriptError::UnexpectedEndOfScript));
    }

    #[test]
    fn pushdata1() {
        let tokens = parse_script(&[0x4c, 0x02, 0xde, 0xad]).unwrap();
        assert_eq!(tokens, vec![Token::PushData(vec![0xde, 0xad])]);
    }

    #[test]
    fn pushdata1_zero_length() {
        let tokens = parse_script(&[0x4c, 0x00]).unwrap();
        assert_eq!(tokens, vec![Token::PushData(vec![])]);
    }

    #[test]
    fn pushdata1_missing_length() {
        let err = parse_script(&[0x4c]).unwrap_err();
        assert!(matches!(err, ScriptError::UnexpectedEndOfScript));
    }

    #[test]
    fn pushdata1_truncated_data() {
        let err = parse_script(&[0x4c, 0x05, 0x01, 0x02]).unwrap_err();
        assert!(matches!(err, ScriptError::UnexpectedEndOfScript));
    }

    #[test]
    fn pushdata2() {
        // Length = 0x0003 (little-endian: 03 00)
        let tokens = parse_script(&[0x4d, 0x03, 0x00, 0xaa, 0xbb, 0xcc]).unwrap();
        assert_eq!(tokens, vec![Token::PushData(vec![0xaa, 0xbb, 0xcc])]);
    }

    #[test]
    fn pushdata2_missing_length() {
        let err = parse_script(&[0x4d, 0x03]).unwrap_err();
        assert!(matches!(err, ScriptError::UnexpectedEndOfScript));
    }

    #[test]
    fn pushdata2_truncated_data() {
        // Length prefix says 5 bytes, only 2 follow.
        let err = parse_script(&[0x4d, 0x05, 0x00, 0x01, 0x02]).unwrap_err();
        assert!(matches!(err, ScriptError::UnexpectedEndOfScript));
    }

    #[test]
    fn pushdata4() {
        // Length = 0x00000003 (little-endian: 03 00 00 00)
        let tokens = parse_script(&[0x4e, 0x03, 0x00, 0x00, 0x00, 0xaa, 0xbb, 0xcc]).unwrap();
        assert_eq!(tokens, vec![Token::PushData(vec![0xaa, 0xbb, 0xcc])]);
    }

    #[test]
    fn pushdata4_missing_length() {
        let err = parse_script(&[0x4e, 0x01, 0x00]).unwrap_err();
        assert!(matches!(err, ScriptError::UnexpectedEndOfScript));
    }

    #[test]
    fn pushdata4_truncated_data() {
        // Length prefix says 5 bytes, only 2 follow.
        let err = parse_script(&[0x4e, 0x05, 0x00, 0x00, 0x00, 0x01, 0x02]).unwrap_err();
        assert!(matches!(err, ScriptError::UnexpectedEndOfScript));
    }

    #[test]
    fn push_payload_is_not_interpreted_as_opcodes() {
        // 0x50 on its own is rejected (see `unsupported_opcode`), but inside
        // a push payload it is plain data.
        let tokens = parse_script(&[0x02, 0x50, 0x50]).unwrap();
        assert_eq!(tokens, vec![Token::PushData(vec![0x50, 0x50])]);
    }

    #[test]
    fn push_followed_by_opcode() {
        // The cursor must land exactly after the payload.
        let tokens = parse_script(&[0x01, 0xff, 0x76]).unwrap();
        assert_eq!(
            tokens,
            vec![Token::PushData(vec![0xff]), Token::Op(Opcode::OpDup)]
        );
    }

    #[test]
    fn multi_token_script() {
        // OP_DUP OP_HASH160 <20 bytes> OP_EQUALVERIFY OP_CHECKSIG
        let mut script = vec![0x76, 0xa9, 0x14]; // OP_DUP, OP_HASH160, push 20 bytes
        script.extend_from_slice(&[0xab; 20]); // 20 bytes of data
        script.push(0x88); // OP_EQUALVERIFY
        script.push(0xac); // OP_CHECKSIG
        let tokens = parse_script(&script).unwrap();
        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0], Token::Op(Opcode::OpDup));
        assert_eq!(tokens[1], Token::Op(Opcode::OpHash160));
        assert_eq!(tokens[2], Token::PushData(vec![0xab; 20]));
        assert_eq!(tokens[3], Token::Op(Opcode::OpEqualVerify));
        assert_eq!(tokens[4], Token::Op(Opcode::OpCheckSig));
    }

    #[test]
    fn parse_script_hex_roundtrip() {
        let hex = "76a914" // OP_DUP OP_HASH160 push-20
            .to_string()
            + &"ab".repeat(20) // 20 bytes
            + "88ac"; // OP_EQUALVERIFY OP_CHECKSIG
        let tokens = parse_script_hex(&hex).unwrap();
        assert_eq!(tokens.len(), 5);
        assert_eq!(tokens[0], Token::Op(Opcode::OpDup));
        assert_eq!(tokens[1], Token::Op(Opcode::OpHash160));
        assert_eq!(tokens[2], Token::PushData(vec![0xab; 20]));
        assert_eq!(tokens[3], Token::Op(Opcode::OpEqualVerify));
        assert_eq!(tokens[4], Token::Op(Opcode::OpCheckSig));
    }

    #[test]
    fn parse_script_hex_invalid() {
        let err = parse_script_hex("zzzz").unwrap_err();
        assert!(matches!(err, ScriptError::InvalidHex));
    }
}