vb6parse/parsers/
vb6.rs

1use bstr::{BStr, ByteSlice};
2
3use winnow::{
4    ascii::{digit1, line_ending, space1, Caseless},
5    combinator::{alt, delimited, repeat},
6    error::ErrMode,
7    stream::Stream,
8    token::{literal, one_of, take_till, take_until, take_while},
9    Parser,
10};
11
12use crate::{errors::VB6ErrorKind, language::VB6Token, parsers::VB6Stream};
13
/// Result type returned by the parsers in this module.
///
/// The error side is a winnow [`ErrMode`] wrapping a [`VB6ErrorKind`].
pub type VB6Result<T> = Result<T, ErrMode<VB6ErrorKind>>;
15
16/// Parses a VB6 end-of-line comment.
17///
18/// The comment starts with a single quote and continues until the end of the
19/// line. It includes the single quote, but excludes the carriage return
20/// character, the newline character, and it does not consume the carriage
21/// return or newline character.
22///
23/// # Arguments
24///
25/// * `input` - The input to parse.
26///
27/// # Errors
28///
29/// Will return an error if it is not able to parse a comment. This can happen
30/// if the comment is not terminated by a newline character, or if the comment
31/// lacks a single quote.
32///
33/// # Returns
34///
35/// The comment with the single quote, but without carriage return, and
36/// newline characters.
37///
38/// # Example
39///
40/// ```rust
41/// use winnow::Parser;
42/// use vb6parse::parsers::{vb6::line_comment_parse, VB6Stream};
43///
44/// let mut input = VB6Stream::new("line_comment.bas".to_owned(), "' This is a comment\r\n".as_bytes());
45/// let comment = line_comment_parse.parse_next(&mut input).unwrap();
46///
47/// assert_eq!(comment, "' This is a comment");
48/// ```
49pub fn line_comment_parse<'a>(input: &mut VB6Stream<'a>) -> VB6Result<&'a BStr> {
50    let comment = ('\'', take_till(0.., (b"\r\n", b"\n", b"\r")))
51        .take()
52        .parse_next(input)?;
53
54    Ok(comment)
55}
56
57/// Parses a VB6 variable name.
58///
59/// The variable name starts with a letter and can contain letters, numbers, and underscores.
60///
61/// # Arguments
62///
63/// * `input` - The input to parse.
64///
65/// # Errors
66///
67/// If the variable name is too long, it will return an error.
68///
69/// # Returns
70///
71/// The VB6 variable name.
72///
73/// # Example
74///
75/// ```rust
76/// use vb6parse::parsers::{vb6::variable_name_parse, VB6Stream};
77///
78/// let mut input = VB6Stream::new("variable_name_test.bas".to_owned(), "variable_name".as_bytes());
79/// let variable_name = variable_name_parse(&mut input).unwrap();
80///
81/// assert_eq!(variable_name, "variable_name");
82/// ```
83pub fn variable_name_parse<'a>(input: &mut VB6Stream<'a>) -> VB6Result<&'a BStr> {
84    let variable_name = (
85        one_of(('a'..='z', 'A'..='Z', 128..=255)),
86        take_while(0.., ('_', 'a'..='z', 'A'..='Z', '0'..='9', 128..=255)),
87    )
88        .take()
89        .parse_next(input)?;
90
91    if variable_name.len() >= 255 {
92        return Err(ErrMode::Cut(VB6ErrorKind::VariableNameTooLong));
93    }
94
95    Ok(variable_name)
96}
97
98pub fn take_until_line_ending<'a>(input: &mut VB6Stream<'a>) -> VB6Result<&'a BStr> {
99    alt((take_until(1.., "\r\n"), take_until(1.., "\n"))).parse_next(input)
100}
101
102/// Parses a VB6 keyword.
103///
104/// The keyword is case-insensitive.
105///
106/// # Arguments
107///
108/// * `keyword` - The keyword to parse.
109///
110/// # Errors
111///
112/// If the keyword is not found, it will return an error.
113///
114/// # Returns
115///
116/// The keyword.
117///
118/// # Example
119///
120/// ```rust
121/// use vb6parse::{
122///     parsers::{vb6::keyword_parse, VB6Stream},
123///     errors::{VB6ErrorKind, VB6Error},
124/// };
125///
126/// use bstr::{BStr, ByteSlice};
127///
128/// let mut input1 = VB6Stream::new("test1.bas", "Option".as_bytes());
129/// let mut input2 = VB6Stream::new("test2.bas","op do".as_bytes());
130///
131/// let mut op_parse = keyword_parse("Op");
132///
133/// let keyword = op_parse(&mut input1);
134/// let keyword2 = op_parse(&mut input2);
135///
136/// assert!(keyword.is_err());
137/// assert_eq!(keyword2.unwrap(), b"op".as_bstr());
138/// ```
139pub fn keyword_parse<'a>(
140    keyword: &'static str,
141) -> impl FnMut(&mut VB6Stream<'a>) -> VB6Result<&'a BStr> {
142    move |input: &mut VB6Stream<'a>| -> VB6Result<&'a BStr> {
143        let checkpoint = input.checkpoint();
144
145        let word = Caseless(keyword).parse_next(input)?;
146
147        if one_of::<VB6Stream, _, VB6ErrorKind>(('_', 'a'..='z', 'A'..='Z', '0'..='9'))
148            .parse_next(input)
149            .is_ok()
150        {
151            input.reset(&checkpoint);
152
153            return Err(ErrMode::Backtrack(VB6ErrorKind::KeywordNotFound));
154        }
155
156        Ok(word)
157    }
158}
159
160/// Parses a VB6 string that may or may not contain double quotes (escaped by using two in a row).
161/// This parser will return the string without the double quotes.
162///
163/// # Example
164///
165/// ```
166/// use crate::*;
167/// use vb6parse::parsers::VB6Stream;
168/// use vb6parse::parsers::vb6::string_parse;
169///
170/// let input_line2 = b"\"This is a string\"";
171/// let input_line1 = b"\"This is also \"\"a\"\" string\"";
172///
173/// let mut stream1 = VB6Stream::new("", input_line1);
174/// let mut stream2 = VB6Stream::new("", input_line2);
175///
176/// let string1 = string_parse(&mut stream1).unwrap();
177/// let string2 = string_parse(&mut stream2).unwrap();
178///
179/// assert_eq!(string1, "This is also \"\"a\"\" string");
180/// assert_eq!(string2, "This is a string");
181/// ```
182pub fn string_parse<'a>(input: &mut VB6Stream<'a>) -> VB6Result<&'a BStr> {
183    // We parse and build the string even though we won't actually return it
184    // since we will just directly build a BStr from the input stream
185    // THIS IS A HORRIBLE HACK! but at least it works.
186    // TODO: Figure out how to actually get this right. Perhaps when we
187    // change over all the BStrs to be owned types.
188    let mut build_string =
189        repeat(0.., string_fragment_parse).fold(Vec::new, |mut string, fragment| {
190            match fragment {
191                StringFragment::Literal(literal) => {
192                    string.extend_from_slice(literal.as_bytes());
193                }
194                StringFragment::EscapedDoubleQuote(double_qoutes) => {
195                    string.extend_from_slice(double_qoutes.as_bytes());
196                }
197            }
198            string
199        });
200
201    "\"".parse_next(input)?;
202    let start_index = input.index;
203
204    build_string.parse_next(input)?;
205
206    let end_index = input.index;
207    "\"".parse_next(input)?;
208
209    Ok(&input.stream[start_index..end_index])
210}
211
/// One piece of the body of a VB6 string literal, as produced by
/// `string_fragment_parse`.
enum StringFragment<'a> {
    /// A run of one or more bytes that contains no double quote.
    Literal(&'a BStr),
    /// An escaped double quote, i.e. the two-byte sequence `""`.
    EscapedDoubleQuote(&'a BStr),
}
216
217fn string_fragment_parse<'a>(input: &mut VB6Stream<'a>) -> VB6Result<StringFragment<'a>> {
218    let fragment = alt((
219        "\"\"".take().map(StringFragment::EscapedDoubleQuote),
220        take_until(1.., "\"").map(StringFragment::Literal),
221    ))
222    .parse_next(input)?;
223
224    Ok(fragment)
225}
226
227/// Parses VB6 code into a token stream.
228///
229///
230/// # Arguments
231///
232/// * `input` - The input to parse.
233///
234/// # Returns
235///
236/// A vector of VB6 tokens.
237///
238/// # Errors
239///
240/// If the parser encounters an unknown token, it will return an error.
241///
242/// # Example
243///
244/// ```rust
245/// use vb6parse::language::VB6Token;
246/// use vb6parse::parsers::{VB6Stream, vb6_parse};
247///
248/// use bstr::{BStr, ByteSlice};
249///
250/// let mut input = VB6Stream::new("test.bas", b"Dim x As Integer");
251/// let tokens = vb6_parse(&mut input).unwrap();
252///
253/// assert_eq!(tokens.len(), 7);
254/// assert_eq!(tokens[0], VB6Token::DimKeyword("Dim".into()));
255/// assert_eq!(tokens[1], VB6Token::Whitespace(" ".into()));
256/// assert_eq!(tokens[2], VB6Token::VariableName("x".into()));
257/// assert_eq!(tokens[3], VB6Token::Whitespace(" ".into()));
258/// assert_eq!(tokens[4], VB6Token::AsKeyword("As".into()));
259/// assert_eq!(tokens[5], VB6Token::Whitespace(" ".into()));
260/// assert_eq!(tokens[6], VB6Token::IntegerKeyword("Integer".into()));
261/// ```
262pub fn vb6_parse<'a>(input: &mut VB6Stream<'a>) -> VB6Result<Vec<VB6Token<'a>>> {
263    let mut tokens = Vec::new();
264
265    if !is_english_code(input.stream) {
266        return Err(ErrMode::Cut(VB6ErrorKind::LikelyNonEnglishCharacterSet));
267    }
268
269    while !input.is_empty() {
270        // The file should end if there is a null byte.
271        if literal::<_, _, VB6ErrorKind>('\0')
272            .parse_next(input)
273            .is_ok()
274        {
275            break;
276        }
277
278        if let Ok(token) = line_ending::<VB6Stream<'a>, VB6ErrorKind>.parse_next(input) {
279            let token = VB6Token::Newline(token);
280            tokens.push(token);
281            continue;
282        }
283
284        if let Ok(token) = line_comment_parse.parse_next(input) {
285            let token = VB6Token::Comment(token);
286            tokens.push(token);
287            continue;
288        }
289
290        if let Ok(token) = delimited::<VB6Stream<'a>, _, &BStr, _, VB6ErrorKind, _, _, _>(
291            '\"',
292            take_till(0.., '\"'),
293            '\"',
294        )
295        .take()
296        .parse_next(input)
297        {
298            let token = VB6Token::StringLiteral(token);
299            tokens.push(token);
300            continue;
301        }
302
303        if let Ok(token) = vb6_token_parse.parse_next(input) {
304            tokens.push(token);
305            continue;
306        }
307
308        return Err(ErrMode::Cut(VB6ErrorKind::UnknownToken));
309    }
310
311    Ok(tokens)
312}
313
314#[must_use]
315pub fn is_english_code(content: &BStr) -> bool {
316    // We are looking to see if we have a large-ish number of higher half ANSI characters.
317    let character_count = content.len();
318    let higher_half_character_count = content.iter().filter(|&c| *c >= 128).count();
319
320    higher_half_character_count == 0 || (100 * higher_half_character_count / character_count) < 1
321}
322
323fn vb6_keyword_parse<'a>(input: &mut VB6Stream<'a>) -> VB6Result<VB6Token<'a>> {
324    // 'alt' only allows for a limited number of parsers to be passed in.
325    // so we need to chain the 'alt' parsers together.
326    alt((
327        alt((
328            keyword_parse("Type").map(|token: &BStr| VB6Token::TypeKeyword(token)),
329            keyword_parse("Optional").map(|token: &BStr| VB6Token::OptionalKeyword(token)),
330            keyword_parse("Option").map(|token: &BStr| VB6Token::OptionKeyword(token)),
331            keyword_parse("Explicit").map(|token: &BStr| VB6Token::ExplicitKeyword(token)),
332            keyword_parse("Private").map(|token: &BStr| VB6Token::PrivateKeyword(token)),
333            keyword_parse("Public").map(|token: &BStr| VB6Token::PublicKeyword(token)),
334            keyword_parse("Dim").map(|token: &BStr| VB6Token::DimKeyword(token)),
335            keyword_parse("With").map(|token: &BStr| VB6Token::WithKeyword(token)),
336            keyword_parse("Declare").map(|token: &BStr| VB6Token::DeclareKeyword(token)),
337            keyword_parse("Lib").map(|token: &BStr| VB6Token::LibKeyword(token)),
338            keyword_parse("Const").map(|token: &BStr| VB6Token::ConstKeyword(token)),
339            keyword_parse("As").map(|token: &BStr| VB6Token::AsKeyword(token)),
340            keyword_parse("Enum").map(|token: &BStr| VB6Token::EnumKeyword(token)),
341            keyword_parse("Long").map(|token: &BStr| VB6Token::LongKeyword(token)),
342            keyword_parse("Integer").map(|token: &BStr| VB6Token::IntegerKeyword(token)),
343            keyword_parse("Boolean").map(|token: &BStr| VB6Token::BooleanKeyword(token)),
344            keyword_parse("Byte").map(|token: &BStr| VB6Token::ByteKeyword(token)),
345            keyword_parse("Single").map(|token: &BStr| VB6Token::SingleKeyword(token)),
346            keyword_parse("String").map(|token: &BStr| VB6Token::StringKeyword(token)),
347        )),
348        alt((
349            keyword_parse("True").map(|token: &BStr| VB6Token::TrueKeyword(token)),
350            keyword_parse("False").map(|token: &BStr| VB6Token::FalseKeyword(token)),
351            keyword_parse("Function").map(|token: &BStr| VB6Token::FunctionKeyword(token)),
352            keyword_parse("Sub").map(|token: &BStr| VB6Token::SubKeyword(token)),
353            keyword_parse("End").map(|token: &BStr| VB6Token::EndKeyword(token)),
354            keyword_parse("If").map(|token: &BStr| VB6Token::IfKeyword(token)),
355            keyword_parse("Else").map(|token: &BStr| VB6Token::ElseKeyword(token)),
356            keyword_parse("And").map(|token: &BStr| VB6Token::AndKeyword(token)),
357            keyword_parse("Or").map(|token: &BStr| VB6Token::OrKeyword(token)),
358            keyword_parse("Not").map(|token: &BStr| VB6Token::NotKeyword(token)),
359            keyword_parse("Then").map(|token: &BStr| VB6Token::ThenKeyword(token)),
360            keyword_parse("For").map(|token: &BStr| VB6Token::ForKeyword(token)),
361            keyword_parse("To").map(|token: &BStr| VB6Token::ToKeyword(token)),
362            keyword_parse("Step").map(|token: &BStr| VB6Token::StepKeyword(token)),
363            keyword_parse("Next").map(|token: &BStr| VB6Token::NextKeyword(token)),
364            keyword_parse("ReDim").map(|token: &BStr| VB6Token::ReDimKeyword(token)),
365            keyword_parse("ByVal").map(|token: &BStr| VB6Token::ByValKeyword(token)),
366            keyword_parse("ByRef").map(|token: &BStr| VB6Token::ByRefKeyword(token)),
367            keyword_parse("Goto").map(|token: &BStr| VB6Token::GotoKeyword(token)),
368            keyword_parse("Exit").map(|token: &BStr| VB6Token::ExitKeyword(token)),
369        )),
370    ))
371    .parse_next(input)
372}
373
374fn vb6_symbol_parse<'a>(input: &mut VB6Stream<'a>) -> VB6Result<VB6Token<'a>> {
375    // 'alt' only allows for a limited number of parsers to be passed in.
376    // so we need to chain the 'alt' parsers together.
377    alt((
378        alt((
379            "=".map(|token: &BStr| VB6Token::EqualityOperator(token)),
380            "$".map(|token: &BStr| VB6Token::DollarSign(token)),
381            "_".map(|token: &BStr| VB6Token::Underscore(token)),
382            "&".map(|token: &BStr| VB6Token::Ampersand(token)),
383            "%".map(|token: &BStr| VB6Token::Percent(token)),
384            "#".map(|token: &BStr| VB6Token::Octothorpe(token)),
385            "<".map(|token: &BStr| VB6Token::LessThanOperator(token)),
386            ">".map(|token: &BStr| VB6Token::GreaterThanOperator(token)),
387            "(".map(|token: &BStr| VB6Token::LeftParanthesis(token)),
388            ")".map(|token: &BStr| VB6Token::RightParanthesis(token)),
389            ",".map(|token: &BStr| VB6Token::Comma(token)),
390            "+".map(|token: &BStr| VB6Token::AdditionOperator(token)),
391            "-".map(|token: &BStr| VB6Token::SubtractionOperator(token)),
392            "*".map(|token: &BStr| VB6Token::MultiplicationOperator(token)),
393            "\\".map(|token: &BStr| VB6Token::BackwardSlashOperator(token)),
394            "/".map(|token: &BStr| VB6Token::DivisionOperator(token)),
395            ".".map(|token: &BStr| VB6Token::PeriodOperator(token)),
396            ":".map(|token: &BStr| VB6Token::ColonOperator(token)),
397            "^".map(|token: &BStr| VB6Token::ExponentiationOperator(token)),
398        )),
399        alt((
400            "!".map(|token: &BStr| VB6Token::ExclamationMark(token)),
401            "[".map(|token: &BStr| VB6Token::LeftSquareBracket(token)),
402            "]".map(|token: &BStr| VB6Token::RightSquareBracket(token)),
403            ";".map(|token: &BStr| VB6Token::Semicolon(token)),
404            "@".map(|token: &BStr| VB6Token::AtSign(token)),
405        )),
406    ))
407    .parse_next(input)
408}
409
410fn vb6_token_parse<'a>(input: &mut VB6Stream<'a>) -> VB6Result<VB6Token<'a>> {
411    // 'alt' only allows for a limited number of parsers to be passed in.
412    // so we need to chain the 'alt' parsers together.
413    alt((
414        (line_comment_parse).map(|token: &BStr| VB6Token::Comment(token)),
415        vb6_keyword_parse,
416        vb6_symbol_parse,
417        alt((
418            digit1.map(|token: &BStr| VB6Token::Number(token)),
419            variable_name_parse.map(|token: &BStr| VB6Token::VariableName(token)),
420            space1.map(|token: &BStr| VB6Token::Whitespace(token)),
421        )),
422    ))
423    .parse_next(input)
424}
425
// Unit tests for the parsers defined in this file.
#[cfg(test)]
mod test {
    use super::*;
    use bstr::ByteSlice;

    // A plain string literal is returned without its surrounding quotes.
    #[test]
    fn no_escaped_double_quote_string_parse() {
        let input_line = b"\"This is a string\"\r\n";
        let mut stream = VB6Stream::new("", input_line);
        let string = string_parse(&mut stream).unwrap();

        assert_eq!(string, "This is a string");
    }

    // Escaped (doubled) quotes stay doubled in the returned slice.
    #[test]
    fn contains_escaped_double_quote_string_parse() {
        let input_line = b"\"This is also \"\"a\"\" string\"\r\n";
        let mut stream = VB6Stream::new("", input_line);
        let string = string_parse(&mut stream).unwrap();

        assert_eq!(string, "This is also \"\"a\"\" string");
    }

    // A keyword must not match when it is only the prefix of a longer word.
    #[test]
    fn keyword() {
        let mut input1 = VB6Stream::new("", "option".as_bytes());
        let mut input2 = VB6Stream::new("", "op do".as_bytes());

        let mut op_parse = keyword_parse("op");

        let keyword = op_parse(&mut input1);
        let keyword2 = op_parse(&mut input2);

        assert!(keyword.is_err());
        assert!(keyword2.is_ok());
        assert_eq!(keyword2.unwrap(), b"op".as_bstr());
    }

    // A comment terminated by "\r\n" excludes the line ending.
    #[test]
    fn eol_comment_carriage_return_newline() {
        use crate::parsers::VB6Stream;
        use crate::vb6::line_comment_parse;

        let mut input = VB6Stream::new("", "' This is a comment\r\n".as_bytes());
        let comment = line_comment_parse(&mut input).unwrap();

        assert_eq!(comment, "' This is a comment");
    }

    // A comment terminated by "\n" excludes the line ending.
    #[test]
    fn eol_comment_newline() {
        use crate::parsers::VB6Stream;
        use crate::vb6::line_comment_parse;

        let mut input = VB6Stream::new("", "' This is a comment\n".as_bytes());
        let comment = line_comment_parse(&mut input).unwrap();

        assert_eq!(comment, "' This is a comment");
    }

    // A comment terminated by a lone "\r" excludes the line ending.
    #[test]
    fn eol_comment_carriage_return() {
        use crate::parsers::VB6Stream;
        use crate::vb6::line_comment_parse;

        let mut input = VB6Stream::new("", "' This is a comment\r".as_bytes());
        let comment = line_comment_parse(&mut input).unwrap();

        assert_eq!(comment, "' This is a comment");
    }

    // A comment that runs to the end of the input is still accepted.
    #[test]
    fn eol_comment_eof() {
        use crate::parsers::VB6Stream;
        use crate::vb6::line_comment_parse;

        let mut input = VB6Stream::new("", "' This is a comment".as_bytes());
        let comment = line_comment_parse(&mut input).unwrap();

        assert_eq!(comment, "' This is a comment");
    }

    // Underscores are accepted inside a variable name.
    #[test]
    fn variable_name() {
        use crate::parsers::VB6Stream;
        use crate::vb6::variable_name_parse;

        let mut input = VB6Stream::new("", "variable_name".as_bytes());

        let variable_name = variable_name_parse(&mut input).unwrap();

        assert_eq!(variable_name, "variable_name");
    }

    // A simple declaration is split into keyword, whitespace and name tokens.
    #[test]
    fn vb6_parse() {
        use crate::parsers::VB6Stream;
        use crate::vb6::{vb6_parse, VB6Token};

        let mut input = VB6Stream::new("", "Dim x As Integer".as_bytes());
        let tokens = vb6_parse(&mut input).unwrap();

        assert_eq!(tokens.len(), 7);
        assert_eq!(tokens[0], VB6Token::DimKeyword("Dim".into()));
        assert_eq!(tokens[1], VB6Token::Whitespace(" ".into()));
        assert_eq!(tokens[2], VB6Token::VariableName("x".into()));
        assert_eq!(tokens[3], VB6Token::Whitespace(" ".into()));
        assert_eq!(tokens[4], VB6Token::AsKeyword("As".into()));
        assert_eq!(tokens[5], VB6Token::Whitespace(" ".into()));
        assert_eq!(tokens[6], VB6Token::IntegerKeyword("Integer".into()));
    }

    // Source with many non-ASCII bytes is rejected before tokenizing.
    // The fixture below is intentionally full of non-ASCII characters; do not
    // "repair" it, the test depends on their presence.
    #[test]
    fn non_english_parse() {
        use crate::vb6::vb6_parse;
        use crate::vb6::VB6Stream;

        let code = "Option Explicit\r
Public app_path As String  '���|�]�w�X\r
Public ����H����ԤH��(1 To 2, 1 To 2) As Integer    '�����Ԩ���H�Ƭ�����(1.�ϥΪ�/2.�q��,1.�`�@�H��/2.�ثe�ĴX��)\r
Public ����ݾ��H��������(1 To 2, 1 To 3) As Integer    '����ݾ�����H���s��������(1.�ϥΪ�/2.�q��,1.���W����/2~3.�ݾ������n��s��)\r
Public �Ĥ@���Ұ�Ū�J�{�ǼаO As Boolean    '�Ĥ@���Ұʵ{��Ū�J�{�ǼаO��\r
Attribute �Ĥ@���Ұ�Ū�J�{�ǼаO.VB_VarUserMemId = 1073741834\r
Public �����ˬd����ؼм� As Integer    '�����ˬd����p�ƾ��ؼм�\r
Attribute �����ˬd����ؼм�.VB_VarUserMemId = 1073741836\r
Public �q������O�_�w�X�{ As Boolean    '���ҳq������O�_�w�g�X�{�Ȯ��ܼ�\r
Attribute �q������O�_�w�X�{.VB_VarUserMemId = 1073741837\r
Public ProgramIsOnWine As Boolean    '�{���O�_�B��Wine���ҤU����\r
Attribute ProgramIsOnWine.VB_VarUserMemId = 1073741838";

        let mut input = VB6Stream::new("", code.as_bytes());

        let result = vb6_parse(&mut input);

        assert!(result.is_err());
        assert!(matches!(
            result.unwrap_err(),
            ErrMode::Cut(VB6ErrorKind::LikelyNonEnglishCharacterSet)
        ));
    }

    // Several keyword parsers can be applied one after another to one input.
    #[test]
    fn multi_keyword() {
        use crate::vb6::keyword_parse;

        let mut input = VB6Stream::new("", "Option As Integer".as_bytes());

        let key1 = keyword_parse("Option").parse_next(&mut input).unwrap();

        let _ = space1::<_, VB6ErrorKind>.parse_next(&mut input);

        let key2 = keyword_parse("As").parse_next(&mut input).unwrap();

        let _ = space1::<_, VB6ErrorKind>.parse_next(&mut input);

        let key3 = keyword_parse("Integer").parse_next(&mut input).unwrap();

        assert_eq!(key1, "Option");
        assert_eq!(key2, "As");
        assert_eq!(key3, "Integer");
    }
}