sgf_parse/
parser.rs

1use std::ptr::NonNull;
2
3use crate::go;
4use crate::lexer::{tokenize, LexerError, Token};
5use crate::unknown_game;
6use crate::{GameTree, GameType, SgfNode, SgfProp};
7
8/// Returns the [`GameTree`] values parsed from the provided text using default parsing options.
9///
10/// This function will attempt to convert non-FF\[4\] files to FF\[4\] if possible. Check out
11/// [`parse_with_options`] if you want to change the default behavior.
12///
13/// # Errors
14/// If the text can't be parsed as an SGF FF\[4\] collection, then an error is returned.
15///
16/// # Examples
17/// ```
18/// use sgf_parse::{parse, GameType};
19///
20/// let sgf = "(;SZ[9]C[Some comment];B[de];W[fe])(;B[de];W[ff])";
21/// let gametrees = parse(sgf).unwrap();
22/// assert!(gametrees.len() == 2);
23/// assert!(gametrees.iter().all(|gametree| gametree.gametype() == GameType::Go));
24/// ```
25pub fn parse(text: &str) -> Result<Vec<GameTree>, SgfParseError> {
26    parse_with_options(text, &ParseOptions::default())
27}
28
29/// Returns the [`GameTree`] values parsed from the provided text.
30///
31/// # Errors
32/// If the text can't be parsed as an SGF FF\[4\] collection, then an error is returned.
33///
34/// # Examples
35/// ```
36/// use sgf_parse::{parse_with_options, ParseOptions, GameType, SgfParseError};
37///
38/// // Default options
39/// let sgf = "(;SZ[9]C[Some comment];B[de];W[fe])(;B[de];W[ff])";
40/// let gametrees = parse_with_options(sgf, &ParseOptions::default()).unwrap();
41/// assert!(gametrees.len() == 2);
42/// assert!(gametrees.iter().all(|gametree| gametree.gametype() == GameType::Go));
43///
44/// // Strict FF[4] identifiers
45/// let sgf = "(;SZ[9]CoPyright[Julian Andrews 2025];B[de];W[fe])(;B[de];W[ff])";
46/// let parse_options = ParseOptions {
47///     convert_mixed_case_identifiers: false,
48///     ..ParseOptions::default()
49/// };
50/// let result = parse_with_options(sgf, &parse_options);
51/// assert_eq!(result, Err(SgfParseError::InvalidFF4Property));
52/// ```
53pub fn parse_with_options(
54    text: &str,
55    options: &ParseOptions,
56) -> Result<Vec<GameTree>, SgfParseError> {
57    let text = text.trim();
58    let tokens = if options.lenient {
59        tokenize(text)
60            .take_while(Result::is_ok)
61            .map(|result| result.unwrap().0)
62            .collect()
63    } else {
64        tokenize(text)
65            .map(|result| match result {
66                Err(e) => Err(SgfParseError::LexerError(e)),
67                Ok((token, _span)) => Ok(token),
68            })
69            .collect::<Result<Vec<_>, _>>()?
70    };
71    split_by_gametree(&tokens, options.lenient)?
72        .into_iter()
73        .map(|tokens| match find_gametype(tokens)? {
74            GameType::Go => parse_gametree::<go::Prop>(tokens, options),
75            GameType::Unknown => parse_gametree::<unknown_game::Prop>(tokens, options),
76        })
77        .collect::<Result<_, _>>()
78}
79
80/// Options for parsing SGF files.
81///
82/// # Examples
83/// See [`parse_with_options`] for usage examples.
84pub struct ParseOptions {
85    /// Whether to allow conversion of FF\[3\] mixed case identifiers to FF\[4\].
86    ///
87    /// All lower case letters are dropped.
88    /// This should allow parsing any older files which are valid, but not valid FF\[4\].
89    pub convert_mixed_case_identifiers: bool,
90    /// Whether to use lenient parsing.
91    ///
92    /// In lenient mode, the parser should never return an error, but will instead parse the SGF
93    /// until it hits an error, and then return whatever it's managed to parse.
94    pub lenient: bool,
95}
96
97impl Default for ParseOptions {
98    fn default() -> Self {
99        ParseOptions {
100            convert_mixed_case_identifiers: true,
101            lenient: false,
102        }
103    }
104}
105
106/// Error type for failures parsing sgf from text.
107#[derive(Debug, Clone, Copy, PartialEq, Eq)]
108pub enum SgfParseError {
109    LexerError(LexerError),
110    UnexpectedGameTreeStart,
111    UnexpectedGameTreeEnd,
112    UnexpectedProperty,
113    UnexpectedEndOfData,
114    UnexpectedGameType,
115    InvalidFF4Property,
116}
117
118impl From<LexerError> for SgfParseError {
119    fn from(error: LexerError) -> Self {
120        Self::LexerError(error)
121    }
122}
123
124impl std::fmt::Display for SgfParseError {
125    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
126        match self {
127            SgfParseError::LexerError(e) => write!(f, "Error tokenizing: {e}"),
128            SgfParseError::UnexpectedGameTreeStart => write!(f, "Unexpected start of game tree"),
129            SgfParseError::UnexpectedGameTreeEnd => write!(f, "Unexpected end of game tree"),
130            SgfParseError::UnexpectedProperty => write!(f, "Unexpected property"),
131            SgfParseError::UnexpectedEndOfData => write!(f, "Unexpected end of data"),
132            SgfParseError::UnexpectedGameType => write!(f, "Unexpected game type"),
133            SgfParseError::InvalidFF4Property => {
134                write!(
135                    f,
136                    "Invalid FF[4] property without `convert_mixed_case_identifiers`"
137                )
138            }
139        }
140    }
141}
142
143impl std::error::Error for SgfParseError {}
144
145// Split the tokens up into individual gametrees.
146//
147// This will let us easily scan each gametree for GM properties.
148// Only considers StartGameTree/EndGameTree tokens.
149fn split_by_gametree(tokens: &[Token], lenient: bool) -> Result<Vec<&[Token]>, SgfParseError> {
150    let mut gametrees = vec![];
151    let mut gametree_depth: u64 = 0;
152    let mut slice_start = 0;
153    for (i, token) in tokens.iter().enumerate() {
154        match token {
155            Token::StartGameTree => gametree_depth += 1,
156            Token::EndGameTree => {
157                if gametree_depth == 0 {
158                    if lenient {
159                        break;
160                    } else {
161                        return Err(SgfParseError::UnexpectedGameTreeEnd);
162                    }
163                }
164                gametree_depth -= 1;
165                if gametree_depth == 0 {
166                    gametrees.push(&tokens[slice_start..=i]);
167                    slice_start = i + 1;
168                }
169            }
170            _ => {}
171        }
172    }
173    if gametree_depth != 0 {
174        if lenient {
175            // For lenient parsing assume all remaining tokens are part of the last gametree.
176            gametrees.push(&tokens[slice_start..]);
177        } else {
178            return Err(SgfParseError::UnexpectedEndOfData);
179        }
180    }
181
182    Ok(gametrees)
183}
184
185// Parse a single gametree of a known type.
186fn parse_gametree<Prop: SgfProp>(
187    tokens: &[Token],
188    options: &ParseOptions,
189) -> Result<GameTree, SgfParseError>
190where
191    SgfNode<Prop>: std::convert::Into<GameTree>,
192{
193    // TODO: Rewrite this without `unsafe`
194    let mut collection: Vec<SgfNode<Prop>> = vec![];
195    // //// Pointer to the `Vec` of children we're currently building.
196    let mut current_node_list_ptr = NonNull::new(&mut collection).unwrap();
197    // Stack of pointers to incomplete `Vec`s of children.
198    let mut incomplete_child_lists: Vec<NonNull<Vec<SgfNode<Prop>>>> = vec![];
199    //// Using pointers involves some unsafe calls, but should be ok here.
200    //// Since pointers are always initialized from real structs, and those structs
201    //// live for the whole function body, our only safety concern is dangling pointers.
202    ////
203    //// Since we build the tree traversing depth-first those structs shouldn't be
204    //// modified while the pointer is live. Heap-allocated contents of their
205    //// `children` may be modified, but that shouldn't change anything.
206
207    let mut tokens = tokens.iter().peekable();
208    while let Some(token) = tokens.next() {
209        match token {
210            Token::StartGameTree => {
211                // SGF game trees must have a root node.
212                if let Some(node_list_ptr) = incomplete_child_lists.last() {
213                    let node_list = unsafe { node_list_ptr.as_ref() };
214                    if node_list.is_empty() {
215                        if options.lenient {
216                            break;
217                        } else {
218                            return Err(SgfParseError::UnexpectedGameTreeStart);
219                        }
220                    }
221                }
222                incomplete_child_lists.push(current_node_list_ptr);
223            }
224            Token::EndGameTree => match incomplete_child_lists.pop() {
225                Some(node_list) => current_node_list_ptr = node_list,
226                None => {
227                    if options.lenient {
228                        break;
229                    } else {
230                        return Err(SgfParseError::UnexpectedGameTreeEnd);
231                    }
232                }
233            },
234            Token::StartNode => {
235                let mut new_node = SgfNode::default();
236                let mut prop_tokens = vec![];
237                while let Some(Token::Property(_)) = tokens.peek() {
238                    prop_tokens.push(tokens.next().unwrap());
239                }
240                for token in prop_tokens {
241                    match token {
242                        // TODO: Consider refactoring to consume tokens and clone of values.
243                        Token::Property((identifier, values)) => {
244                            let identifier = {
245                                if identifier.chars().all(|c| c.is_ascii_uppercase()) {
246                                    identifier.clone()
247                                } else if options.convert_mixed_case_identifiers {
248                                    identifier
249                                        .chars()
250                                        .filter(|c| c.is_ascii_uppercase())
251                                        .collect()
252                                } else if options.lenient {
253                                    break;
254                                } else {
255                                    return Err(SgfParseError::InvalidFF4Property);
256                                }
257                            };
258                            new_node
259                                .properties
260                                .push(Prop::new(identifier, values.clone()))
261                        }
262                        _ => unreachable!(),
263                    }
264                }
265                let node_list = unsafe { current_node_list_ptr.as_mut() };
266                node_list.push(new_node);
267                current_node_list_ptr =
268                    NonNull::new(&mut node_list.last_mut().unwrap().children).unwrap();
269            }
270            Token::Property(_) => {
271                if options.lenient {
272                    break;
273                } else {
274                    return Err(SgfParseError::UnexpectedProperty);
275                }
276            }
277        }
278    }
279
280    if !options.lenient && (!incomplete_child_lists.is_empty() || collection.len() != 1) {
281        return Err(SgfParseError::UnexpectedEndOfData);
282    }
283    let mut root_node = if options.lenient {
284        // A valid game tree must have at least a single (empty) node. So make one!
285        collection.into_iter().next().unwrap_or_default()
286    } else {
287        collection
288            .into_iter()
289            .next()
290            .ok_or(SgfParseError::UnexpectedEndOfData)?
291    };
292    root_node.is_root = true;
293    Ok(root_node.into())
294}
295
296// Figure out which game to parse from a slice of tokens.
297//
298// This function is necessary because we need to know the game before we can do the parsing.
299fn find_gametype(tokens: &[Token]) -> Result<GameType, SgfParseError> {
300    match find_gametree_root_prop_values("GM", tokens)? {
301        None => Ok(GameType::Go),
302        Some(values) => {
303            if values.len() != 1 {
304                return Ok(GameType::Unknown);
305            }
306            match values[0].as_str() {
307                "1" => Ok(GameType::Go),
308                _ => Ok(GameType::Unknown),
309            }
310        }
311    }
312}
313
314// Find the property values for a given identifier in the root node from the gametree's tokens.
315//
316// We use this to determine key root properties (like GM and FF) before parsing.
317// Returns an error if there's more than one match.
318fn find_gametree_root_prop_values<'a>(
319    prop_ident: &'a str,
320    tokens: &'a [Token],
321) -> Result<Option<&'a Vec<String>>, SgfParseError> {
322    // Find the matching property values in the first node.
323    // Skip the initial StartGameTree, StartNode tokens; we'll handle any errors later.
324    let matching_tokens: Vec<&Vec<String>> = tokens
325        .iter()
326        .skip(2)
327        .take_while(|&token| matches!(token, Token::Property(_)))
328        .filter_map(move |token| match token {
329            Token::Property((ident, values)) if ident == prop_ident => Some(values),
330            _ => None,
331        })
332        .collect();
333
334    match matching_tokens.len() {
335        0 => Ok(None),
336        1 => Ok(Some(matching_tokens[0])),
337        _ => Err(SgfParseError::UnexpectedProperty),
338    }
339}
340
341#[cfg(test)]
342mod test {
343    use super::*;
344    use crate::{go, serialize};
345
346    fn load_test_sgf() -> Result<String, Box<dyn std::error::Error>> {
347        // See https://www.red-bean.com/sgf/examples/
348        let mut sgf_path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
349        sgf_path.push("resources/test/ff4_ex.sgf");
350        let data = std::fs::read_to_string(sgf_path)?;
351
352        Ok(data)
353    }
354
355    fn get_go_nodes() -> Result<Vec<SgfNode<go::Prop>>, Box<dyn std::error::Error>> {
356        let data = load_test_sgf()?;
357
358        Ok(go::parse(&data)?)
359    }
360
361    fn node_depth(mut sgf_node: &SgfNode<go::Prop>) -> u64 {
362        let mut depth = 1;
363        while sgf_node.children().count() > 0 {
364            depth += 1;
365            sgf_node = sgf_node.children().next().unwrap();
366        }
367        depth
368    }
369
370    #[test]
371    fn sgf_has_two_gametrees() {
372        let sgf_nodes = get_go_nodes().unwrap();
373        assert_eq!(sgf_nodes.len(), 2);
374    }
375
376    #[test]
377    fn gametree_one_has_five_variations() {
378        let sgf_nodes = get_go_nodes().unwrap();
379        let sgf_node = &sgf_nodes[0];
380        assert_eq!(sgf_node.children().count(), 5);
381    }
382
383    #[test]
384    fn gametree_one_has_size_19() {
385        let sgf_nodes = get_go_nodes().unwrap();
386        let sgf_node = &sgf_nodes[0];
387        match sgf_node.get_property("SZ") {
388            Some(go::Prop::SZ(size)) => assert_eq!(size, &(19, 19)),
389            _ => unreachable!("Expected size property"),
390        }
391    }
392
393    #[test]
394    fn gametree_variation_depths() {
395        let sgf_nodes = get_go_nodes().unwrap();
396        let sgf_node = &sgf_nodes[0];
397        let children: Vec<_> = sgf_node.children().collect();
398        assert_eq!(node_depth(children[0]), 13);
399        assert_eq!(node_depth(children[1]), 4);
400        assert_eq!(node_depth(children[2]), 4);
401    }
402
403    #[test]
404    fn gametree_two_has_one_variation() {
405        let sgf_nodes = get_go_nodes().unwrap();
406        let sgf_node = &sgf_nodes[1];
407        assert_eq!(sgf_node.children().count(), 1);
408    }
409
410    #[test]
411    fn serialize_then_parse() {
412        let data = load_test_sgf().unwrap();
413        let gametrees = parse(&data).unwrap();
414        let text = serialize(&gametrees);
415        assert_eq!(gametrees, parse(&text).unwrap());
416    }
417
418    #[test]
419    fn invalid_property() {
420        let input = "(;GM[1]W[rp.pmonpoqprpsornqmpm])";
421        let sgf_nodes = go::parse(input).unwrap();
422        let expected = vec![
423            go::Prop::GM(1),
424            go::Prop::Invalid("W".to_string(), vec!["rp.pmonpoqprpsornqmpm".to_string()]),
425        ];
426
427        assert_eq!(sgf_nodes.len(), 1);
428        let sgf_node = &sgf_nodes[0];
429        assert_eq!(sgf_node.properties().cloned().collect::<Vec<_>>(), expected);
430    }
431
432    #[test]
433    fn unknown_game() {
434        let input = "(;GM[37]W[rp.pmonpoqprpsornqmpm])";
435        let gametrees = parse(input).unwrap();
436        assert_eq!(gametrees.len(), 1);
437        assert_eq!(gametrees[0].gametype(), GameType::Unknown);
438        let sgf_node = match &gametrees[0] {
439            GameTree::Unknown(node) => node,
440            _ => panic!("Unexpected game type"),
441        };
442        let expected = vec![
443            unknown_game::Prop::GM(37),
444            unknown_game::Prop::W("rp.pmonpoqprpsornqmpm".into()),
445        ];
446
447        assert_eq!(sgf_node.properties().cloned().collect::<Vec<_>>(), expected);
448    }
449
450    #[test]
451    fn mixed_games() {
452        let input = "(;GM[1];W[dd])(;GM[37]W[rp.pmonpoqprpsornqmpm])";
453        let gametrees = parse(input).unwrap();
454        assert_eq!(gametrees.len(), 2);
455        assert_eq!(gametrees[0].gametype(), GameType::Go);
456        assert_eq!(gametrees[1].gametype(), GameType::Unknown);
457    }
458
459    #[test]
460    fn stack_overflow() {
461        // This input generated a stack overflow with the old code
462        let input = "(;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;)";
463        let result = parse(&input);
464        assert!(result.is_ok());
465    }
466
467    #[test]
468    fn converts_up_ff3_property() {
469        let input = "(;GM[1]FF[3]CoPyright[test])";
470        let expected = vec![
471            go::Prop::GM(1),
472            go::Prop::FF(3),
473            go::Prop::CP("test".into()),
474        ];
475
476        let sgf_nodes = go::parse(input).unwrap();
477
478        assert_eq!(sgf_nodes.len(), 1);
479        let properties = sgf_nodes[0].properties().cloned().collect::<Vec<_>>();
480        assert_eq!(properties, expected);
481    }
482
483    #[test]
484    fn doesnt_convert_if_not_allowed() {
485        let input = "(;GM[1]FF[3]CoPyright[test])";
486        let parse_options = ParseOptions {
487            convert_mixed_case_identifiers: false,
488            ..ParseOptions::default()
489        };
490        let result = parse_with_options(input, &parse_options);
491        assert_eq!(result, Err(SgfParseError::InvalidFF4Property));
492    }
493
494    #[test]
495    fn compressed_list_for_unknown_game() {
496        let input = "(;GM[]MA[a:b])";
497        let gametree = parse(&input).unwrap().pop().unwrap();
498        let node = match gametree {
499            GameTree::Unknown(node) => node,
500            _ => panic!("Expected Unknown Game type"),
501        };
502        match node.get_property("MA") {
503            Some(unknown_game::Prop::MA(values)) => {
504                assert_eq!(values.len(), 1);
505                assert!(values.contains("a:b"));
506            }
507            _ => panic!("MA prop not found"),
508        }
509    }
510
511    #[test]
512    fn strips_whitespace() {
513        let input = "\n(;GM[1];B[cc])";
514        let sgf_nodes = go::parse(&input).unwrap();
515        assert_eq!(sgf_nodes.len(), 1);
516    }
517
518    #[test]
519    fn lenient_parsing_unclosed_parens_ok() {
520        let input = "\n(;GM[1];B[cc]";
521        let parse_options = ParseOptions {
522            lenient: true,
523            ..ParseOptions::default()
524        };
525        let game_trees = parse_with_options(input, &parse_options).unwrap();
526        assert_eq!(game_trees.len(), 1);
527    }
528
529    #[test]
530    fn lenient_parsing_ignores_trailing_garbage() {
531        let input = "\n(;GM[1];B[cc]))";
532        let parse_options = ParseOptions {
533            lenient: true,
534            ..ParseOptions::default()
535        };
536        let game_trees = parse_with_options(input, &parse_options).unwrap();
537        assert_eq!(game_trees.len(), 1);
538    }
539
540    #[test]
541    fn lenient_parsing_handles_unescaped_property_end() {
542        let input = "(;B[cc];W[dd];C[username [12k]: foo])";
543        let parse_options = ParseOptions {
544            lenient: true,
545            ..ParseOptions::default()
546        };
547        let game_trees = parse_with_options(input, &parse_options).unwrap();
548        assert_eq!(game_trees.len(), 1);
549        let sgf_node = game_trees[0].as_go_node().unwrap();
550        // Should parse up through "[12k]" successfully
551        assert_eq!(sgf_node.main_variation().count(), 3);
552    }
553
554    #[test]
555    fn lenient_parsing_handles_unclosed_property_value() {
556        let input = "(;B[cc];W[dd];B[ee";
557        let parse_options = ParseOptions {
558            lenient: true,
559            ..ParseOptions::default()
560        };
561        let game_trees = parse_with_options(input, &parse_options).unwrap();
562        assert_eq!(game_trees.len(), 1);
563        let sgf_node = game_trees[0].as_go_node().unwrap();
564        // Should find 3 nodes. The last unfinished node has no properties since "B[ee" is unclosed.
565        assert_eq!(sgf_node.main_variation().count(), 3);
566        assert_eq!(
567            sgf_node.main_variation().last().unwrap().properties.len(),
568            0
569        );
570    }
571
572    #[test]
573    fn lenient_parsing_handles_missing_property_value() {
574        let input = "(;B[cc];W[dd];B";
575        let parse_options = ParseOptions {
576            lenient: true,
577            ..ParseOptions::default()
578        };
579        let game_trees = parse_with_options(input, &parse_options).unwrap();
580        assert_eq!(game_trees.len(), 1);
581        let sgf_node = game_trees[0].as_go_node().unwrap();
582        // Should find 3 nodes. The last node has no properties since "B" is missing its value
583        assert_eq!(sgf_node.main_variation().count(), 3);
584        assert_eq!(
585            sgf_node.main_variation().last().unwrap().properties.len(),
586            0
587        );
588    }
589
590    #[test]
591    fn lenient_parsing_handles_missing_first_node_start() {
592        let input = "(B[cc])";
593        let parse_options = ParseOptions {
594            lenient: true,
595            ..ParseOptions::default()
596        };
597        let game_trees = parse_with_options(input, &parse_options).unwrap();
598        assert_eq!(game_trees.len(), 1);
599        let sgf_node = game_trees[0].as_go_node().unwrap();
600        // A single empty node.
601        assert_eq!(sgf_node.main_variation().count(), 1);
602        assert_eq!(
603            sgf_node.main_variation().last().unwrap().properties.len(),
604            0
605        );
606    }
607}