import re
import bs4
def parse_spec():
    """Parse the spec dump ('webapps.html') and return the sections of interest.

    Returns a dict mapping a section key to the bs4 element wrapping that
    section (currently only 'tokenization').
    """
    # open() instead of the Python-2-only file() builtin; works on Py2 and Py3.
    with open('webapps.html') as f:
        soup = bs4.BeautifulSoup(f)
    return {
        # NOTE(review): find() returns None if the 'Tokenization' heading text
        # ever changes, making find_parent raise AttributeError — confirm the
        # input document always contains it.
        'tokenization': soup.find(text='Tokenization').find_parent('div'),
    }
def tokenizer_state_ident(longname):
    """Turn a spec state heading such as 'Tag open state' into a CamelCase
    identifier such as 'TagOpen'."""
    lowered = longname.lower()
    # Every heading we handle must end with the word 'state'.
    assert lowered.endswith('state')
    stem = lowered[:-len('state')]
    # Any non-letter acts as a word separator.
    parts = re.sub(r'[^a-z]', ' ', stem).split()
    return ''.join(part.title() for part in parts)
def extract_tokenizer_states(spec):
    """Emit a Rust enum with one variant per tokenizer state.

    Walks every state definition (<h5><dfn>...</dfn></h5>) inside the
    tokenization section of *spec* and writes tokenizer/states.rs.
    """
    # open() instead of the Python-2-only file() builtin; works on Py2 and Py3.
    with open('tokenizer/states.rs', 'w') as f:
        f.write('pub enum State {\n')
        for statedefn in spec['tokenization'].select('h5 > dfn'):
            f.write('    %s,\n' % (tokenizer_state_ident(statedefn.text)))
        f.write('}\n')
def extract_tokenizer_graph(spec):
    """Write a Graphviz digraph of tokenizer state transitions to build/states.dot.

    For each state's <h5> heading in the tokenization section of *spec*,
    collects the section text up to the next <h5>, then adds one edge for
    every "switch to the X state" phrase found in that text.
    """
    # Compile once, outside the per-section loop.
    switch_re = re.compile(r'[sS]witch to the (.* state)')
    # open() instead of the Python-2-only file() builtin; works on Py2 and Py3.
    with open('build/states.dot', 'w') as f:
        f.write('strict digraph {\n')
        for sec in spec['tokenization'].select('h5'):
            name = sec.text
            # This heading is prose about character references, not a state
            # definition; skip it.
            if name == 'Tokenizing character references':
                continue
            ident = tokenizer_state_ident(name)
            # Gather the section body: every sibling up to the next <h5>.
            chunks = []
            for sib in sec.next_siblings:
                if isinstance(sib, bs4.Tag):
                    if sib.name == 'h5':
                        break
                    chunks.append(sib.get_text())
                else:
                    # NavigableString subclasses str, so it joins directly.
                    chunks.append(sib)
            # Single join instead of quadratic += accumulation.
            txt = ''.join(chunks)
            for edge in switch_re.finditer(txt):
                f.write('    %s -> %s;\n' % (ident, tokenizer_state_ident(edge.group(1))))
        f.write('}\n')
# Script entry point: parse the spec dump once, then emit the transition graph.
# NOTE(review): extract_tokenizer_states is defined above but never invoked
# here — presumably called from another chunk of this file or stale; confirm.
spec = parse_spec()
extract_tokenizer_graph(spec)