import re
import _markupbase
from html import unescape
from html.entities import html5 as html5_entities
# Public API of this module.
__all__ = ['HTMLParser']

# Regular expressions used for parsing.

# Default "interesting" scanner: the next '&' or '<' in ordinary text.
interesting_normal = re.compile('[&<]')
# '&' followed by a letter or '#': the start of a reference that may still
# be incomplete at the end of the buffer.
incomplete = re.compile('&[a-zA-Z#]')

# A named reference plus ONE trailing non-alphanumeric character.  The
# terminator is part of the match, so callers step back one character
# unless that terminator is ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# A decimal or hexadecimal numeric reference plus ONE trailing character
# that is not a hex digit (same "terminator included" convention).
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
# The shortest prefix that is recognisably a numeric reference.
incomplete_charref = re.compile('&#(?:[0-9]|[xX][0-9a-fA-F])')
# References inside attribute values: numeric or alphanumeric named,
# optionally followed by ';' or '='.  The trailing '=' is captured so the
# replacement callback can refuse to decode e.g. "&amp=" in a query string.
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')

# '<' + letter: opening of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
# '</' + letter: opening of an end tag.
endtagopen = re.compile('</[a-zA-Z]')
# End of a processing instruction.
piclose = re.compile('>')
# Normal comment terminator: '-->' or '--!>'.
commentclose = re.compile(r'--!?>')
# Abruptly closed comment: '>' or '->' directly after the '<!--' opener
# (it is applied with .match() at the position right after '<!--').
commentabruptclose = re.compile(r'-?>')
# NOTE: tagfind_tolerant, attrfind_tolerant and locatetagend describe the
# same tag grammar and must be kept in sync with one another.
#
# Tag name (group 1), followed by any run of whitespace and of slashes that
# are not immediately followed by '>'.
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
# One attribute: group 1 is the name, group 2 the whole "= value" part
# (absent for valueless attributes), group 3 the value itself including any
# surrounding quotes.  Trailing whitespace/slashes are consumed as well.
attrfind_tolerant = re.compile(r"""
(
(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* )
([\t\n\r\f ]*=[\t\n\r\f ]* ('[^']*' |"[^"]*" |(?!['"])[^>\t\n\r\f ]* )
)?
(?:[\t\n\r\f ]|/(?!>))* """, re.VERBOSE)
# A whole tag starting at the tag name: name, any number of attributes in
# the attrfind_tolerant shape, then an optional closing '>'.  Callers check
# whether the last matched character is '>' to decide if the tag is complete.
locatetagend = re.compile(r"""
[a-zA-Z][^\t\n\r\f />]* [\t\n\r\f /]* (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* (?:[\t\n\r\f ]*=[\t\n\r\f ]* (?:'[^']*' |"[^"]*" |(?!['"])[^>\t\n\r\f ]* )
)?
[\t\n\r\f /]* )*
>?
""", re.VERBOSE)
# NOTE(review): the three patterns below are not referenced by any code
# visible in this file; presumably they are kept for backward compatibility
# with external users -- confirm before removing.
locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][^\t\n\r\f />\x00]* (?:[\s/]* (?:(?<=['"\s/])[^\s/>][^\s/=>]* (?:\s*=+\s* (?:'[^']*' |"[^"]*" |(?!['"])[^>\s]* )
\s* )?(?:\s|/(?!>))*
)*
)?
\s* """, re.VERBOSE)
endendtag = re.compile('>')
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
def _replace_attr_charref(match):
    """Substitution callback for character references in attribute values.

    *match* is a regex match whose full text is one candidate reference.
    The decoded character(s) are returned when the reference is numeric
    (starts with ``&#``), or when it is a named reference that is not
    followed by ``=`` and whose text after the ``&`` is a known HTML5
    entity key.  Anything else is returned unchanged.
    """
    text = match.group(0)
    numeric = text.startswith('&#')
    # A trailing '=' means the text looks like a query-string parameter
    # ("&name=value") and must be left alone.
    exact_named = not text.endswith('=') and text[1:] in html5_entities
    if numeric or exact_named:
        return unescape(text)
    return text
def _unescape_attrvalue(s):
    """Return attribute value *s* with its character references resolved.

    Each match of the module-level ``attr_charref`` pattern is handed to
    ``_replace_attr_charref``, which decides whether it is decoded or kept.
    """
    return re.sub(attr_charref, _replace_attr_charref, s)
class HTMLParser(_markupbase.ParserBase):
    """Incremental, tolerant HTML tokenizer that reports markup via callbacks.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Text is buffered by feed() and scanned by goahead().  Start tags are
    reported through handle_starttag() / handle_startendtag(), end tags
    through handle_endtag(), text through handle_data(), comments through
    handle_comment(), declarations through handle_decl() / unknown_decl()
    and processing instructions through handle_pi().  When
    *convert_charrefs* is true, character references in text are decoded
    before handle_data() is called; otherwise they are reported through
    handle_entityref() / handle_charref().  All handle_* methods are no-ops
    here and are meant to be overridden in subclasses.
    """

    # Elements whose content is raw text: no markup and no character
    # references are recognised until the matching end tag.
    CDATA_CONTENT_ELEMENTS = ("script", "style", "xmp", "iframe", "noembed", "noframes")
    # Elements whose content is text in which character references are
    # still honoured ("escapable").
    RCDATA_CONTENT_ELEMENTS = ("textarea", "title")

    # Characters that terminate a character reference in plain text.
    # Compiled once here instead of on every pass through the goahead() loop.
    __charref_terminator = re.compile(r'[\t\n\r\f ;]')

    def __init__(self, *, convert_charrefs=True, scripting=False):
        """Initialize and reset this instance.

        If *convert_charrefs* is true (the default), character references
        in text are decoded automatically before being passed to
        handle_data().  If *scripting* is true, the content of <noscript>
        is treated as raw text, like the CDATA content elements.
        """
        super().__init__()
        self.convert_charrefs = convert_charrefs
        self.scripting = scripting
        self.reset()

    def reset(self):
        """Reset this instance, discarding all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        self._support_cdata = True
        self._escapable = True
        super().reset()

    def feed(self, data):
        """Append *data* to the buffer and process as much as possible.

        Input that may be an incomplete construct stays buffered until
        more data arrives or close() is called.
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Process everything still buffered as if followed by end-of-file."""
        self.goahead(1)

    # Source text of the most recently parsed start tag (name-mangled).
    __starttag_text = None

    def get_starttag_text(self):
        """Return the full source of the most recent start tag, or None."""
        return self.__starttag_text

    def set_cdata_mode(self, elem, *, escapable=False):
        """Switch to raw-text scanning for the content of element *elem*.

        The ``interesting`` pattern is replaced so that only the matching
        end tag (and, for escapable content when charrefs are reported
        individually, '&') interrupts the text.  For ``plaintext`` nothing
        ever ends the text.
        """
        self.cdata_elem = elem.lower()
        self._escapable = escapable
        if self.cdata_elem == 'plaintext':
            # Match only at the very end of the buffer.  '\Z' is used
            # rather than '\z' because '\z' is rejected by the re module
            # before Python 3.14; both mean "end of string".
            self.interesting = re.compile(r'\Z')
        elif escapable and not self.convert_charrefs:
            self.interesting = re.compile(r'&|</%s(?=[\t\n\r\f />])' % self.cdata_elem,
                                          re.IGNORECASE|re.ASCII)
        else:
            self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
                                          re.IGNORECASE|re.ASCII)

    def clear_cdata_mode(self):
        """Return to normal scanning, where '&' and '<' are interesting."""
        self.interesting = interesting_normal
        self.cdata_elem = None
        self._escapable = True

    def _set_support_cdata(self, flag=True):
        """Enable or disable recognition of ``<![CDATA[ ... ]]>`` sections.

        When *flag* is false such input falls through to the other
        declaration branches instead of being passed to unknown_decl().
        """
        self._support_cdata = flag

    def goahead(self, end):
        """Scan the buffer and dispatch handler calls.

        *end* is true when called from close(): incomplete constructs are
        then flushed instead of being kept for a later feed().  Whatever
        could not be consumed stays in ``self.rawdata``.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            # Step 1: find j, the end of the text run starting at i.
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # No more markup in the buffer.  If an '&' near the
                    # end is not yet followed by a terminator, a reference
                    # may be cut in half, so wait for more text.  (34 is
                    # presumably the longest reference length -- confirm.)
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                            not self.__charref_terminator.search(rawdata, amppos)):
                        break
                    j = n
            else:
                match = self.interesting.search(rawdata, i)
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        # Inside raw text: wait for the closing tag.
                        break
                    j = n
            # Step 2: emit the text run, if any.
            if i < j:
                if self.convert_charrefs and self._escapable:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            # Step 3: dispatch on the construct starting at i.
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n or end:
                    # A '<' that cannot start markup is plain text.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the end of the buffer: need more input.
                    break
                if k < 0:
                    # The construct is incomplete.
                    if not end:
                        break
                    # At end of input: report what is there, by kind.
                    if starttagopen.match(rawdata, i):
                        # Unfinished start tag: dropped.
                        pass
                    elif startswith("</", i):
                        if i + 2 == n:
                            self.handle_data("</")
                        elif endtagopen.match(rawdata, i):
                            # Unfinished end tag: dropped.
                            pass
                        else:
                            # Bogus comment.
                            self.handle_comment(rawdata[i+2:])
                    elif startswith("<!--", i):
                        # Strip a partial comment terminator, if present.
                        j = n
                        for suffix in ("--!", "--", "-"):
                            if rawdata.endswith(suffix, i+4):
                                j -= len(suffix)
                                break
                        self.handle_comment(rawdata[i+4:j])
                    elif startswith("<![CDATA[", i) and self._support_cdata:
                        self.unknown_decl(rawdata[i+3:])
                    elif rawdata[i:i+9].lower() == '<!doctype':
                        self.handle_decl(rawdata[i+2:])
                    elif startswith("<!", i):
                        # Bogus comment.
                        self.handle_comment(rawdata[i+2:])
                    elif startswith("<?", i):
                        self.handle_pi(rawdata[i+2:])
                    else:
                        raise AssertionError("we should not get here!")
                    k = n
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # Drop the "&#" prefix and the one-character terminator.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # The terminator was not ';': leave it in the input.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete_charref.match(rawdata, i)
                if match:
                    if end:
                        self.handle_charref(rawdata[i+2:])
                        i = self.updatepos(i, n)
                        break
                    # Incomplete reference: wait for more input.
                    break
                elif i + 3 < n:
                    # More than "&#x" is buffered, so this cannot become a
                    # reference: emit "&#" as text.
                    self.handle_data("&#")
                    i = self.updatepos(i, i + 2)
                else:
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # The terminator was not ';': leave it in the input.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    if end:
                        self.handle_entityref(rawdata[i+1:])
                        i = self.updatepos(i, n)
                        break
                    # Incomplete reference: wait for more input.
                    break
                elif i + 1 < n:
                    # Cannot be the start of a reference: emit '&' as text.
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    # Bare '&' at the end of the buffer.
                    break
            else:
                # Raised explicitly (not via ``assert``) so the check also
                # runs under -O instead of leaving the loop spinning.
                raise AssertionError("interesting.search() lied")
        # Flush whatever is left when the input is finished.
        if end and i < n:
            if self.convert_charrefs and self._escapable:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    def parse_html_declaration(self, i):
        """Parse the ``<!...`` construct at index *i*.

        Dispatches to comment, CDATA section, doctype or bogus-comment
        handling.  Returns the index just past the construct, or -1 if it
        is not yet complete.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            return self.parse_comment(i)
        elif rawdata[i:i+9] == '<![CDATA[' and self._support_cdata:
            j = rawdata.find(']]>', i+9)
            if j < 0:
                return -1
            self.unknown_decl(rawdata[i+3: j])
            return j + 3
        elif rawdata[i:i+9].lower() == '<!doctype':
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    def parse_comment(self, i, report=True):
        """Parse the ``<!--`` comment at index *i*.

        Returns the index just past the comment terminator, or -1 if no
        terminator is buffered yet.  handle_comment() is called with the
        comment text unless *report* is false.
        """
        rawdata = self.rawdata
        assert rawdata.startswith('<!--', i), 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            # No '-->' / '--!>': accept an abrupt '>' or '->' directly
            # after the opener.
            match = commentabruptclose.match(rawdata, i+4)
            if not match:
                return -1
        if report:
            j = match.start()
            self.handle_comment(rawdata[i+4: j])
        return match.end()

    def parse_bogus_comment(self, i, report=True):
        """Parse a ``<!...>`` or ``</...>`` construct as a comment.

        Everything up to the next '>' is the comment text.  Returns the
        index just past that '>', or -1 if there is none yet.
        handle_comment() is called unless *report* is false.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    def parse_pi(self, i):
        """Parse the ``<?`` processing instruction at index *i*.

        Returns the index just past the closing '>', or -1 if it is not
        buffered yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    def parse_starttag(self, i):
        """Parse the start tag at index *i* and call the matching handler.

        Returns the index just past the tag, or a negative value if the
        tag is not yet complete.  Attribute names are lower-cased, quotes
        around values are removed and character references in values are
        decoded; an attribute without a value gets None.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without "= value".
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = _unescape_attrvalue(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk before the closing bracket: report the tag as text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if (tag in self.CDATA_CONTENT_ELEMENTS or
                    (self.scripting and tag == "noscript") or
                    tag == "plaintext"):
                self.set_cdata_mode(tag, escapable=False)
            elif tag in self.RCDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag, escapable=True)
        return endpos

    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at *i*, or -1.

        -1 means the closing '>' of the tag has not been buffered yet.
        """
        rawdata = self.rawdata
        match = locatetagend.match(rawdata, i+1)
        assert match
        j = match.end()
        if rawdata[j-1] != ">":
            return -1
        return j

    def parse_endtag(self, i):
        """Parse the ``</`` construct at index *i*.

        Returns the index just past it, or -1 if it is incomplete.  A
        proper end tag triggers handle_endtag() and leaves raw-text mode;
        ``</>`` is skipped silently; anything else starting with ``</`` is
        treated as a bogus comment.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        if rawdata.find('>', i+2) < 0:
            # Fast check: no '>' at all, so the tag cannot be complete.
            return -1
        if not endtagopen.match(rawdata, i):
            if rawdata[i+2:i+3] == '>':
                # "</>" is ignored.
                return i+3
            else:
                return self.parse_bogus_comment(i)

        match = locatetagend.match(rawdata, i+2)
        assert match
        j = match.end()
        if rawdata[j-1] != ">":
            return -1

        match = tagfind_tolerant.match(rawdata, i+2)
        assert match
        tag = match.group(1).lower()
        self.handle_endtag(tag)
        self.clear_cdata_mode()
        return j

    def handle_startendtag(self, tag, attrs):
        """Handle an XHTML-style empty tag such as ``<br/>``.

        By default this calls handle_starttag() and then handle_endtag().
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        """Handle a start tag; override in a subclass (default: no-op).

        *tag* is the lower-cased name, *attrs* a list of (name, value) pairs.
        """
        pass

    def handle_endtag(self, tag):
        """Handle an end tag; override in a subclass (default: no-op)."""
        pass

    def handle_charref(self, name):
        """Handle a numeric character reference; override in a subclass.

        *name* is the text after ``&#``.  The default does nothing.
        """
        pass

    def handle_entityref(self, name):
        """Handle a named reference; override in a subclass.

        *name* is the text after ``&``.  The default does nothing.
        """
        pass

    def handle_data(self, data):
        """Handle text content; override in a subclass (default: no-op)."""
        pass

    def handle_comment(self, data):
        """Handle comment text; override in a subclass (default: no-op)."""
        pass

    def handle_decl(self, decl):
        """Handle a doctype declaration; override in a subclass.

        *decl* is the text between ``<!`` and ``>``.  The default does nothing.
        """
        pass

    def handle_pi(self, data):
        """Handle a processing instruction; override in a subclass.

        *data* is the text between ``<?`` and ``>``.  The default does nothing.
        """
        pass

    def unknown_decl(self, data):
        """Handle the content of a CDATA section; override in a subclass.

        *data* starts with ``CDATA[``.  The default does nothing.
        """
        pass