Как правильно парсить HTML с управляющими символами, недопустимыми в XML?
#!/usr/bin/python
from html5lib import HTMLParser, treebuilders
from lxml import etree
# Minimal reproduction: parse an HTML fragment with html5lib's HTMLParser,
# using the tree builder obtained for "lxml".
p = HTMLParser(
tree = treebuilders.getTreeBuilder("lxml"),
namespaceHTMLElements = False
)
# Sample input. NOTE(review): the character(s) inside <td> look garbled by the
# paste/extraction -- confirm the exact code point that triggers the error.
rawdata = u"<td> &8230;</td>"
# This is the statement the traceback below reports as failing with
# "ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes".
t = p.parse(rawdata).getroot()
print t
# Commented-out variant that feeds similar markup to lxml.html.html5parser
# (presumably another attempt at the same problem -- confirm with the author):
# from StringIO import StringIO
# from lxml.html import html5parser
# html5parser.parse(StringIO(u"<td>	</td>"))
ссылка1 — как парсить /dev/random
ссылка2 — лишь попытка решения
ссылка3 — ссылка на старый issue в googlegroups
$ ./test2.py
Traceback (most recent call last):
File "./test2.py", line 10, in <module>
t = p.parse(rawdata).getroot()
File "/usr/lib/pymodules/python2.6/html5lib/html5parser.py", line 211, in parse
parseMeta=parseMeta, useChardet=useChardet)
File "/usr/lib/pymodules/python2.6/html5lib/html5parser.py", line 111, in _parse
self.mainLoop()
File "/usr/lib/pymodules/python2.6/html5lib/html5parser.py", line 174, in mainLoop
self.phase.processCharacters(token)
File "/usr/lib/pymodules/python2.6/html5lib/html5parser.py", line 948, in processCharacters
self.tree.insertText(token["data"])
File "/usr/lib/pymodules/python2.6/html5lib/treebuilders/_base.py", line 288, in insertText
parent.insertText(data)
File "/usr/lib/pymodules/python2.6/html5lib/treebuilders/etree_lxml.py", line 225, in insertText
builder.Element.insertText(self, data, insertBefore)
File "/usr/lib/pymodules/python2.6/html5lib/treebuilders/etree.py", line 114, in insertText
self._element.text += data
File "lxml.etree.pyx", line 821, in lxml.etree._Element.text.__set__ (src/lxml/lxml.etree.c:33308)
File "apihelpers.pxi", line 646, in lxml.etree._setNodeText (src/lxml/lxml.etree.c:15287)
File "apihelpers.pxi", line 1295, in lxml.etree._utf8 (src/lxml/lxml.etree.c:20212)
ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes