本文共 1113 字,大约阅读时间需要 3 分钟。
import requests
import re
from lxml import etree

# Load the saved page and parse it into an lxml element tree.
with open('real_case.html', 'r', encoding='utf-8') as f:
    tree = etree.HTML(f.read())

# Every <tr> of the table inside the first "table-box" div.
table_element = tree.xpath("//div[@class='table-box'][1]/table/tbody/tr")

# Regular expression used to strip anything of the form <...> (markup tags).
pattern1_attrib = re.compile(r"<.*?>")

for row in table_element:
    try:
        first_cell = row.xpath('td')[0]
        # tostring() is called with its default arguments here; after decoding,
        # the non-ASCII text comes out garbled (this is the problem the
        # example demonstrates).
        markup = etree.tostring(first_cell).decode('utf-8')
        print(pattern1_attrib.sub('', markup))
    except Exception as error:
        # Best-effort: any row that cannot be processed is skipped silently.
        pass
运行结果(出现乱码:中文被输出为 &#NNNN; 形式的字符引用):
import html  # standard-library HTML helpers; provides unescape()
import re
import sys

import requests
from lxml import etree

# Read the saved page and parse it into an lxml element tree.
with open('real_case.html', 'r', encoding='utf-8') as f:
    c = f.read()
tree = etree.HTML(c)

# Every <tr> of the table inside the first "table-box" div.
table_element = tree.xpath("//div[@class='table-box'][1]/table/tbody/tr")

# Regular expression used to strip anything of the form <...> (markup tags).
pattern1_attrib = re.compile(r"<.*?>")

for row in table_element:
    cells = row.xpath('td')
    if not cells:
        # Rows without any <td> have no first cell to print; skip them
        # explicitly instead of relying on a swallowed IndexError.
        continue
    td1 = cells[0]
    try:
        # Serialize the cell to markup, then drop the tags with the regex.
        s1 = etree.tostring(td1).decode('utf-8')
        s1 = pattern1_attrib.sub('', s1)
        # unescape() converts HTML character references (named and numeric)
        # to the corresponding Unicode characters, following the HTML5 rules.
        # It runs after tag stripping so that escaped "<" / ">" in the text
        # are not mistaken for markup.
        s1 = html.unescape(s1)
        print(s1)
    except Exception as error:
        # Stay best-effort (one bad row must not stop the loop), but report
        # the failure instead of hiding it.
        print('skipped row:', repr(error), file=sys.stderr)
使用 html.unescape() 之后的运行结果(中文正常显示):
转载地址:http://ltar.baihongyu.com/