# codefuse-chatbot/web_crawler/utils/Html2Text.py
#
# Repository web-view metadata captured with this file:
#   153 lines, 7.0 KiB, Python
#   last change: 2023-09-28 10:58:58 +08:00
import time
from bs4 import BeautifulSoup
import logging
import json
import os
from tqdm import tqdm
import re
from .DocTokenizer import DocTokenizer
# Configure the root logger when this module is imported so that the
# INFO-level progress messages logged below are emitted. basicConfig() is a
# no-op if the root logger already has handlers configured.
logging.basicConfig(level=logging.INFO)
class Html2Text():
    '''Extract plain text from HTML pages stored in a JSONL file.

    Each input record is a JSON object with the keys ``text`` (the raw HTML),
    ``url`` and ``host_url``. The extracted text is written back out as JSONL.
    '''

    def __init__(self):
        pass

    @staticmethod
    def _check_tag_args(target_content_tag, target_tag_list):
        '''Validate the tag selectors and replace ``None`` with empty containers.

        Explicit ``raise AssertionError`` is used instead of ``assert`` so the
        checks are not stripped under ``python -O`` while callers still see
        the same exception type and messages.

        :param target_content_tag: dict with at most one entry, or None
        :param target_tag_list: iterable of dicts with at most one entry each, or None
        :return: tuple ``(target_content_tag, target_tag_list)``
        :raises AssertionError: if a selector is not a dict or has more than one entry
        '''
        if target_content_tag is None:
            target_content_tag = {}
        if target_tag_list is None:
            target_tag_list = []
        if not isinstance(target_content_tag, dict):
            raise AssertionError("target_content_tag请输入字典格式")
        if len(target_content_tag) > 1:
            raise AssertionError("target_content_tag属性字典只能指定唯一元素")
        for tag_spec in target_tag_list:
            if not isinstance(tag_spec, dict):
                raise AssertionError("target_tag_list列表元素需要字典格式")
            if len(tag_spec) > 1:
                raise AssertionError("target_tag_list列表中的属性字典只能指定唯一元素")
        return target_content_tag, target_tag_list

    @staticmethod
    def _ensure_parent_dir(file_path):
        '''Create the parent directory of ``file_path`` if the path has one.'''
        parent = os.path.dirname(file_path)
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create the directory when there is one.
        if parent:
            os.makedirs(parent, exist_ok=True)

    def html2text(self,
                  target_content_tag=None,
                  target_tag_list=None,
                  html_dir=None,
                  text_dir=None,
                  mode="w",
                  is_get_all_text=False
                  ):
        '''
        Read HTML records from a JSONL file, extract their text and save it as JSONL.

        :param target_content_tag: selector of the tag holding the main content; a dict
            with at most one entry. The key is the selector type (``name``, or an
            attribute such as ``class``/``id``) and the value is what to match
            (e.g. ``div``/``title``/``article``). Empty or None skips ``content``.
        :param target_tag_list: list of extra selectors, each in the same format as
            ``target_content_tag``; each one is saved as its own field.
        :param html_dir: path of the input JSONL file; one JSON object per line with
            the fields ``text``/``url``/``host_url``.
        :param text_dir: path of the output JSONL file.
        :param mode: file mode used to open ``text_dir`` (e.g. ``"w"`` or ``"a"``).
        :param is_get_all_text: if True, the text of the whole page is also saved
            under ``all_text``.
        :return: None
        :raises AssertionError: if a selector has the wrong format
        '''
        target_content_tag, target_tag_list = self._check_tag_args(
            target_content_tag, target_tag_list)
        # Make sure the output directory exists before doing any work.
        self._ensure_parent_dir(text_dir)
        # Load every input record.
        logging.info("读取文件中……")
        html_dict_list = self.read_html_jsonl(html_dir)
        logging.info("%s个html网址", len(html_dict_list))
        # Extract the requested text from each HTML record.
        text_dict_list = [
            self.get_text_dict(
                html_dict=html_dict,
                target_content_tag=target_content_tag,
                target_tag_list=target_tag_list,
                is_get_all_text=is_get_all_text
            )
            for html_dict in tqdm(html_dict_list, mininterval=1)
        ]
        logging.info("保存html提取的text内容……")
        self.save_text_jsonl(json_list=text_dict_list,
                             file_path=text_dir,
                             mode=mode)
        logging.info("保存成功!地址:%s", text_dir)

    def get_text_dict(self,
                      html_dict=None,
                      target_content_tag=None,
                      target_tag_list=None,
                      is_get_all_text=True
                      ):
        '''
        Extract text from one HTML record using BeautifulSoup.

        :param html_dict: dict with the keys ``text`` (raw HTML), ``url`` and ``host_url``
        :param target_content_tag: selector of the main-content tag, e.g. ``{"name": "div"}``;
            a dict with at most one entry. Empty or None skips the ``content`` field.
        :param target_tag_list: list of extra selectors in the same format; the text of
            each is stored under a key equal to the selector's value.
        :param is_get_all_text: if True, the text of the whole page is stored under ``all_text``
        :return: dict with ``url``, ``host_url``, ``title`` (None when the page has no
            ``<title>``) and the optional fields described above
        :raises AssertionError: if a selector has the wrong format
        :raises KeyError: if ``html_dict`` lacks ``text``, ``url`` or ``host_url``
        '''
        target_content_tag, target_tag_list = self._check_tag_args(
            target_content_tag, target_tag_list)
        if html_dict is None:
            html_dict = {}
        html_content = html_dict['text']
        url = html_dict['url']
        host_url = html_dict['host_url']

        soup = BeautifulSoup(html_content, 'html.parser')
        # Wrap the text of every <code> element in a ``` fence so code snippets
        # stay delimited in the extracted text.
        for code_tag in soup.find_all('code'):
            code_tag.string = '\n```code\n' + code_tag.get_text() + '\n```\n'

        doc_tokenizer = DocTokenizer()
        text_dict = {}
        text_dict['url'] = url
        text_dict['host_url'] = host_url
        # soup.title is None when the page has no <title> element.
        title_tag = soup.title
        text_dict['title'] = title_tag.text if title_tag is not None else None
        # Optionally keep the text of the whole page, regardless of tags.
        if is_get_all_text:
            all_text = soup.get_text(separator="", strip=False)
            text_dict['all_text'] = doc_tokenizer.doc_process(all_text)
        # Main content, selected by tag name or by attribute.
        if target_content_tag:
            text_dict["content"] = self.soup_find_all_text(
                soup=soup, doc_tokenizer=doc_tokenizer, attrs=target_content_tag)
        # Each extra selector becomes its own field, keyed by the selector value.
        for target_tag in target_tag_list:
            if target_tag:
                field_name = next(iter(target_tag.values()))
                text_dict[field_name] = self.soup_find_all_text(
                    soup, doc_tokenizer, attrs=target_tag)
        return text_dict

    def soup_find_all_text(self, soup, doc_tokenizer, attrs):
        '''
        Collect the text of every tag matching ``attrs``.

        :param soup: parsed document providing ``find_all``
        :param doc_tokenizer: object whose ``doc_process`` is applied to each tag's text
        :param attrs: dict with exactly one entry; ``{"name": x}`` matches by tag name,
            any other key is passed to ``find_all`` as an attribute filter
        :return: the processed, stripped text of each matching tag, each followed by
            a blank line (``"\\n\\n"``); an empty string when nothing matches
        :raises AssertionError: if ``attrs`` is not a dict with exactly one entry
        '''
        if not isinstance(attrs, dict):
            raise AssertionError("attrs请输入字典格式")
        if len(attrs) != 1:
            raise AssertionError("attrs属性字典只能指定唯一元素")
        if "name" in attrs:
            matched_tags = soup.find_all(name=attrs["name"])
        else:
            matched_tags = soup.find_all(attrs=attrs)
        return "".join(
            doc_tokenizer.doc_process(
                tag.get_text(separator="", strip=False)).strip() + "\n\n"
            for tag in matched_tags
        )

    def read_html_jsonl(self, file_name=None):
        '''
        Read a JSONL file into a list, one parsed JSON value per non-blank line.

        :param file_name: path of the UTF-8 encoded JSONL file
        :return: list of the parsed records, in file order
        '''
        html_dict_list = []
        with open(file_name, "r", encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                html_dict_list.append(json.loads(line))
        return html_dict_list

    def save_text_jsonl(self, json_list=None, file_path=None, mode="w"):
        '''
        Write ``json_list`` to ``file_path`` in JSONL format, one record per line.

        :param json_list: iterable of JSON-serialisable records; None writes nothing
        :param file_path: path of the output file
        :param mode: mode used to open the file (e.g. ``"w"`` or ``"a"``)
        '''
        with open(file_path, mode, encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            f.writelines(
                json.dumps(record, ensure_ascii=False) + "\n"
                for record in (json_list or [])
            )