codefuse-chatbot/web_crawler/utils/DocTokenizer.py

44 lines
1.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
# 非打印字符
NON_PRINTING_CHARS_RE = re.compile(
f"[{''.join(map(chr, list(range(0, 32)) + list(range(127, 160))))}]"
)
class DocTokenizer():
'''
文档text处理器。
'''
def __init__(self):
pass
def doc_process(self, text):
'''
去除多余换行、去掉每行非打印字符和开头结尾空格
'''
# 去除多余换行
text = self.remove_excess_lines(text)
# 将文本拆分成行
lines = text.split("\n")
# 去掉每一行的开头和结尾的空格
lines = [self.remove_non_printing_char_line(
line.strip()) for line in lines]
# 将行重新组合成文本
text_new = "\n".join(lines)
return text_new
def remove_excess_lines(self, text):
'''
将2个以上的换行符替换为2个html解析text时会产生大量换行\n
'''
pattern = r'\n\n+'
return re.sub(pattern, '\n\n', text)
def remove_non_printing_char_line(self, text):
'''
去除每一行的非打印字符
'''
return NON_PRINTING_CHARS_RE.sub("", text)