44 lines
1.1 KiB
Python
44 lines
1.1 KiB
Python
|
||
import re
|
||
|
||
# Non-printing characters: code points 0-31 and 127-159, i.e. the C0 control
# codes, DEL and the C1 control codes. Tab, line feed and carriage return lie
# inside the first range, so they are matched too.
NON_PRINTING_CHARS_RE = re.compile(
    "[" + "".join(chr(code) for code in (*range(32), *range(127, 160))) + "]"
)
|
||
|
||
class DocTokenizer:
    """Text cleaner for raw document text.

    The class holds no per-instance state; every method takes a string and
    returns a new string.
    """

    # Three or more consecutive newlines, i.e. more than one blank line.
    _EXCESS_NEWLINES_RE = re.compile(r"\n{3,}")

    def doc_process(self, text: str) -> str:
        """Clean every line of ``text`` and collapse surplus blank lines.

        The text is split on the newline character. Each line has its
        non-printing characters removed and is then stripped of leading and
        trailing whitespace. Runs of blank lines are collapsed only after
        that, so lines which become empty through cleaning (whitespace-only
        lines, the carriage return of CRLF line endings) are collapsed too.

        Args:
            text: The raw document text.

        Returns:
            The cleaned lines joined with single newline characters, with at
            most one blank line between two non-empty lines.
        """
        # Remove control characters before stripping: deleting one can expose
        # whitespace at the edge of a line that strip() still has to see.
        cleaned_lines = [
            self.remove_non_printing_char_line(line).strip()
            for line in text.split("\n")
        ]
        # Collapse last, once whitespace-only lines have become truly empty.
        return self.remove_excess_lines("\n".join(cleaned_lines))

    def remove_excess_lines(self, text: str) -> str:
        """Collapse every run of more than two newlines into exactly two.

        Text extracted from HTML tends to contain long runs of newlines;
        this keeps at most one blank line between paragraphs.

        Args:
            text: The text to normalise.

        Returns:
            ``text`` with each run of three or more newline characters
            replaced by two. Shorter runs are left unchanged.
        """
        return self._EXCESS_NEWLINES_RE.sub("\n\n", text)

    def remove_non_printing_char_line(self, text: str) -> str:
        """Remove the non-printing characters from a single line.

        Args:
            text: One line of text.

        Returns:
            ``text`` without any character matched by the module-level
            ``NON_PRINTING_CHARS_RE`` pattern.
        """
        return NON_PRINTING_CHARS_RE.sub("", text)
|