import re

from qwen_agent.utils.utils import has_chinese_chars


def rm_newlines(text: str) -> str:
    """Join soft-wrapped lines in *text* and trim surrounding whitespace.

    If *text* ends with a hyphen immediately followed by a newline, those
    two trailing characters are dropped and the stripped remainder is
    returned right away; interior newlines are left untouched on this path.

    Otherwise, every newline whose preceding character is not ``.``, ``。``,
    ``:`` or a digit is replaced by a joiner: the empty string when
    ``has_chinese_chars(text)`` is truthy, a single space when it is not.
    The result is stripped before being returned.
    """
    if text.endswith('-\n'):
        # Cut the trailing "-\n" pair and return without further rewriting.
        return text[:-2].strip()

    joiner = '' if has_chinese_chars(text) else ' '
    # The lookbehind needs a preceding character, so a newline at index 0
    # never matches.
    # NOTE(review): ':' appears twice in the character class; the second one
    # may have been meant as a full-width colon — confirm against upstream.
    joined = re.sub(r'(?<=[^\.。::\d])\n', joiner, text)
    return joined.strip()


def rm_cid(text: str) -> str:
    """Return *text* with every ``(cid:<digits>)`` marker deleted."""
    return re.sub(r'\(cid:\d+\)', '', text)


def rm_hexadecimal(text: str) -> str:
    """Return *text* with every run of 21 or more hex digits deleted.

    A hex digit here is any of ``0-9``, ``A-F`` or ``a-f``.
    """
    return re.sub(r'[0-9A-Fa-f]{21,}', '', text)


def rm_continuous_placeholders(text: str) -> str:
    """Collapse long filler runs and excess blank lines in *text*.

    Two passes are applied in order:

    1. Any run of seven or more characters drawn from ``.``, ``-``, space,
       ``—``, ``。``, ``_`` and ``*`` is replaced by a single tab.
    2. Any run of three or more newlines is reduced to exactly two.
    """
    without_fillers = re.sub(r'[.\- —。_*]{7,}', '\t', text)
    return re.sub(r'\n{3,}', '\n\n', without_fillers)