|
本帖最后由 艾幻翔 于 2017-8-25 13:57 编辑
献丑~
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- # Created by lightwave on 2017/8/23
- import re
# Tag names that survive the clean-up.  Tags with any other name are removed
# (only the markup itself; the text between the tags is left untouched).
ignore_name = ('p', 'span', 'a')

# Attribute names that are kept on surviving opening tags; every other
# attribute is stripped.
ignore_param = ('align', 'href')

# Matches one HTML tag.
#   group 1 - name of an opening tag (this alternative also catches
#             self-closing tags, because it is tried first)
#   group 2 - name of a closing tag, optional whitespace allowed before '>'
#   group 3 - name of a self-closing tag; kept only so that callbacks that
#             read group 3 keep working, alternative 1 always wins
# Names may contain ':' and '-' after the first character so that namespaced
# tags such as <o:p> ... </o:p> are matched on both ends.
reg_node = r"<(\w[\w:-]*)[\s\S]*?>|</(\w[\w:-]*)\s*>|<(\w[\w:-]*)(\s\S*)*?/>"

# Matches one attribute inside a tag, including its leading whitespace.
#   group 1 - attribute name (may contain '-', ':' and '.')
#   group 2 - the optional '=value' part
#   group 3 - the quote character when the value is quoted
# The value may be double-quoted, single-quoted or unquoted, and whitespace
# around '=' is accepted, so the whole attribute is always consumed.
reg_param = r'''\s+([\w:.-]+)(\s*=\s*(?:(["'])[\s\S]*?\3|[^\s"'>]*))?'''
def sub_node(match):
    """Replacement callback for ``re.sub(reg_node, sub_node, html)``.

    Decides what a single matched tag is replaced with:

    * an opening tag whose name is listed in ``ignore_name`` is kept, with
      its attributes filtered through ``sub_param``;
    * a closing or self-closing tag whose name is listed in ``ignore_name``
      is kept unchanged;
    * any other tag is replaced by the empty string.

    Tag names are compared case-insensitively, so ``<P>`` is treated the
    same as ``<p>``.

    :param match: match object produced by ``reg_node``; group 1 holds an
        opening tag name, group 2 a closing tag name and group 3 a
        self-closing tag name (at most one of them is set).
    :return: the text that replaces the matched tag.
    """
    tag = match.group()

    opening = match.group(1)
    if opening and opening.lower() in ignore_name:
        # re.sub returns the tag untouched when it carries no attributes,
        # so no separate search is needed beforehand.
        return re.sub(reg_param, sub_param, tag)

    # Closing and self-closing tags are kept verbatim.
    other = match.group(2) or match.group(3)
    if other and other.lower() in ignore_name:
        return tag

    return ''
def sub_param(match):
    """Replacement callback for ``re.sub(reg_param, sub_param, tag)``.

    Keeps an attribute when its name is listed in ``ignore_param`` and
    removes it otherwise.  Names are compared case-insensitively, so
    ``ALIGN="center"`` is kept just like ``align="center"``.

    :param match: match object produced by ``reg_param``; group 1 is the
        attribute name and the full match includes the leading whitespace.
    :return: the matched attribute text unchanged, or ``''`` to drop it.
    """
    attr_name = match.group(1)
    return match.group() if attr_name.lower() in ignore_param else ''
if __name__ == '__main__':
    # Demo: run the clean-up over a sample HTML fragment and print the result.
    data = r"""<p align="center" class="MsoNormal" style="text-align: center; margin: 0cm 28.25pt 0pt 0cm; line-height: 150%; mso-para-margin-right: 2.69gd"><b><span style="font-size: 22pt; font-family: 宋体; line-height: 150%; mso-ascii-theme-font: major-fareast; mso-fareast-theme-font: major-fareast; mso-hansi-theme-font: major-fareast">学习Python,求教高手</span></b></p>"""
    node_pattern = re.compile(reg_node)
    data = node_pattern.sub(sub_node, data)
    print(data)
复制代码
|
|