| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195 |
- """
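- Extract the complete XML definition of every style in a Word document (no attribute is omitted).
- Target file: 2026年Q2季度报告_E-EdHk4r1Lg.doc
- NOTE(review): DOC_PATH in this script points at "default.docx" next to the script, not the file named above - confirm which one is intended.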
- """
- import json
- from pathlib import Path
- from docx import Document
- from docx.oxml.ns import qn
- from lxml import etree
# The input document and the JSON report both sit in the same directory as
# this script; only the file name differs.
DOC_PATH = Path(__file__).with_name("default.docx")
OUTPUT_PATH = Path(__file__).with_name("default.json")
- def emu_to_pt(emu) -> float | None:
- """EMU 转磅(1 pt = 12700 EMU)"""
- if emu is None:
- return None
- return round(int(emu) / 12700, 2)
- def extract_font(style) -> dict | None:
- """通过 API 提取字体信息(作为便捷摘要)"""
- try:
- f = style.font
- color_rgb = None
- try:
- if f.color and f.color.type:
- color_rgb = str(f.color.rgb)
- except Exception:
- pass
- return {
- "name": f.name,
- "size_pt": emu_to_pt(f.size),
- "bold": f.bold,
- "italic": f.italic,
- "underline": f.underline,
- "color_rgb": color_rgb,
- "strike": f.strike,
- "all_caps": f.all_caps,
- "small_caps": f.small_caps,
- }
- except Exception:
- return None
- def extract_paragraph_format(style) -> dict | None:
- """通过 API 提取段落格式(作为便捷摘要)"""
- try:
- pf = style.paragraph_format
- return {
- "alignment": str(pf.alignment) if pf.alignment else None,
- "left_indent_pt": emu_to_pt(pf.left_indent),
- "right_indent_pt": emu_to_pt(pf.right_indent),
- "first_line_indent_pt": emu_to_pt(pf.first_line_indent),
- "space_before_pt": emu_to_pt(pf.space_before),
- "space_after_pt": emu_to_pt(pf.space_after),
- "line_spacing": float(pf.line_spacing) if pf.line_spacing else None,
- "keep_together": pf.keep_together,
- "keep_with_next": pf.keep_with_next,
- "page_break_before": pf.page_break_before,
- }
- except Exception:
- return None
def element_to_dict(elem: "etree._Element") -> dict:
    """Recursively convert an XML element into a JSON-serialisable dict.

    The result always contains ``"@tag"`` (the element's tag exactly as the
    parser reports it) and ``"@attrib"`` (a plain-dict copy of its
    attributes). Optional keys are added only when they carry content:

    * ``"@children"``: child elements grouped by tag, in order of first
      appearance. A tag that occurs once maps to a single dict; a tag that
      occurs several times maps to a list of dicts in document order.
      Relative order between differently-tagged siblings is not recorded.
    * ``"#text"`` / ``"#tail"``: the element's text / tail with surrounding
      whitespace stripped, omitted when empty.

    Child nodes whose ``tag`` is not a string (comments and processing
    instructions) are skipped, because their callable tag cannot be used as
    a JSON key or value.

    Args:
        elem: The element to convert.

    Returns:
        The dict representation described above.
    """
    node = {"@tag": elem.tag, "@attrib": dict(elem.attrib)}

    # Collect every element child under its tag; singletons are unwrapped
    # afterwards so a tag seen once maps to a dict rather than a list.
    grouped = {}
    for child in elem:
        if not isinstance(child.tag, str):
            # Comment / processing-instruction node: not an element.
            continue
        grouped.setdefault(child.tag, []).append(element_to_dict(child))
    if grouped:
        node["@children"] = {
            tag: items[0] if len(items) == 1 else items
            for tag, items in grouped.items()
        }

    # Whitespace-only text and tail are treated as absent.
    for key, raw in (("#text", elem.text), ("#tail", elem.tail)):
        stripped = raw.strip() if raw else None
        if stripped:
            node[key] = stripped
    return node
- def extract_style_full_xml(style) -> dict:
- """提取样式的完整 XML 定义(转换为字典)"""
- elem = style.element
- if elem is None:
- return None
- # 整个样式元素转换为字典
- style_dict = element_to_dict(elem)
- return style_dict
def extract_styles(doc_path: Path) -> list[dict]:
    """Open the document at *doc_path* and describe each of its styles.

    Every entry combines identifying metadata read from the style object,
    the names of its base / next-paragraph styles (``None`` when the style
    has no such attribute or it is unset), the two convenience summaries,
    and the full XML definition.
    """

    def linked_name(style, attr):
        # Name of the style referenced by *attr*, or None when the attribute
        # is missing or falsy.
        target = getattr(style, attr, None)
        return target.name if target else None

    return [
        {
            "name": style.name,
            "style_id": style.style_id,
            "type": str(style.type),
            "builtin": style.builtin,
            "hidden": style.hidden,
            "quick_style": style.quick_style,
            "priority": style.priority,
            "base_style": linked_name(style, "base_style"),
            "next_paragraph_style": linked_name(style, "next_paragraph_style"),
            # Convenience summaries.
            "font_summary": extract_font(style),
            "paragraph_format_summary": extract_paragraph_format(style),
            # Complete raw definition.
            "full_xml_definition": extract_style_full_xml(style),
        }
        for style in Document(str(doc_path)).styles
    ]
def main():
    """Extract all styles from DOC_PATH, write them to OUTPUT_PATH as JSON,
    and print a per-type count of the styles found.

    Raises:
        FileNotFoundError: If DOC_PATH does not exist.
    """
    print(f"读取文件: {DOC_PATH}")
    if not DOC_PATH.exists():
        raise FileNotFoundError(f"文件不存在: {DOC_PATH}")

    styles_data = extract_styles(DOC_PATH)
    total = len(styles_data)

    payload = {
        "source_file": DOC_PATH.name,
        "total_styles": total,
        "styles": styles_data,
    }
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    print(f"共提取 {total} 个样式")
    print(f"完整 XML 定义已保存至: {OUTPUT_PATH}")

    # Print a summary: style names grouped by style type.
    names_by_type: dict[str, list[str]] = {}
    for entry in styles_data:
        style_type = entry["type"]
        if style_type not in names_by_type:
            names_by_type[style_type] = []
        names_by_type[style_type].append(entry["name"])

    print("\n--- 样式类型分布 ---")
    for style_type in names_by_type:
        print(f" {style_type}: {len(names_by_type[style_type])} 个")


if __name__ == "__main__":
    main()
|