""" 提取 Word 文档中所有样式的完整 XML 定义(不遗漏任何属性)。 目标文件: 2026年Q2季度报告_E-EdHk4r1Lg.doc """ import json from pathlib import Path from docx import Document from docx.oxml.ns import qn from lxml import etree DOC_PATH = Path(__file__).parent / "default.docx" OUTPUT_PATH = Path(__file__).parent / "default.json" def emu_to_pt(emu) -> float | None: """EMU 转磅(1 pt = 12700 EMU)""" if emu is None: return None return round(int(emu) / 12700, 2) def extract_font(style) -> dict | None: """通过 API 提取字体信息(作为便捷摘要)""" try: f = style.font color_rgb = None try: if f.color and f.color.type: color_rgb = str(f.color.rgb) except Exception: pass return { "name": f.name, "size_pt": emu_to_pt(f.size), "bold": f.bold, "italic": f.italic, "underline": f.underline, "color_rgb": color_rgb, "strike": f.strike, "all_caps": f.all_caps, "small_caps": f.small_caps, } except Exception: return None def extract_paragraph_format(style) -> dict | None: """通过 API 提取段落格式(作为便捷摘要)""" try: pf = style.paragraph_format return { "alignment": str(pf.alignment) if pf.alignment else None, "left_indent_pt": emu_to_pt(pf.left_indent), "right_indent_pt": emu_to_pt(pf.right_indent), "first_line_indent_pt": emu_to_pt(pf.first_line_indent), "space_before_pt": emu_to_pt(pf.space_before), "space_after_pt": emu_to_pt(pf.space_after), "line_spacing": float(pf.line_spacing) if pf.line_spacing else None, "keep_together": pf.keep_together, "keep_with_next": pf.keep_with_next, "page_break_before": pf.page_break_before, } except Exception: return None def element_to_dict(elem: etree._Element) -> dict | list | str | None: """ 将 lxml Element 转换为字典,完整保留标签、属性、文本和子元素。 处理重复子元素(转为列表)。 使用 '{namespace}localname' 格式作为标签名。 """ # 标签名(完整 Clark 表示法) tag = elem.tag # 属性字典 attrib = dict(elem.attrib) # 子元素处理 children = list(elem) if children: # 子元素可能重复,使用字典存储列表 child_dict = {} for child in children: child_tag = child.tag child_val = element_to_dict(child) if child_tag in child_dict: # 相同标签名出现多次,转为列表 if not isinstance(child_dict[child_tag], list): child_dict[child_tag] = [child_dict[child_tag]] child_dict[child_tag].append(child_val) else: child_dict[child_tag] = child_val # 合并文本:如果存在文本(非空白),作为 '#text' 字段 text = elem.text.strip() if elem.text else None tail = elem.tail.strip() if elem.tail else None result = {"@tag": tag, "@attrib": attrib, "@children": child_dict} if text: result["#text"] = text if tail: result["#tail"] = tail return result else: # 叶子节点:直接返回文本或属性+文本 text = elem.text.strip() if elem.text else None tail = elem.tail.strip() if elem.tail else None if attrib or text or tail: result = {"@tag": tag, "@attrib": attrib} if text: result["#text"] = text if tail: result["#tail"] = tail return result else: # 完全空的元素,可简化为 None 但保持结构 return {"@tag": tag, "@attrib": attrib} def extract_style_full_xml(style) -> dict: """提取样式的完整 XML 定义(转换为字典)""" elem = style.element if elem is None: return None # 整个样式元素转换为字典 style_dict = element_to_dict(elem) return style_dict def extract_styles(doc_path: Path) -> list[dict]: doc = Document(str(doc_path)) styles_data = [] for style in doc.styles: info = { "name": style.name, "style_id": style.style_id, "type": str(style.type), "builtin": style.builtin, "hidden": style.hidden, "quick_style": style.quick_style, "priority": style.priority, "base_style": ( style.base_style.name if hasattr(style, "base_style") and style.base_style else None ), "next_paragraph_style": ( style.next_paragraph_style.name if hasattr(style, "next_paragraph_style") and style.next_paragraph_style else None ), "font_summary": extract_font(style), # 便捷摘要 "paragraph_format_summary": extract_paragraph_format(style), # 便捷摘要 "full_xml_definition": extract_style_full_xml(style) # 完整原始定义 } styles_data.append(info) return styles_data def main(): print(f"读取文件: {DOC_PATH}") if not DOC_PATH.exists(): raise FileNotFoundError(f"文件不存在: {DOC_PATH}") styles_data = extract_styles(DOC_PATH) result = { "source_file": DOC_PATH.name, "total_styles": len(styles_data), "styles": styles_data, } OUTPUT_PATH.write_text( json.dumps(result, ensure_ascii=False, indent=2), encoding="utf-8", ) print(f"共提取 {len(styles_data)} 个样式") print(f"完整 XML 定义已保存至: {OUTPUT_PATH}") # 打印摘要 by_type: dict[str, list[str]] = {} for s in styles_data: t = s["type"] by_type.setdefault(t, []).append(s["name"]) print("\n--- 样式类型分布 ---") for t, names in by_type.items(): print(f" {t}: {len(names)} 个") if __name__ == "__main__": main()