# styles.py
"""
Extract the complete XML definition of every style in a Word document
(without omitting any attribute).

Target file: 2026年Q2季度报告_E-EdHk4r1Lg.doc

NOTE(review): the script reads the path held in DOC_PATH ("default.docx"),
not the file named above -- confirm which input is intended.
"""
import json
from pathlib import Path

from docx import Document
from docx.oxml.ns import qn
from lxml import etree
  10. DOC_PATH = Path(__file__).parent / "default.docx"
  11. OUTPUT_PATH = Path(__file__).parent / "default.json"
  12. def emu_to_pt(emu) -> float | None:
  13. """EMU 转磅(1 pt = 12700 EMU)"""
  14. if emu is None:
  15. return None
  16. return round(int(emu) / 12700, 2)
  17. def extract_font(style) -> dict | None:
  18. """通过 API 提取字体信息(作为便捷摘要)"""
  19. try:
  20. f = style.font
  21. color_rgb = None
  22. try:
  23. if f.color and f.color.type:
  24. color_rgb = str(f.color.rgb)
  25. except Exception:
  26. pass
  27. return {
  28. "name": f.name,
  29. "size_pt": emu_to_pt(f.size),
  30. "bold": f.bold,
  31. "italic": f.italic,
  32. "underline": f.underline,
  33. "color_rgb": color_rgb,
  34. "strike": f.strike,
  35. "all_caps": f.all_caps,
  36. "small_caps": f.small_caps,
  37. }
  38. except Exception:
  39. return None
  40. def extract_paragraph_format(style) -> dict | None:
  41. """通过 API 提取段落格式(作为便捷摘要)"""
  42. try:
  43. pf = style.paragraph_format
  44. return {
  45. "alignment": str(pf.alignment) if pf.alignment else None,
  46. "left_indent_pt": emu_to_pt(pf.left_indent),
  47. "right_indent_pt": emu_to_pt(pf.right_indent),
  48. "first_line_indent_pt": emu_to_pt(pf.first_line_indent),
  49. "space_before_pt": emu_to_pt(pf.space_before),
  50. "space_after_pt": emu_to_pt(pf.space_after),
  51. "line_spacing": float(pf.line_spacing) if pf.line_spacing else None,
  52. "keep_together": pf.keep_together,
  53. "keep_with_next": pf.keep_with_next,
  54. "page_break_before": pf.page_break_before,
  55. }
  56. except Exception:
  57. return None
  58. def element_to_dict(elem: etree._Element) -> dict | list | str | None:
  59. """
  60. 将 lxml Element 转换为字典,完整保留标签、属性、文本和子元素。
  61. 处理重复子元素(转为列表)。
  62. 使用 '{namespace}localname' 格式作为标签名。
  63. """
  64. # 标签名(完整 Clark 表示法)
  65. tag = elem.tag
  66. # 属性字典
  67. attrib = dict(elem.attrib)
  68. # 子元素处理
  69. children = list(elem)
  70. if children:
  71. # 子元素可能重复,使用字典存储列表
  72. child_dict = {}
  73. for child in children:
  74. child_tag = child.tag
  75. child_val = element_to_dict(child)
  76. if child_tag in child_dict:
  77. # 相同标签名出现多次,转为列表
  78. if not isinstance(child_dict[child_tag], list):
  79. child_dict[child_tag] = [child_dict[child_tag]]
  80. child_dict[child_tag].append(child_val)
  81. else:
  82. child_dict[child_tag] = child_val
  83. # 合并文本:如果存在文本(非空白),作为 '#text' 字段
  84. text = elem.text.strip() if elem.text else None
  85. tail = elem.tail.strip() if elem.tail else None
  86. result = {"@tag": tag, "@attrib": attrib, "@children": child_dict}
  87. if text:
  88. result["#text"] = text
  89. if tail:
  90. result["#tail"] = tail
  91. return result
  92. else:
  93. # 叶子节点:直接返回文本或属性+文本
  94. text = elem.text.strip() if elem.text else None
  95. tail = elem.tail.strip() if elem.tail else None
  96. if attrib or text or tail:
  97. result = {"@tag": tag, "@attrib": attrib}
  98. if text:
  99. result["#text"] = text
  100. if tail:
  101. result["#tail"] = tail
  102. return result
  103. else:
  104. # 完全空的元素,可简化为 None 但保持结构
  105. return {"@tag": tag, "@attrib": attrib}
  106. def extract_style_full_xml(style) -> dict:
  107. """提取样式的完整 XML 定义(转换为字典)"""
  108. elem = style.element
  109. if elem is None:
  110. return None
  111. # 整个样式元素转换为字典
  112. style_dict = element_to_dict(elem)
  113. return style_dict
  114. def extract_styles(doc_path: Path) -> list[dict]:
  115. doc = Document(str(doc_path))
  116. styles_data = []
  117. for style in doc.styles:
  118. info = {
  119. "name": style.name,
  120. "style_id": style.style_id,
  121. "type": str(style.type),
  122. "builtin": style.builtin,
  123. "hidden": style.hidden,
  124. "quick_style": style.quick_style,
  125. "priority": style.priority,
  126. "base_style": (
  127. style.base_style.name
  128. if hasattr(style, "base_style") and style.base_style
  129. else None
  130. ),
  131. "next_paragraph_style": (
  132. style.next_paragraph_style.name
  133. if hasattr(style, "next_paragraph_style") and style.next_paragraph_style
  134. else None
  135. ),
  136. "font_summary": extract_font(style), # 便捷摘要
  137. "paragraph_format_summary": extract_paragraph_format(style), # 便捷摘要
  138. "full_xml_definition": extract_style_full_xml(style) # 完整原始定义
  139. }
  140. styles_data.append(info)
  141. return styles_data
  142. def main():
  143. print(f"读取文件: {DOC_PATH}")
  144. if not DOC_PATH.exists():
  145. raise FileNotFoundError(f"文件不存在: {DOC_PATH}")
  146. styles_data = extract_styles(DOC_PATH)
  147. result = {
  148. "source_file": DOC_PATH.name,
  149. "total_styles": len(styles_data),
  150. "styles": styles_data,
  151. }
  152. OUTPUT_PATH.write_text(
  153. json.dumps(result, ensure_ascii=False, indent=2),
  154. encoding="utf-8",
  155. )
  156. print(f"共提取 {len(styles_data)} 个样式")
  157. print(f"完整 XML 定义已保存至: {OUTPUT_PATH}")
  158. # 打印摘要
  159. by_type: dict[str, list[str]] = {}
  160. for s in styles_data:
  161. t = s["type"]
  162. by_type.setdefault(t, []).append(s["name"])
  163. print("\n--- 样式类型分布 ---")
  164. for t, names in by_type.items():
  165. print(f" {t}: {len(names)} 个")
  166. if __name__ == "__main__":
  167. main()