chensiyu
/
ax-backend


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
							"""
Extract the complete XML definition of every style in a Word document (no attribute is omitted).
Target file: default.docx in this script's directory (see DOC_PATH).
"""

import json
from pathlib import Path
from docx import Document
from docx.oxml.ns import qn
from lxml import etree

# Input document and output JSON both sit in the same directory as this script.
DOC_PATH = Path(__file__).with_name("default.docx")
OUTPUT_PATH = Path(__file__).with_name("default.json")


def emu_to_pt(emu) -> float | None:
    """EMU 转磅（1 pt = 12700 EMU）"""
    if emu is None:
        return None
    return round(int(emu) / 12700, 2)


def extract_font(style) -> dict | None:
    """通过 API 提取字体信息（作为便捷摘要）"""
    try:
        f = style.font
        color_rgb = None
        try:
            if f.color and f.color.type:
                color_rgb = str(f.color.rgb)
        except Exception:
            pass
        return {
            "name": f.name,
            "size_pt": emu_to_pt(f.size),
            "bold": f.bold,
            "italic": f.italic,
            "underline": f.underline,
            "color_rgb": color_rgb,
            "strike": f.strike,
            "all_caps": f.all_caps,
            "small_caps": f.small_caps,
        }
    except Exception:
        return None


def extract_paragraph_format(style) -> dict | None:
    """通过 API 提取段落格式（作为便捷摘要）"""
    try:
        pf = style.paragraph_format
        return {
            "alignment": str(pf.alignment) if pf.alignment else None,
            "left_indent_pt": emu_to_pt(pf.left_indent),
            "right_indent_pt": emu_to_pt(pf.right_indent),
            "first_line_indent_pt": emu_to_pt(pf.first_line_indent),
            "space_before_pt": emu_to_pt(pf.space_before),
            "space_after_pt": emu_to_pt(pf.space_after),
            "line_spacing": float(pf.line_spacing) if pf.line_spacing else None,
            "keep_together": pf.keep_together,
            "keep_with_next": pf.keep_with_next,
            "page_break_before": pf.page_break_before,
        }
    except Exception:
        return None


def element_to_dict(elem: "etree._Element") -> dict:
    """Recursively convert an XML element into a JSON-friendly dict.

    The returned dict always contains:

    * ``"@tag"``: the element's tag exactly as the parser reports it
      (Clark notation ``{namespace}localname`` for namespaced elements).
    * ``"@attrib"``: a plain-dict copy of the element's attributes.

    and, only when present:

    * ``"@children"``: child elements keyed by tag. A tag that occurs once
      maps to that child's dict; a tag that occurs several times maps to a
      list of dicts in document order. Because children are grouped by tag,
      the interleaving of differently-tagged siblings is not recorded.
    * ``"#text"`` / ``"#tail"``: the element's text / tail with surrounding
      whitespace stripped; omitted when empty or whitespace-only.

    Comment and processing-instruction nodes are skipped: their ``tag`` is
    not a string, so using it as a dict key would make the result impossible
    to serialise with ``json.dumps``.

    Args:
        elem: Element to convert. The annotation is kept as a string so the
            private lxml class is not looked up when the function is defined.

    Returns:
        The dict described above.
    """
    result = {"@tag": elem.tag, "@attrib": dict(elem.attrib)}

    grouped: dict = {}
    for child in elem:
        child_tag = child.tag
        if not isinstance(child_tag, str):
            # Comment / processing instruction, not a real element.
            continue
        converted = element_to_dict(child)
        if child_tag not in grouped:
            grouped[child_tag] = converted
            continue
        existing = grouped[child_tag]
        if isinstance(existing, list):
            existing.append(converted)
        else:
            # Second occurrence of this tag: promote the single dict to a list.
            grouped[child_tag] = [existing, converted]
    if grouped:
        result["@children"] = grouped

    text = elem.text.strip() if elem.text else None
    if text:
        result["#text"] = text
    tail = elem.tail.strip() if elem.tail else None
    if tail:
        result["#tail"] = tail
    return result


def extract_style_full_xml(style) -> dict:
    """提取样式的完整 XML 定义（转换为字典）"""
    elem = style.element
    if elem is None:
        return None
    # 整个样式元素转换为字典
    style_dict = element_to_dict(elem)
    return style_dict


def extract_styles(doc_path: Path) -> list[dict]:
    """Collect one description dict per style found in the document.

    Args:
        doc_path: Path of the document to open with ``Document``.

    Returns:
        A list with one dict per entry of ``doc.styles``. Each dict holds
        the style's basic attributes (``name``, ``style_id``, ``type`` as a
        string, ``builtin``, ``hidden``, ``quick_style``, ``priority``), the
        names of its base style and next-paragraph style (``None`` when the
        style has no such attribute or it is unset), the two convenience
        summaries, and the full XML definition as a dict.
    """

    def related_name(style, attr):
        # Name of a linked style, or None when the attribute is missing or unset.
        related = getattr(style, attr, None)
        return related.name if related else None

    document = Document(str(doc_path))
    return [
        {
            "name": style.name,
            "style_id": style.style_id,
            "type": str(style.type),
            "builtin": style.builtin,
            "hidden": style.hidden,
            "quick_style": style.quick_style,
            "priority": style.priority,
            "base_style": related_name(style, "base_style"),
            "next_paragraph_style": related_name(style, "next_paragraph_style"),
            # Convenience summaries read through the API.
            "font_summary": extract_font(style),
            "paragraph_format_summary": extract_paragraph_format(style),
            # Complete raw definition taken from the XML element.
            "full_xml_definition": extract_style_full_xml(style),
        }
        for style in document.styles
    ]


def main():
    """Extract all styles from DOC_PATH, write them to OUTPUT_PATH, print a summary.

    The JSON written to OUTPUT_PATH contains the source file name, the number
    of styles and the list of style descriptions. Afterwards the number of
    styles per style type is printed.

    Raises:
        FileNotFoundError: If DOC_PATH does not exist.
    """
    print(f"读取文件: {DOC_PATH}")
    if not DOC_PATH.exists():
        raise FileNotFoundError(f"文件不存在: {DOC_PATH}")

    styles_data = extract_styles(DOC_PATH)
    payload = {
        "source_file": DOC_PATH.name,
        "total_styles": len(styles_data),
        "styles": styles_data,
    }
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    print(f"共提取 {len(styles_data)} 个样式")
    print(f"完整 XML 定义已保存至: {OUTPUT_PATH}")

    # Group style names by type so the per-type counts can be reported.
    names_by_type: dict[str, list[str]] = {}
    for entry in styles_data:
        names_by_type.setdefault(entry["type"], []).append(entry["name"])

    print("\n--- 样式类型分布 ---")
    for t, names in names_by_type.items():
        print(f"  {t}: {len(names)} 个")


# Run the extraction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()