使用python+win32com.client操作word

博主： huoyu
发布时间：2025 年 03 月 12 日
74 次浏览
暂无评论
15879字数
分类：默认分类

> 目的：主要就是将word里面的信息拆分出来

## 单线程去操作是没有问题的

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2025/3/11 16:17
# @Author  : huoyu
# @email   : 2319899766@qq.com
# @File    : tiqiu.py
# @Project : world_pdf
# @Software: PyCharm

######用于提取题目，最重要
import win32com.client #pywin32
import fitz #pymupdf
from PIL import Image #pillow
import time
import os
import time
import re
# NOTE(review): `word_app` is not referenced again in the visible code; the
# EnsureDispatch call presumably only builds the pywin32 type-library cache
# before the plain Dispatch below -- confirm before removing.
word_app = win32com.client.gencache.EnsureDispatch('Word.Application')
# Regex tested against paragraph text to detect a question number: leading
# digits followed by ".", "、" or "．".
pattern = r'^\d+[\.、．]'

# Keywords of section headings / exam instructions; a text span containing any
# of them is not saved as a question.
title_list = ["单项选择题", "多项选择题", "实验题" , "计算题","非选择题","答题前","回答选择题时",'考试结束','选考题','准考证','答题卡'] # Prevents section titles from being saved as questions

# Start Word; this single instance is shared by every call in this script.
word = win32com.client.Dispatch("Word.Application")
word.Visible = True  # Word window is shown while running (the original comment claimed it was hidden)
##os.system("taskkill /f /im winword.exe")  # second cleanup pass (disabled)
# Running count of processed files; printed and incremented by trans1().
n=1
def extract_questions(input_file, output_folder="output"):
    """Split a Word document into one ``question_<i>.docx`` file per question.

    A question starts at every paragraph whose text (prefixed with its
    auto-numbering string, if any) matches the module-level ``pattern`` and
    runs up to the start of the next such paragraph, or to the end of the
    document. Spans containing any keyword from ``title_list`` are skipped.
    Uses the module-level ``word`` application object.

    Args:
        input_file: Path of the Word document to split.
        output_folder: Directory that receives the per-question files;
            created if missing.

    Returns:
        ``''`` when the document cannot be opened, otherwise ``None``.
    """
    try:
        doc = word.Documents.Open(os.path.abspath(input_file))
    except Exception as e:
        print(input_file+'打开失败', e)
        return ''

    try:
        # Reset every paragraph's left indent to 0 before extracting.
        for para in doc.Paragraphs:
            para.Format.LeftIndent = 0

        os.makedirs(output_folder, exist_ok=True)
        output_folder = os.path.abspath(output_folder)

        # Character offsets at which a numbered question begins.
        question_begin_end = []
        for para in doc.Paragraphs:
            text = para.Range.Text.strip()
            list_format = para.Range.ListFormat
            if list_format.ListType > 0:
                # Auto-numbered paragraphs keep the number outside the text,
                # so prepend it before matching.
                text = list_format.ListString.strip() + text
            if re.search(pattern, text):
                # '\r\x07' is presumably Word's table-cell end marker, i.e.
                # numbered text inside a table -- TODO confirm.
                if '\r\x07' in text:
                    continue
                question_begin_end.append(para.Range.Start)
        # The end of the document closes the last question.
        question_begin_end.append(doc.Content.End)

        # Keep only the spans that are not section headings / instructions.
        questions = []
        for begin, end in zip(question_begin_end, question_begin_end[1:]):
            text = doc.Range(begin, end).Text
            if not any(item in text for item in title_list):
                questions.append((begin, end))

        for i, (start, end) in enumerate(questions, start=1):
            question_doc = None
            try:
                question_doc = word.Documents.Add()

                # Copy the span through the clipboard into the new document.
                doc.Range(start, end).Copy()
                time.sleep(0.5)  # give Word time to finish the copy
                # NOTE(review): the plain Paste() looks redundant because the
                # next call replaces the whole range again -- kept as-is.
                question_doc.Range().Paste()
                question_doc.Range().PasteAndFormat(16)  # 16 = keep source formatting

                question_filename = os.path.join(output_folder, f"question_{i}.docx")
                question_doc.SaveAs(question_filename)
            except Exception as e:
                # One failing question must not abort the remaining ones.
                print(input_file, e)
            finally:
                if question_doc is not None:
                    try:
                        # False: discard unsaved changes instead of prompting.
                        question_doc.Close(False)
                    except Exception as e:
                        print(input_file, e)
    finally:
        # Always release the source document, even when scanning fails.
        doc.Close(False)

#source_folder = "E:\物理\参考\其他科目\高中\结果\数学"  # 源文件夹路径
#target_folder = "E:\物理\参考\其他科目\合并\高中\数学"  # 目标文件夹路径

def trans1(source_folder, target_folder):
    """Run ``extract_questions`` once for every entry in ``source_folder``.

    Each entry gets its own output directory under ``target_folder``, named
    after the part of the file name before the first dot. The module-level
    counter ``n`` is printed and incremented after each file.

    Args:
        source_folder: Directory containing the Word documents.
        target_folder: Directory under which per-document output folders are
            created.
    """
    global n
    # The original wrapped this in a second loop over the same listing without
    # using its loop variable, so every file was processed once per entry.
    for file_name in os.listdir(source_folder):
        if file_name == '广东地方123123':
            continue
        file_path = os.path.join(source_folder, file_name)
        output_folder = os.path.join(target_folder, file_name.split('.')[0])
        # Also creates target_folder itself when it does not exist yet.
        os.makedirs(output_folder, exist_ok=True)
        extract_questions(file_path, output_folder)
        print(file_path)
        print(n)
        n = n + 1

# Driver: every sub-folder of .\shijuan is processed into the sub-folder of
# the same name under .\shijuan_over.
source1=r".\shijuan"
target1=r'.\shijuan_over'
try:
    l1=os.listdir(source1)
    for item in l1:
        source_folder=os.path.join(source1,item)
        target_folder=os.path.join(target1,item)
        try:
            trans1(source_folder, target_folder)
        except Exception as e:
            # Report the failing folder and carry on with the next one.
            # `Exception` (not a bare except) keeps Ctrl+C working.
            print(source_folder, e)
finally:
    # Always shut Word down, even when the loop aborts.
    word.Quit()
```

## 多线程后就会有各种问题

我后面都以为无法进行多线程操作了，快要放弃了！！！，最后一不小心居然可以了，当然，也找到了原因所在。

首先，操作 word 用的应该是 wps 的一个组件，每次打开 wps 都只会有一个进程，那么想用多线程通过`win32com.client.Dispatch("Word.Application")`去操作同一个进程，测试结果就是：不行！既然多线程不行，那我用多进程呢？于是我在想，创建的时候应该还有另一个方法，果不其然，发现了`win32com.client.DispatchEx("Word.Application")`，里面的`DispatchEx`就是每次新建一个独立的 wps 处理进程，所以得出的结论就是使用多进程去操作。

按照这个方法我写完了代码。但是测试时遇到了问题：`pywintypes.com_error: (-2147221008, '尚未调用 CoInitialize。', None, None)`或`失败 (-2147023174, 'RPC 服务器不可用。', None, None)`或`失败 (-2147220995, '对象没有连接到服务器', None, None)`。像这种离谱的问题，我找了一下午的原因。

## 主要原因

`pythoncom.CoInitialize()`这个的使用位置尤为关键。

网友1：只需要创建一个`word = win32com.client.DispatchEx("Word.Application")`，后面操作doc就用多线程去操作就行了，然后`pythoncom.CoInitialize()`添加到线程里面就可以了

网友2:每个进程单独创建`word = win32com.client.DispatchEx("Word.Application")`，在创建完成后加入`pythoncom.CoInitialize()`代码，最后的位置要加入`pythoncom.CoUninitialize()`。

还有网友3：....

看了很多文章，最后我都要放弃了，后面是 chatgpt 回答的：问题主要出现在 **Word 进程管理** 和 **多进程并发操作 COM 对象** 上，导致 **RPC 服务器不可用**（`(-2147023174, 'RPC 服务器不可用。', None, None)`）和 **对象没有连接到服务器**（`(-2147220995, '对象没有连接到服务器', None, None)`）。

`COM`这个问题我在易语言里面也用过，就是控制线程代码的进和出保持稳定。

于是乎，我将这两行代码，一直更换位置，最后发现能跑了，但是还是有问题，比如我开3个或者5个进程，跑着跑着，哦豁，又只剩一个进程了，那其他的进程在干嘛呢，卡住了，而且启动的进程也没有关闭成功，虽然代码加了try，但是重新创建进程，就会发现，即使创建成功了，到了调用的时候也不行，估计是因为有一个进程一直在使用，而com这个处理方式没对，所以就出现了不稳定的情况。

最后在我要放弃、准备关闭文章的时候，看到[文章](https://blog.csdn.net/qq_15557299/article/details/112645073)他说`然后在使用pywin32之前的程序开头加上pythoncom.CoInitialize()这个就行了`，于是我把`pythoncom.CoUninitialize()`删除了，再次运行，发现，我操！！！！可以了，原来这个com的解决办法就是在开头加就行了，而不是最后还要加一个结束的代码，丢你老母哦。

## 最后的代码

```python
# !/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2024/9/12 09:14
# @Author  : huoyu
# @email   : 2319899766@qq.com
# @File    : ve_thread.py
# @Project : verifi_number
# @Software: PyCharm
from time import sleep

######用于提取题目，最重要
import win32com.client  # pywin32
import re
import configparser
import threading
import multiprocessing
import os
import pythoncom
from ColorInfo import ColorLogger
import fitz #pymupdf
from PIL import Image #pillow

# Read runtime settings from config.ini (section [Settings]).
config = configparser.ConfigParser()
config.read('config.ini')
# NOTE(review): `logger` is not used anywhere in the visible code.
logger = ColorLogger()
# `threads_count` is only referenced by commented-out code in the visible
# source; `processes_count` is the number of worker processes started in main.
threads_count = int(config['Settings']['threads'])
processes_count = int(config['Settings']['processes'])

number_pattern = r'^\d+[\.、．]' # Matches a question number: digits followed by ".", "、" or "．"
title_list = ["单项选择题", "多项选择题", "实验题", "计算题", "非选择题", "答题前", "回答选择题时", '考试结束',
              '选考题', '准考证', '答题卡']  # Prevents section titles from being saved as questions

# number_pattern = re.compile(r'^\d+$') # 匹配题号
# title_list = ["一、单项选择题", "二、多项选择题", "三、实验题：" , "四、计算题："] #防止将题目标题也保存为题目

def pdf_to_long_image(pdf_path, output_path=None, zoom=2.0):
    """Render every page of a PDF and stack the pages into one tall image.

    Args:
        pdf_path: Path of the PDF file to render.
        output_path: Where to save the image. Defaults to ``pdf_path`` with
            its extension replaced by ``.png``.
        zoom: Scale factor applied to both axes when rendering each page.

    Returns:
        None. Nothing is saved when the PDF yields no pages.
    """
    mat = fitz.Matrix(zoom, zoom)  # same scaling for every page
    doc = fitz.open(pdf_path)
    try:
        images = []
        for page in doc:
            pix = page.get_pixmap(matrix=mat)
            images.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
    finally:
        # The original never closed the document.
        doc.close()

    if not images:
        print("❌ PDF 没有可用的页面")
        return

    # The canvas is as wide as the widest page and as tall as all pages together.
    total_width = max(img.width for img in images)
    total_height = sum(img.height for img in images)
    long_image = Image.new("RGB", (total_width, total_height), (255, 255, 255))

    # Paste the pages top to bottom.
    y_offset = 0
    for img in images:
        long_image.paste(img, (0, y_offset))
        y_offset += img.height

    if output_path is None:
        # splitext only touches the final extension; str.replace would also
        # rewrite ".pdf" inside directory names and leave ".PDF" untouched,
        # which would overwrite the input file.
        output_path = os.path.splitext(pdf_path)[0] + ".png"
    long_image.save(output_path)
    print(f"✅ 长图已保存: {output_path}")

def _find_question_ranges(doc):
    """Return the (start, end) character offsets of every question in ``doc``."""
    starts = []
    for para in doc.Paragraphs:
        text = para.Range.Text.strip()
        list_format = para.Range.ListFormat
        if list_format.ListType > 0:
            # Auto-numbered paragraphs keep the number outside the text, so
            # prepend it before matching.
            text = list_format.ListString.strip() + text
        # '\r\x07' is presumably Word's table-cell end marker -- TODO confirm.
        if re.search(number_pattern, text) and '\r\x07' not in text:
            starts.append(para.Range.Start)
    # The end of the document closes the last question.
    starts.append(doc.Content.End)

    ranges = []
    for begin, end in zip(starts, starts[1:]):
        text = doc.Range(begin, end).Text
        # Skip spans that are section headings / exam instructions.
        if not any(item in text for item in title_list):
            ranges.append((begin, end))
    return ranges


def extract_questions(input_file,multiprocessing_name):
    """Split one Word document into per-question .docx, .pdf and .png files.

    A dedicated Word instance is started with ``DispatchEx`` for this call and
    quit again before returning. Output goes to the input path with
    ``shijuan`` replaced by ``shijuan_over`` and the file extension removed.

    Args:
        input_file: Path of the Word document to split.
        multiprocessing_name: Name of the calling process, used in log output.

    Returns:
        ``''`` when Word cannot be started or the document cannot be opened,
        otherwise ``None``. Processing errors are printed, not raised.
    """
    print(f"##########进来了 {multiprocessing_name}################")
    word = None
    doc = None
    try:
        try:
            # COM must be initialised before the first COM call is made.
            pythoncom.CoInitialize()
            word = win32com.client.DispatchEx("Word.Application")
            word.Visible = False  # keep the Word window hidden
            word.DisplayAlerts = 0  # suppress dialogs and warnings
            doc = word.Documents.Open(os.path.abspath(input_file))
        except Exception as e:
            print(input_file + '打开失败',e)
            # The outer `finally` still quits Word; the original leaked the
            # process on this path.
            return ''
        print(f"##########进来了1 {multiprocessing_name}################")
        try:
            # splitext removes only the real extension; the original chained
            # .replace(".doc", "").replace(".docx", ""), turning "a.docx"
            # into "ax".
            output_folder = os.path.splitext(input_file.replace("shijuan", "shijuan_over"))[0]
            print(f"##########进来了2 {multiprocessing_name}################")
            os.makedirs(output_folder, exist_ok=True)
            output_folder = os.path.abspath(output_folder)

            questions = _find_question_ranges(doc)
            print(f"##########进来了3{multiprocessing_name}################")
            print(f"##########进来了4 {multiprocessing_name}################")

            for i, (start, end) in enumerate(questions, start=1):
                question_doc = word.Documents.Add()
                try:
                    # Assign formatted text directly instead of going through
                    # the clipboard.
                    question_doc.Range().FormattedText = doc.Range(start, end).FormattedText
                    question_filename = os.path.join(output_folder, f"question_{i}.docx")
                    # 12 is presumably wdFormatXMLDocument (.docx) -- TODO confirm.
                    question_doc.SaveAs(question_filename, 12, False, "", True, "", False, False, False, False)
                    pdf_filename = os.path.join(output_folder, f"question_{i}.pdf")
                    question_doc.SaveAs(pdf_filename, FileFormat=17)  # 17 = PDF
                finally:
                    # Close without saving so a failed question does not leave
                    # a document open.
                    question_doc.Close(False)
                print(f"✅ 题目 {i} {question_filename} 转换图片")
                pdf_to_long_image(pdf_filename)
                print(f"✅ 题目 {i} 处理完成: {question_filename}")
        except Exception as e:
            print(f"❌ 处理失败: {e}")
            sleep(5)
    finally:
        if doc is not None:
            try:
                doc.Close(False)
            except Exception as e:
                print(e)
        if word is not None:
            try:
                print("关闭word进程")
                word.Quit()  # make sure the Word process exits
                print("✅ 处理完成，Word 已关闭")
            except Exception as e:
                print(e)
        # pythoncom.CoUninitialize() is deliberately not called: the original
        # left it commented out, and the accompanying article reports that
        # removing it is what made the multi-process run stable.
    sleep(5)

# 加载任务所需数据
def load_data(data_queue):
    """Queue the path of every exam file found under ``.\\shijuan``.

    For each sub-folder of the source root, the matching folder under
    ``.\\shijuan_over`` is created and the path of every entry in the
    sub-folder is put on ``data_queue``. A final ``None`` marks the end of the
    data; it is queued even when listing the source root fails, so consumers
    never wait forever.

    Args:
        data_queue: Queue-like object with a ``put`` method.
    """
    source1 = r".\shijuan"
    target1 = r'.\shijuan_over'
    print("正在加载试卷")
    try:
        for item in os.listdir(source1):
            source_folder = os.path.join(source1, item)
            target_folder = os.path.join(target1, item)
            try:
                names = [name for name in os.listdir(source_folder)
                         if name != '广东地方123123']
                if names:
                    os.makedirs(target_folder, exist_ok=True)
                for name in names:
                    data_queue.put(os.path.join(source_folder, name))
            except Exception as e:
                # One unreadable folder must not stop the others.
                print(source_folder, e.__traceback__.tb_lineno, e)
    except OSError as e:
        # Source root missing or unreadable: report it and still finish.
        print(source1, e)
    finally:
        # End-of-data marker.
        data_queue.put(None)
    print("试卷加载完成")

def task(data_queue,multiprocessing_name):
    """Take file paths from ``data_queue`` and process them until ``None``.

    The ``None`` end marker is put back on the queue before returning so that
    the other consumers sharing the queue also stop.

    Args:
        data_queue: Queue yielding file paths, terminated by ``None``.
        multiprocessing_name: Name of the current process, used in log output.
    """
    while True:
        data = data_queue.get()
        if data is None:
            # Hand the marker on to the next consumer.
            data_queue.put(None)
            break
        sleep(4)
        print(multiprocessing_name+f"处理数据: {data}")
        try:
            extract_questions(data,multiprocessing_name)
        except Exception as e:
            # Report which file failed and why, then move on to the next one.
            print(multiprocessing_name+f"处理失败: {data} {e!r}")
        else:
            print(multiprocessing_name+f"数据处理完成: {data}")

# def worker_process(data_queue):
#     print(f"进程 {multiprocessing.current_process().name} 启动")
#     # threads = []
#     # for _ in range(threads_count):
#     #     thread = threading.Thread(target=task, args=(data_queue))
#     #     threads.append(thread)
#     #     thread.start()
#
#     # for thread in threads:
#     #     thread.join()
#
#     task(data_queue)
#
#
#     print(f"进程 {multiprocessing.current_process().name} 完成任务")

def worker_process(data_queue):
    """Process entry point: run ``task`` on ``data_queue``, logging start and end.

    The completion message is printed even when ``task`` raises.
    """
    proc_name = multiprocessing.current_process().name
    print(f"进程 {proc_name} 启动")
    try:
        task(data_queue, proc_name)
    finally:
        print(f"进程 {proc_name} 完成任务")

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    data_queue = manager.Queue()  # queue shared by all worker processes

    # Fill the queue before any worker starts. This runs in the main thread:
    # the original ran it in a thread that was joined immediately, which hid a
    # loader failure and left the workers waiting for an end marker.
    load_data(data_queue)

    processes = []
    for _ in range(processes_count):
        process = multiprocessing.Process(target=worker_process, args=(data_queue, ))
        processes.append(process)
        process.start()

    # Wait for every worker to finish.
    for process in processes:
        process.join()

    print("程序结束")

```

文章目录结构

![image.png](http://type.zimopy.com/usr/uploads/2025/03/454462442.png)

## config.ini

```ini
[Settings]
threads = 1
processes = 3
```

随便上传几个处理文件以供参考

[2008年高考化学试卷（上海）（解析卷）.doc](undefined)
[2008年高考化学试卷（全国卷Ⅰ）（解析卷）.doc](undefined)
[2008年高考化学试卷（全国卷Ⅱ）（解析卷）.doc](undefined)
[2008年高考化学试卷（北京）（解析卷）.doc](undefined)
[2008年高考化学试卷（四川）（解析卷）.doc](undefined)
[2008年高考化学试卷（天津）（解析卷）.doc](undefined)
[2008年高考化学试卷（山东）（解析卷）.doc](undefined)
[2008年高考化学试卷（广东）（解析卷）.doc](undefined)

## 总结

win32com.client的多进程使用真的太坑了，而且网上的解决方案太少了且作用不对，很多人都差那么一点儿就成功了。

最后修改：2025 年 03 月 12 日

如果觉得我的文章对你有用，请随意赞赏

发表评论取消回复
使用cookie技术保留您的个人信息以便您下次快速评论，继续评论表示您已同意该条款

评论 *

私密评论

名称 *

🎲

邮箱

地址

huoyu1
真的很需要
欣
很牛逼|´・ω・)ノ
hhh
真棒
Hongjia
https://blog.dingtone.me/zh/rec...
Hongjia
https://5sim.net/zh使用该网站可以最低1 卢布充值

使用python+win32com.client操作word

huoyu • 2025 年 03 月 12 日