According to the architecture of the DOCX document:
- Text: docx>Paragraphs>runs
- Text table: docx>tables>rows>cells>Paragraphs>runs
- Header: docx>sections>header>Paragraphs>runs
- Header tables: docx>sections>header>tables>rows>cells>Paragraphs>runs
The footer has the same structure as the header. We could traverse the paragraphs directly and find and replace the keywords in the paragraph text, but that resets the text formatting, so the replacement has to be done on the text of the individual runs instead. However, a keyword may span more than one run, in which case a replacement within a single run cannot succeed.
Therefore, I propose the following approach: first, taking the paragraph as the unit, record the position of every character of the paragraph in a list; each entry records which run the character belongs to and its index within that run. Then find the keywords in the paragraph text and, using this correspondence, delete and replace them character by character in the runs.
'''
-*- coding: utf-8 -*-
@Time : 2021/4/19 13:13
@Author : ZCG
@Site :
@File : Batch DOCX document keyword replacement.py
@Software: PyCharm
'''
from docx import Document
import os
def get_docx_list(dir_path):
    """Collect the .docx files found under *dir_path*, subdirectories included.

    Files whose name starts with "~" are skipped (treated as temporary files).
    The number of files found is printed before returning.

    :param dir_path: directory to walk
    :return: list of paths (directory joined with file name) of the .docx files
    """
    file_list = []
    for root, _dirs, files in os.walk(dir_path):
        for file in files:
            # Require the real ".docx" extension (any letter case) instead of
            # any name that merely ends in the letters "docx".
            if file.lower().endswith(".docx") and not file.startswith("~"):
                # os.path.join uses the platform separator; a hard-coded "\\"
                # only yields usable paths on Windows.
                file_list.append(os.path.join(root, file))
    print("The directory found a total of {0} related files!".format(len(file_list)))
    return file_list
class ParagraphsKeyWordsReplace:
    """Keyword replacement inside one paragraph, performed on its runs.

    The methods are invoked unbound with a paragraph object passed as ``self``
    (``ParagraphsKeyWordsReplace.p_replace(paragraph, ...)``); they only need
    ``self.runs``, a sequence of objects with a readable/writable ``text``.
    """

    def __init__(self):
        # Placeholders naming the attributes a paragraph passed as ``self`` exposes.
        self.text = None
        self.runs = None

    def p_replace(self, x, key, value):
        """Replace every non-overlapping occurrence of *key* in the paragraph.

        The paragraph text is not assigned directly because that would reset
        the formatting; only the text of the affected runs is edited. A key
        may span several runs, so each character of the paragraph is mapped to
        its (run index, char index) and the edit is applied per character.

        :param x: paragraph number (zero-based; only used in the progress message)
        :param key: keyword to replace; an empty key is ignored
        :param value: text that replaces the keyword
        :return: None
        """
        if not key:
            # An empty key matches everywhere and selects no characters.
            return
        # One {"run": ..., "char": ...} entry per character, in paragraph order.
        paragraph_positions = [
            {"run": run_index, "char": char_index}
            for run_index, run in enumerate(self.runs)
            for char_index in range(len(run.text))
        ]
        # Search the concatenated run text so match offsets always index
        # paragraph_positions correctly.
        # NOTE(review): self.text may contain text that is not in self.runs
        # (e.g. hyperlink text in newer python-docx) - confirm.
        text = "".join(run.text for run in self.runs)
        # Start offsets of the matches, scanned left to right without overlap
        # (same semantics as str.replace). Overlapping matches would make the
        # later edits operate on positions already consumed by earlier ones.
        key_indexs = []
        start_i = text.find(key)
        while start_i >= 0:
            key_indexs.append(start_i)
            start_i = text.find(key, start_i + len(key))
        # Work from the last match to the first so the recorded positions of
        # the matches still to be processed stay valid.
        for i, start_i in enumerate(reversed(key_indexs), start=1):
            end_i = start_i + len(key)  # where the keyword ends in this paragraph
            key_maps = paragraph_positions[start_i:end_i]
            ParagraphsKeyWordsReplace.c_replace(self, key_maps, value)
            print(f"\tSuccessfully replaced segment {x+1}, object {i}:{key}===>{value}")

    def c_replace(self, key_maps, value):
        """Delete the mapped characters and put *value* where the first one was.

        The characters are handled from back to front so that removing one
        does not shift the index of any character still to be handled.

        :param key_maps: list of {"run": run index, "char": char index} dicts,
            one per character of the keyword, in paragraph order
        :param value: replacement text
        :return: None
        """
        last = len(key_maps)
        for i, position in enumerate(reversed(key_maps), start=1):
            run = self.runs[position["run"]]
            z = position["char"]
            text = run.text
            # Every character is removed by index; only the first character of
            # the keyword (handled last) is swapped for the replacement text.
            # str.replace is deliberately not used: it would also rewrite every
            # other occurrence of that character in the same run.
            replacement = value if i == last else ""
            run.text = text[:z] + replacement + text[z + 1:]
class DocxKeyWordsReplace:
    """Keyword replacement over the parts of a document.

    The methods are invoked unbound with a document object passed as ``self``
    (``DocxKeyWordsReplace.content(docx, replace_dict=...)``); they read
    ``self.paragraphs``, ``self.tables`` and ``self.sections``.

    Every method applies the pairs of ``replace_dict`` in dictionary order:
    for each key, all paragraphs of the targeted part are processed before the
    next key is started.
    """

    def __init__(self):
        # Placeholders naming the attributes a document passed as ``self`` exposes.
        # NOTE(review): on an actual instance of this class the ``tables``
        # attribute set here hides the ``tables`` method below; the class is
        # only usable through unbound calls.
        self.paragraphs = None
        self.tables = None
        self.sections = None

    @staticmethod
    def _cell_paragraphs(tables):
        """Yield the paragraph list of every cell of every row of *tables*."""
        # NOTE(review): python-docx may return the same cell more than once
        # for merged cells - confirm whether repeated processing matters.
        for table in tables:
            for row in table.rows:
                for cell in row.cells:
                    yield cell.paragraphs

    @staticmethod
    def _replace_in_groups(make_groups, replace_dict):
        """Apply every key/value pair to each paragraph of each group.

        :param make_groups: zero-argument callable returning an iterable of
            paragraph lists; it is called once per key, so the document is
            traversed afresh for every key
        :param replace_dict: mapping of keyword -> replacement text
        """
        for key, value in replace_dict.items():
            for paragraphs in make_groups():
                # x restarts at 0 for every group (cell, header, footer, body).
                for x, paragraph in enumerate(paragraphs):
                    ParagraphsKeyWordsReplace.p_replace(paragraph, x, key, value)

    def content(self, replace_dict):
        """Replace the keywords in the body paragraphs."""
        print("(1)Processing keywords in body text...")
        DocxKeyWordsReplace._replace_in_groups(
            lambda: (self.paragraphs,), replace_dict)
        print("\tText keyword replacement completed!")

    def tables(self, replace_dict):
        """Replace the keywords in the paragraphs of the body tables."""
        print("(2)Processing keywords in table...")
        DocxKeyWordsReplace._replace_in_groups(
            lambda: DocxKeyWordsReplace._cell_paragraphs(self.tables),
            replace_dict)
        print("\tTable keyword replacement completed!")

    def header_content(self, replace_dict):
        """Replace the keywords in the header paragraphs of every section."""
        print("(3)Processing keywords in header...")
        DocxKeyWordsReplace._replace_in_groups(
            lambda: (section.header.paragraphs for section in self.sections),
            replace_dict)
        print("\tContent header keyword replacement completed!")

    def header_tables(self, replace_dict):
        """Replace the keywords in the header tables of every section."""
        print("(4)Processing keywords in header table...")
        DocxKeyWordsReplace._replace_in_groups(
            lambda: DocxKeyWordsReplace._cell_paragraphs(
                table
                for section in self.sections
                for table in section.header.tables),
            replace_dict)
        print("\tHeader table keyword replacement completed!")

    def footer_content(self, replace_dict):
        """Replace the keywords in the footer paragraphs of every section."""
        print("(6)Processing keywords in footer...")
        DocxKeyWordsReplace._replace_in_groups(
            lambda: (section.footer.paragraphs for section in self.sections),
            replace_dict)
        print("\tFooter keyword replacement completed!")

    def footer_tables(self, replace_dict):
        """Replace the keywords in the footer tables of every section."""
        print("(7)Processing keywords in footer table...")
        DocxKeyWordsReplace._replace_in_groups(
            lambda: DocxKeyWordsReplace._cell_paragraphs(
                table
                for section in self.sections
                for table in section.footer.tables),
            replace_dict)
        print("\tFooter table keyword replacement completed!")
def main(replace_dict=None, file_dir=None):
    """Replace keywords in every .docx file found under a directory.

    Each file is opened, processed part by part (body text, body tables,
    header, header tables, footer, footer tables) and saved back to the same
    path, so the original file is overwritten.

    :param replace_dict: mapping whose keys are the texts to replace and whose
        values are the new texts; pairs are applied in dictionary order.
        When omitted, the built-in example mapping below is used.
    :param file_dir: directory holding the docx files (subdirectories are
        searched too). When omitted, the built-in example directory is used.
    :return: None
    """
    # input section: defaults used when the caller supplies nothing
    if replace_dict is None:
        replace_dict = {
            "MG life technology (shenzhen) co., LTD": "Shenzhen YW medical technology co., LTD",
            "MG-": "YW-",
            "2017-": "2020-",
            "Z18": "Z20",
        }
    if file_dir is None:
        file_dir = r"E:\docxfiles"
    # call processing part
    for i, file in enumerate(get_docx_list(file_dir), start=1):
        print(f"{i}、file being processed:{file}")
        docx = Document(file)
        # The methods are called unbound with the document standing in for ``self``.
        DocxKeyWordsReplace.content(docx, replace_dict=replace_dict)
        DocxKeyWordsReplace.tables(docx, replace_dict=replace_dict)
        DocxKeyWordsReplace.header_content(docx, replace_dict=replace_dict)
        DocxKeyWordsReplace.header_tables(docx, replace_dict=replace_dict)
        DocxKeyWordsReplace.footer_content(docx, replace_dict=replace_dict)
        DocxKeyWordsReplace.footer_tables(docx, replace_dict=replace_dict)
        docx.save(file)  # overwrite the source document in place
        print(f'"{file}"Document processing complete!\n')
# Script entry point: run the batch replacement only when this file is
# executed directly, then report completion.
if __name__ == "__main__":
    main()
    print("All complete processing!")