242 lines
7.6 KiB
Python
242 lines
7.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
md_to_docx.py — Convert a directory of Markdown files into a single Word document.
|
|
|
|
Structure: title page → auto-updating TOC → changelog → one section per .md file.
|
|
|
|
Usage:
|
|
    python md_to_docx.py [--dir DIR] [--output OUTPUT] [--title TITLE] [--logo LOGO]
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
from docx import Document
|
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
|
from docx.oxml import OxmlElement
|
|
from docx.oxml.ns import qn
|
|
from docx.shared import Inches, Pt
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Word XML helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _insert_toc_field(doc):
    """Append a new paragraph to *doc* containing a Word TOC complex field.

    The field is written into a single run as four children, in order:
    a ``begin`` marker, the instruction text, a ``separate`` marker and an
    ``end`` marker.  No result text is placed between ``separate`` and
    ``end``, so the entries only appear once Word updates the field.

    Instruction switches (per Word's TOC field code reference):
    ``\\o "1-3"`` builds the table from heading levels 1-3, ``\\h`` makes the
    entries hyperlinks, ``\\z`` hides tab leaders and page numbers in Web
    layout, and ``\\u`` uses the applied paragraph outline level.
    """

    def _field_char(kind):
        # Build one <w:fldChar w:fldCharType="kind"/> marker element.
        marker = OxmlElement('w:fldChar')
        marker.set(qn('w:fldCharType'), kind)
        return marker

    instruction = OxmlElement('w:instrText')
    # Keep the leading/trailing spaces of the instruction text intact.
    instruction.set(qn('xml:space'), 'preserve')
    instruction.text = ' TOC \\o "1-3" \\h \\z \\u '

    toc_run = doc.add_paragraph().add_run()
    toc_run._r.extend([
        _field_char('begin'),
        instruction,
        _field_char('separate'),
        _field_char('end'),
    ])
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Inline markdown → Word runs
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# One capturing group around the whole alternation: ``re.split`` therefore
# returns plain text at even indices and the matched markup token at odd ones.
_INLINE_RE = re.compile(
    r'(\*\*\*[^*\n]+\*\*\*'   # bold-italic ***…***
    r'|\*\*[^*\n]+\*\*'       # bold **…**
    r'|\*[^*\n]+\*'           # italic *…*
    r'|`[^`\n]+`)'            # inline code `…`
)


def _add_inline(paragraph, text):
    """Append *text* to *paragraph* as runs, applying inline Markdown styling.

    Recognised markup: ``***bold italic***``, ``**bold**``, ``*italic*`` and
    `` `inline code` `` (rendered in 10 pt Courier New).  Everything else is
    added verbatim as an unstyled run.

    Args:
        paragraph: object exposing ``add_run(text)`` whose return value has
            ``bold``, ``italic`` and ``font`` attributes (a python-docx
            paragraph in this file).
        text: a single line of Markdown source.
    """
    for index, part in enumerate(_INLINE_RE.split(text)):
        if not part:
            continue

        # Even positions are the text *between* matches.  Decide by position
        # rather than by inspecting the string: a leftover fragment such as
        # '*' or '**' also starts and ends with a marker, and treating it as
        # markup would strip it down to an empty run and lose the characters.
        if index % 2 == 0:
            paragraph.add_run(part)
            continue

        # Odd positions are guaranteed matches; check the longest marker first.
        if part.startswith('***'):
            run = paragraph.add_run(part[3:-3])
            run.bold = run.italic = True
        elif part.startswith('**'):
            run = paragraph.add_run(part[2:-2])
            run.bold = True
        elif part.startswith('*'):
            run = paragraph.add_run(part[1:-1])
            run.italic = True
        else:
            # Only the backtick alternative remains.
            run = paragraph.add_run(part[1:-1])
            run.font.name = 'Courier New'
            run.font.size = Pt(10)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Markdown block parser → Word paragraphs
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Block-level patterns, compiled once instead of on every input line.
_HEADING_RE = re.compile(r'^(#{1,6})\s+(.*)')
_RULE_RE = re.compile(r'^[*\-_]{3,}\s*$')
_BULLET_RE = re.compile(r'^\s*[-*+]\s+(.*)')
_NUMBERED_RE = re.compile(r'^\s*\d+\.\s+(.*)')


def _add_code_block(doc, code_lines):
    """Append the lines of one fenced code block to *doc* as a single paragraph."""
    p = doc.add_paragraph('\n'.join(code_lines))
    # Style the text only when the paragraph actually received a run.
    if p.runs:
        p.runs[0].font.name = 'Courier New'
        p.runs[0].font.size = Pt(9)
    p.paragraph_format.space_after = Pt(6)


def _md_to_doc(doc, content):
    """Render Markdown source *content* into *doc*, one line at a time.

    Supported constructs, checked in this order for every line:

    * fenced code blocks delimited by lines starting with three backticks
    * ATX headings (``#`` … ``######``) mapped to heading levels 1-6
    * blank lines (skipped)
    * horizontal rules (three or more of ``*``, ``-`` or ``_``)
    * block quotes (lines starting with ``"> "``)
    * unordered list items (``-``, ``*`` or ``+``)
    * ordered list items (``1.``)
    * anything else becomes an ordinary paragraph

    Quote, list and paragraph text goes through ``_add_inline`` so inline
    emphasis and code spans are styled; heading text is inserted verbatim.

    Args:
        doc: the document being built; paragraphs and headings are appended.
        content: the complete Markdown text of one file.
    """
    in_fence = False
    fence_buf = []

    for line in content.splitlines():
        # Fenced code block: a fence line toggles the state; closing emits it.
        if line.startswith('```'):
            if in_fence:
                _add_code_block(doc, fence_buf)
            fence_buf = []
            in_fence = not in_fence
            continue

        if in_fence:
            fence_buf.append(line)
            continue

        # ATX headings (the pattern itself limits the level to 1-6).
        m = _HEADING_RE.match(line)
        if m:
            doc.add_heading(m.group(2).strip(), level=len(m.group(1)))
            continue

        # Blank line
        if not line.strip():
            continue

        # Horizontal rule — must be tested before lists so '---' is not a bullet.
        if _RULE_RE.match(line):
            p = doc.add_paragraph('─' * 60)
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
            continue

        # Blockquote
        if line.startswith('> '):
            p = doc.add_paragraph(style='Quote')
            _add_inline(p, line[2:])
            continue

        # Unordered list
        m = _BULLET_RE.match(line)
        if m:
            p = doc.add_paragraph(style='List Bullet')
            _add_inline(p, m.group(1))
            continue

        # Ordered list
        m = _NUMBERED_RE.match(line)
        if m:
            p = doc.add_paragraph(style='List Number')
            _add_inline(p, m.group(1))
            continue

        # Default paragraph
        p = doc.add_paragraph()
        _add_inline(p, line)

    # A fence that is never closed would otherwise lose everything after it;
    # emit whatever was collected so no source text silently disappears.
    if in_fence and fence_buf:
        _add_code_block(doc, fence_buf)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Document assembly
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# File names (compared case-insensitively) treated as the changelog, in
# order of preference.
_CHANGELOG_NAMES = ('changelog.md', 'changes.md')


def _partition_md_files(md_dir):
    """Split the ``*.md`` files directly inside *md_dir* into (changelog, sections).

    ``changelog`` is the preferred changelog file or ``None``; ``sections`` is
    every other Markdown file sorted case-insensitively by name.  All
    changelog-like files are kept out of ``sections`` so none is rendered twice.
    """
    changelogs = []
    sections = []
    for path in md_dir.glob('*.md'):
        # A directory may also be called 'something.md'; it cannot be read.
        if not path.is_file():
            continue
        # Classify by the real on-disk name, lower-cased, so detection and
        # exclusion always agree regardless of filesystem case sensitivity.
        bucket = changelogs if path.name.lower() in _CHANGELOG_NAMES else sections
        bucket.append(path)

    changelogs.sort(key=lambda p: (_CHANGELOG_NAMES.index(p.name.lower()), p.name))
    sections.sort(key=lambda p: p.name.lower())
    return (changelogs[0] if changelogs else None), sections


def build_docx(md_dir: Path, output: Path, title: str, logo: Path | None = None):
    """Assemble every Markdown file in *md_dir* into one Word document.

    Layout: title page (optional logo, title, generation date), a table of
    contents field, a "Changelog" section, then one section per remaining
    ``.md`` file in case-insensitive alphabetical order.  Each section starts
    on a new page.

    Args:
        md_dir: directory whose top-level ``*.md`` files are converted.
        output: path the ``.docx`` file is saved to.
        title: text of the title-page heading.
        logo: optional image placed (2 inches wide) above the title.

    Side effects:
        Writes *output* and prints a confirmation line to stdout.
    """
    doc = Document()

    # Title page
    if logo is not None:
        logo_para = doc.add_paragraph()
        logo_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        logo_para.add_run().add_picture(str(logo), width=Inches(2))

    heading = doc.add_heading(title, level=0)
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    date_para = doc.add_paragraph(f'Generated: {datetime.now().strftime("%Y-%m-%d")}')
    date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_page_break()

    # Table of Contents
    doc.add_heading('Table of Contents', level=1)
    _insert_toc_field(doc)
    doc.add_paragraph('(Open in Word and press F9 or Ctrl+A → F9 to update this field.)')
    doc.add_page_break()

    changelog, md_files = _partition_md_files(md_dir)

    # Changelog — always first content section
    doc.add_heading('Changelog', level=1)
    if changelog is not None:
        # 'utf-8-sig' reads plain UTF-8 too, but drops a leading BOM that
        # would otherwise stop the first line from being parsed as a heading.
        _md_to_doc(doc, changelog.read_text(encoding='utf-8-sig'))
    else:
        doc.add_paragraph('No changelog file found.')

    # Remaining .md files sorted alphabetically.  The page break goes *before*
    # each section so the document does not end with an empty page.
    for md_file in md_files:
        doc.add_page_break()
        section_title = md_file.stem.replace('-', ' ').replace('_', ' ').title()
        doc.add_heading(section_title, level=1)
        _md_to_doc(doc, md_file.read_text(encoding='utf-8-sig'))

    doc.save(output)
    print(f'Saved → {output}')
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main():
    """Command-line entry point: parse options, validate paths, build the file.

    Exits with an error message when ``--dir`` is not a directory or when a
    ``--logo`` path is given that is not an existing file.
    """
    parser = argparse.ArgumentParser(
        description='Convert a folder of .md files into a single Word .docx'
    )
    # (flags, keyword settings) pairs, registered in the order shown by --help.
    option_table = (
        (('--dir', '-d'),
         {'default': '.', 'help': 'Directory containing .md files (default: .)'}),
        (('--output', '-o'),
         {'default': 'documentation.docx', 'help': 'Output .docx path'}),
        (('--title', '-t'),
         {'default': 'Project Documentation', 'help': 'Document title'}),
        (('--logo', '-l'),
         {'default': None, 'help': 'Path to logo image (PNG/JPG) for the title page'}),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    options = parser.parse_args()

    source_dir = Path(options.dir).resolve()
    if not source_dir.is_dir():
        sys.exit(f'Error: {source_dir} is not a directory')

    # An omitted (or empty) --logo means "no logo"; otherwise it must exist.
    logo_file = Path(options.logo).resolve() if options.logo else None
    if logo_file is not None and not logo_file.is_file():
        sys.exit(f'Error: logo file not found: {logo_file}')

    build_docx(source_dir, Path(options.output), options.title, logo=logo_file)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|