#!/usr/bin/env python3
"""
md_to_docx.py — Convert a directory of Markdown files into a single Word document.

Structure: title page → TOC field → changelog → one section per .md file.

Usage:
    python md_to_docx.py [--dir DIR] [--output OUTPUT] [--title TITLE]
                         [--logo LOGO]
"""

import argparse
import re
import sys
from datetime import datetime
from pathlib import Path

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Inches, Pt


# ---------------------------------------------------------------------------
# Word XML helpers
# ---------------------------------------------------------------------------

def _insert_toc_field(doc):
    """Append a paragraph holding an (initially empty) Word TOC field.

    The field is written as begin / instruction / separate / end markers with
    no cached result, so the table stays blank until the fields are updated
    in Word (F9). The instruction limits the table to heading levels 1-3
    (``\\o "1-3"``) and asks for hyperlinked entries (``\\h``).

    Args:
        doc: The python-docx document to append to.
    """
    para = doc.add_paragraph()
    run = para.add_run()

    begin = OxmlElement('w:fldChar')
    begin.set(qn('w:fldCharType'), 'begin')

    instr = OxmlElement('w:instrText')
    # Keep the leading/trailing spaces of the instruction text intact.
    instr.set(qn('xml:space'), 'preserve')
    instr.text = ' TOC \\o "1-3" \\h \\z \\u '

    sep = OxmlElement('w:fldChar')
    sep.set(qn('w:fldCharType'), 'separate')

    end = OxmlElement('w:fldChar')
    end.set(qn('w:fldCharType'), 'end')

    # python-docx has no public API for fields, so the raw XML elements are
    # appended to the run's underlying <w:r> element.
    run._r.extend([begin, instr, sep, end])


# ---------------------------------------------------------------------------
# Inline markdown → Word runs
# ---------------------------------------------------------------------------

_INLINE_RE = re.compile(
    r'(\*\*\*[^*\n]+\*\*\*'   # bold-italic ***…***
    r'|\*\*[^*\n]+\*\*'       # bold **…**
    r'|\*[^*\n]+\*'           # italic *…*
    r'|`[^`\n]+`)'            # inline code `…`
)


def _add_inline(paragraph, text):
    """Append *text* to *paragraph*, turning inline Markdown into styled runs.

    Recognised markup: ``***bold italic***``, ``**bold**``, ``*italic*`` and
    `` `code` ``. Everything else (links, images, ...) is added verbatim.

    Args:
        paragraph: The python-docx paragraph that receives the runs.
        text: One line of Markdown without block-level markers.
    """
    # re.split() with a single capturing group alternates
    # plain, token, plain, token, ...: odd indices are always regex matches.
    # Deciding by position rather than by startswith/endswith keeps unmatched
    # text such as "**" or "*" from being mistaken for an empty styled span
    # and silently dropped.
    for index, part in enumerate(_INLINE_RE.split(text)):
        if not part:
            continue
        if index % 2 == 0:
            paragraph.add_run(part)
        elif part.startswith('***'):
            run = paragraph.add_run(part[3:-3])
            run.bold = run.italic = True
        elif part.startswith('**'):
            run = paragraph.add_run(part[2:-2])
            run.bold = True
        elif part.startswith('*'):
            run = paragraph.add_run(part[1:-1])
            run.italic = True
        else:
            # Only the backtick alternative is left.
            run = paragraph.add_run(part[1:-1])
            run.font.name = 'Courier New'
            run.font.size = Pt(10)


# ---------------------------------------------------------------------------
# Markdown block parser → Word paragraphs
# ---------------------------------------------------------------------------

_FENCE_MARKER = '```'
_HEADING_RE = re.compile(r'^(#{1,6})\s+(.*)')
_RULE_RE = re.compile(r'^[*\-_]{3,}\s*$')
_QUOTE_RE = re.compile(r'^> ?(.*)')
_BULLET_RE = re.compile(r'^\s*[-*+]\s+(.*)')
_NUMBERED_RE = re.compile(r'^\s*\d+\.\s+(.*)')


def _add_code_block(doc, code_lines):
    """Append the lines of a fenced code block as one monospaced paragraph."""
    para = doc.add_paragraph('\n'.join(code_lines))
    # An empty fence produces a paragraph without runs.
    if para.runs:
        para.runs[0].font.name = 'Courier New'
        para.runs[0].font.size = Pt(9)
    para.paragraph_format.space_after = Pt(6)


def _md_to_doc(doc, content):
    """Render Markdown *content* into *doc*, one paragraph per source line.

    Supported blocks: fenced code, ATX headings (``#`` … ``######``),
    horizontal rules, blockquotes, unordered and ordered list items, and plain
    paragraphs. Parsing is line based: list indentation is ignored (nested
    lists are flattened) and consecutive text lines are not merged.

    Args:
        doc: The python-docx document to append to.
        content: Full Markdown text of one file.
    """
    in_fence = False
    fence_buf = []

    for line in content.splitlines():
        # Fenced code block: the marker line toggles the state; the text after
        # the marker (language hint) is ignored.
        if line.startswith(_FENCE_MARKER):
            if in_fence:
                _add_code_block(doc, fence_buf)
            fence_buf = []
            in_fence = not in_fence
            continue

        if in_fence:
            fence_buf.append(line)
            continue

        # ATX headings
        m = _HEADING_RE.match(line)
        if m:
            doc.add_heading(m.group(2).strip(), level=len(m.group(1)))
            continue

        # Blank line
        if not line.strip():
            continue

        # Horizontal rule
        if _RULE_RE.match(line):
            rule = doc.add_paragraph('─' * 60)
            rule.alignment = WD_ALIGN_PARAGRAPH.CENTER
            continue

        # Blockquote (the space after '>' is optional in Markdown)
        m = _QUOTE_RE.match(line)
        if m:
            _add_inline(doc.add_paragraph(style='Quote'), m.group(1))
            continue

        # Unordered list
        m = _BULLET_RE.match(line)
        if m:
            _add_inline(doc.add_paragraph(style='List Bullet'), m.group(1))
            continue

        # Ordered list
        m = _NUMBERED_RE.match(line)
        if m:
            _add_inline(doc.add_paragraph(style='List Number'), m.group(1))
            continue

        # Default paragraph
        _add_inline(doc.add_paragraph(), line)

    # A fence that is never closed would otherwise lose everything after it.
    if in_fence:
        _add_code_block(doc, fence_buf)


# ---------------------------------------------------------------------------
# Document assembly
# ---------------------------------------------------------------------------

# Changelog candidates, in order of preference.
_CHANGELOG_NAMES = ('CHANGELOG.md', 'changelog.md', 'CHANGES.md')


def _read_markdown(path):
    """Return the text of *path*, decoded as UTF-8 with an optional BOM."""
    # 'utf-8-sig' strips a leading byte-order mark; with plain 'utf-8' the BOM
    # would stay in front of the first line and stop e.g. '# Title' from being
    # recognised as a heading.
    return path.read_text(encoding='utf-8-sig')


def _find_changelog(md_dir):
    """Return the first existing changelog candidate in *md_dir*, or None."""
    return next(
        (md_dir / name for name in _CHANGELOG_NAMES if (md_dir / name).exists()),
        None,
    )


def build_docx(md_dir: Path, output: Path, title: str,
               logo: Path | None = None) -> None:
    """Assemble the Word document and write it to *output*.

    Layout: title page (optional logo, title, generation date), table of
    contents, changelog, then one level-1 section per remaining ``*.md`` file
    in *md_dir*, ordered case-insensitively by file name. Sections are
    separated by page breaks.

    Args:
        md_dir: Directory that is searched (non-recursively) for ``*.md``.
        output: Path of the ``.docx`` file to write.
        title: Title shown on the title page.
        logo: Optional image placed above the title, 2 inches wide.
    """
    doc = Document()
    centered = WD_ALIGN_PARAGRAPH.CENTER

    # Title page
    if logo is not None:
        logo_para = doc.add_paragraph()
        logo_para.alignment = centered
        logo_para.add_run().add_picture(str(logo), width=Inches(2))

    heading = doc.add_heading(title, level=0)
    heading.alignment = centered

    date_para = doc.add_paragraph(f'Generated: {datetime.now().strftime("%Y-%m-%d")}')
    date_para.alignment = centered

    doc.add_page_break()

    # Table of Contents
    doc.add_heading('Table of Contents', level=1)
    _insert_toc_field(doc)
    doc.add_paragraph('(Open in Word and press F9 or Ctrl+A → F9 to update this field.)')
    doc.add_page_break()

    # Changelog — always first content section
    doc.add_heading('Changelog', level=1)
    changelog = _find_changelog(md_dir)
    if changelog is not None:
        _md_to_doc(doc, _read_markdown(changelog))
    else:
        doc.add_paragraph('No changelog file found.')

    def is_section_file(path):
        # Skip directories that happen to end in '.md' and broken symlinks.
        if not path.is_file():
            return False
        # Exclude only the file already rendered as the changelog, so that a
        # CHANGES.md next to a CHANGELOG.md is still included. samefile() also
        # copes with case-insensitive filesystems, where the candidate name
        # and the name returned by glob() may differ in case.
        return changelog is None or not path.samefile(changelog)

    # Remaining .md files sorted alphabetically; the exact name is the
    # tie-breaker so names differing only by case sort deterministically.
    md_files = sorted(
        (f for f in md_dir.glob('*.md') if is_section_file(f)),
        key=lambda f: (f.name.lower(), f.name),
    )

    for md_file in md_files:
        # Break *before* each section so the document does not end with an
        # empty page.
        doc.add_page_break()
        section_title = md_file.stem.replace('-', ' ').replace('_', ' ').title()
        doc.add_heading(section_title, level=1)
        _md_to_doc(doc, _read_markdown(md_file))

    doc.save(str(output))
    print(f'Saved → {output}')


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main() -> None:
    """Parse the command line, validate the paths and build the document.

    Exits via ``sys.exit`` with an error message when the input directory or
    the logo file does not exist.
    """
    parser = argparse.ArgumentParser(
        description='Convert a folder of .md files into a single Word .docx'
    )
    parser.add_argument('--dir', '-d', default='.',
                        help='Directory containing .md files (default: .)')
    parser.add_argument('--output', '-o', default='documentation.docx',
                        help='Output .docx path')
    parser.add_argument('--title', '-t', default='Project Documentation',
                        help='Document title')
    parser.add_argument('--logo', '-l', default=None,
                        help='Path to logo image (PNG/JPG) for the title page')
    args = parser.parse_args()

    md_dir = Path(args.dir).resolve()
    if not md_dir.is_dir():
        sys.exit(f'Error: {md_dir} is not a directory')

    logo_path = None
    if args.logo:
        logo_path = Path(args.logo).resolve()
        if not logo_path.is_file():
            sys.exit(f'Error: logo file not found: {logo_path}')

    build_docx(md_dir, Path(args.output), args.title, logo=logo_path)


if __name__ == '__main__':
    main()