Initial commit: md_to_docx and raw_to_md scripts

- md_to_docx.py: converts a directory of .md files into a Word .docx with a title page, auto-updating TOC field, changelog section, and one section per remaining .md file
- raw_to_md.py: sends raw text to Claude AI (claude-sonnet-4-6) and returns a structured, chronological Markdown document

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-01 13:52:03 +02:00 · 2026-06-01 13:52:03 +02:00 · 0eac070dac
commit 0eac070dac
4 changed files with 352 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,9 @@
 __pycache__/
 *.py[cod]
 *.egg-info/
 dist/
 build/
 .env
 *.docx
 venv/
 .venv/
--- a/md_to_docx.py
+++ b/md_to_docx.py
@ -0,0 +1,229 @@
 #!/usr/bin/env python3
 """
 md_to_docx.py — Convert a directory of Markdown files into a single Word document.
 Structure: title page → auto-updating TOC → changelog → one section per .md file.
 Usage:
    python md_to_docx.py [--dir DIR] [--output OUTPUT] [--title TITLE]
 """
 import argparse
 import re
 import sys
 from datetime import datetime
 from pathlib import Path
 from docx import Document
 from docx.enum.text import WD_ALIGN_PARAGRAPH
 from docx.oxml import OxmlElement
 from docx.oxml.ns import qn
 from docx.shared import Pt
 # ---------------------------------------------------------------------------
 # Word XML helpers
 # ---------------------------------------------------------------------------
 def _insert_toc_field(doc):
    """Append a paragraph to *doc* that holds a Word TOC complex field.

    The field is written as begin / instruction / separate / end markers
    inside a single run. Nothing is placed between the separate and end
    markers, so the table has no entries until Word updates the field.
    """
    def _field_char(kind):
        # Build one <w:fldChar> marker of the requested fldCharType.
        marker = OxmlElement('w:fldChar')
        marker.set(qn('w:fldCharType'), kind)
        return marker

    # Field instruction; xml:space="preserve" keeps the padding spaces intact.
    instruction = OxmlElement('w:instrText')
    instruction.set(qn('xml:space'), 'preserve')
    instruction.text = ' TOC \\o "1-3" \\h \\z \\u '

    toc_run = doc.add_paragraph().add_run()
    toc_run._r.extend([
        _field_char('begin'),
        instruction,
        _field_char('separate'),
        _field_char('end'),
    ])
 # ---------------------------------------------------------------------------
 # Inline markdown → Word runs
 # ---------------------------------------------------------------------------
 _INLINE_RE = re.compile(
    r'(\*\*\*[^*\n]+\*\*\*'   # bold-italic ***…***
    r'|\*\*[^*\n]+\*\*'        # bold       **…**
    r'|\*[^*\n]+\*'            # italic      *…*
    r'|`[^`\n]+`)'             # inline code `…`
 )
 def _add_inline(paragraph, text):
    """Append *text* to *paragraph*, converting inline Markdown to styled runs.

    Spans matched by ``_INLINE_RE`` become bold-italic, bold, italic or
    monospace runs with their delimiters removed; everything else is added
    verbatim as a plain run.

    Args:
        paragraph: Object exposing ``add_run(text)`` that returns a run with
            ``bold``, ``italic`` and ``font`` attributes.
        text: A single line of Markdown inline content.
    """
    # re.split() with one capturing group alternates plain text (even indices)
    # with the captured markup spans (odd indices). Classifying by index, not
    # by a segment's first/last characters, keeps unmatched delimiters such as
    # a lone "*" or "`" as literal text instead of turning them into empty
    # styled runs.
    for index, part in enumerate(_INLINE_RE.split(text)):
        if not part:
            continue
        if index % 2 == 0:
            paragraph.add_run(part)
        elif part.startswith('***'):
            r = paragraph.add_run(part[3:-3])
            r.bold = r.italic = True
        elif part.startswith('**'):
            r = paragraph.add_run(part[2:-2])
            r.bold = True
        elif part.startswith('*'):
            r = paragraph.add_run(part[1:-1])
            r.italic = True
        else:
            # Only the backtick alternative of the pattern remains.
            r = paragraph.add_run(part[1:-1])
            r.font.name = 'Courier New'
            r.font.size = Pt(10)
 # ---------------------------------------------------------------------------
 # Markdown block parser → Word paragraphs
 # ---------------------------------------------------------------------------
 def _md_to_doc(doc, content):
    """Render Markdown *content* into *doc*, one Word paragraph per source line.

    Handled constructs, checked in this order: fenced code blocks, ATX
    headings, blank lines (skipped), horizontal rules, ``> `` blockquotes,
    bullet items, numbered items, and plain paragraphs. Inline markup is
    delegated to ``_add_inline``; heading text is inserted verbatim.

    A fence that is still open when the input ends is emitted as a code block
    rather than being discarded.

    Args:
        doc: Document object exposing ``add_paragraph`` and ``add_heading``.
        content: Full Markdown text of one file.
    """
    def _emit_code_block(code_lines):
        # The whole block becomes a single paragraph; only its first run (if
        # any was created) is switched to the monospace font.
        code_para = doc.add_paragraph('\n'.join(code_lines))
        if code_para.runs:
            code_para.runs[0].font.name = 'Courier New'
            code_para.runs[0].font.size = Pt(9)
        code_para.paragraph_format.space_after = Pt(6)

    in_fence = False
    fence_buf = []
    for line in content.splitlines():
        # Fenced code block: a ``` line toggles the fence state.
        if line.startswith('```'):
            if in_fence:
                _emit_code_block(fence_buf)
            else:
                fence_buf = []
            in_fence = not in_fence
            continue
        if in_fence:
            fence_buf.append(line)
            continue
        # ATX headings (the pattern limits the marker to 1-6 '#').
        heading = re.match(r'^(#{1,6})\s+(.*)', line)
        if heading:
            doc.add_heading(heading.group(2).strip(), level=len(heading.group(1)))
            continue
        # Blank line
        if not line.strip():
            continue
        # Horizontal rule
        if re.match(r'^[*\-_]{3,}\s*$', line):
            rule = doc.add_paragraph('─' * 60)
            rule.alignment = WD_ALIGN_PARAGRAPH.CENTER
            continue
        # Blockquote
        if line.startswith('> '):
            quote = doc.add_paragraph(style='Quote')
            _add_inline(quote, line[2:])
            continue
        # Unordered list
        bullet = re.match(r'^\s*[-*+]\s+(.*)', line)
        if bullet:
            item = doc.add_paragraph(style='List Bullet')
            _add_inline(item, bullet.group(1))
            continue
        # Ordered list
        numbered = re.match(r'^\s*\d+\.\s+(.*)', line)
        if numbered:
            item = doc.add_paragraph(style='List Number')
            _add_inline(item, numbered.group(1))
            continue
        # Default paragraph
        _add_inline(doc.add_paragraph(), line)
    # Unterminated fence: keep the buffered lines instead of dropping them.
    if in_fence and fence_buf:
        _emit_code_block(fence_buf)
 # ---------------------------------------------------------------------------
 # Document assembly
 # ---------------------------------------------------------------------------
 def build_docx(md_dir: Path, output: Path, title: str):
    """Assemble the Markdown files in *md_dir* into one .docx saved at *output*.

    Layout: title page, table-of-contents page, changelog section, then one
    level-1 section per remaining ``*.md`` file, ordered by lower-cased file
    name. Every file section starts on a new page; no page break follows the
    last section, so the document does not end with an empty page.

    Args:
        md_dir: Directory whose ``*.md`` files are collected (not recursive).
        output: Destination path passed to ``Document.save``.
        title: Text of the title-page heading.
    """
    changelog_names = ('CHANGELOG.md', 'changelog.md', 'CHANGES.md')
    # 'utf-8-sig' drops a leading byte-order mark, which would otherwise keep
    # a first-line '#' heading from being recognised.
    encoding = 'utf-8-sig'

    doc = Document()
    # Title page
    heading = doc.add_heading(title, level=0)
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    date_para = doc.add_paragraph(f'Generated: {datetime.now().strftime("%Y-%m-%d")}')
    date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_page_break()
    # Table of Contents
    doc.add_heading('Table of Contents', level=1)
    _insert_toc_field(doc)
    doc.add_paragraph('(Open in Word and press F9 or Ctrl+A → F9 to update this field.)')
    doc.add_page_break()
    # Changelog — always first content section; the first existing candidate wins.
    doc.add_heading('Changelog', level=1)
    changelog = next(
        (md_dir / name for name in changelog_names if (md_dir / name).exists()),
        None
    )
    if changelog:
        _md_to_doc(doc, changelog.read_text(encoding=encoding))
    else:
        doc.add_paragraph('No changelog file found.')
    # Remaining .md files sorted alphabetically (case-insensitive)
    md_files = sorted(
        (f for f in md_dir.glob('*.md') if f.name not in changelog_names),
        key=lambda f: f.name.lower()
    )
    for md_file in md_files:
        # Break *before* each section so nothing trails the final one.
        doc.add_page_break()
        section_title = md_file.stem.replace('-', ' ').replace('_', ' ').title()
        doc.add_heading(section_title, level=1)
        _md_to_doc(doc, md_file.read_text(encoding=encoding))
    doc.save(output)
    print(f'Saved → {output}')
 # ---------------------------------------------------------------------------
 # CLI
 # ---------------------------------------------------------------------------
 def main():
    """Command-line entry point: parse the options and build the document.

    Exits with an error message when the --dir value is not a directory.
    """
    cli = argparse.ArgumentParser(
        description='Convert a folder of .md files into a single Word .docx'
    )
    option_specs = (
        (('--dir', '-d'), '.', 'Directory containing .md files (default: .)'),
        (('--output', '-o'), 'documentation.docx', 'Output .docx path'),
        (('--title', '-t'), 'Project Documentation', 'Document title'),
    )
    for flags, default, help_text in option_specs:
        cli.add_argument(*flags, default=default, help=help_text)
    opts = cli.parse_args()

    source_dir = Path(opts.dir).resolve()
    if source_dir.is_dir():
        build_docx(source_dir, Path(opts.output), opts.title)
    else:
        sys.exit(f'Error: {source_dir} is not a directory')
 if __name__ == '__main__':
    main()
--- a/raw_to_md.py
+++ b/raw_to_md.py
@ -0,0 +1,112 @@
 #!/usr/bin/env python3
 """
 raw_to_md.py — Convert unstructured raw text into a structured, chronological Markdown file
 using Claude AI.
 Reads from --text, --input file, or stdin. Writes to --output or stdout.
 Usage:
    python raw_to_md.py --text "raw notes here" --output structured.md
    python raw_to_md.py --input dump.txt --output structured.md
    cat dump.txt | python raw_to_md.py --output structured.md
 Requires:
    ANTHROPIC_API_KEY environment variable
 """
 import argparse
 import os
 import sys
 import anthropic
 # System prompt passed with every conversion request. It tells the model to
 # produce a Markdown document with a leading summary, chronological (or
 # topical) sections and fixed formatting rules, to add no information that is
 # absent from the input, and to output nothing but the Markdown.
 SYSTEM_PROMPT = """\
 You are a technical documentation writer. Your task is to convert raw, unstructured text \
 into a clean, well-structured Markdown document.
 Guidelines:
 - Add a ## Summary section at the very top with a 2-3 sentence overview.
 - Organise the rest chronologically when dates, versions, or sequence cues are present; \
  otherwise group by logical topic.
 - Use ## for top-level sections, ### for subsections.
 - Use bullet lists for discrete items or steps.
 - Use **bold** for key terms, dates, names, and version numbers.
 - Use `code spans` for commands, file names, and technical strings.
 - Use fenced code blocks (``` … ```) for multi-line code or config samples.
 - Do NOT invent or embellish information that is not in the input.
 - Output ONLY the Markdown — no preamble, explanation, or trailing commentary.\
 """
 def convert(raw_text: str, model: str) -> str:
    """Ask Claude to restructure *raw_text* and return the resulting Markdown.

    Args:
        raw_text: Unstructured source text, appended to the user instruction.
        model: Claude model ID passed to the Messages API.

    Returns:
        The concatenated text of every text block in the reply.

    Raises:
        RuntimeError: If the reply contains no text.
    """
    client = anthropic.Anthropic()
    response = client.messages.create(
        model=model,
        max_tokens=8192,
        system=SYSTEM_PROMPT,
        messages=[
            {
                'role': 'user',
                'content': (
                    'Convert the following raw text into a structured, '
                    'chronological Markdown document:\n\n'
                    + raw_text
                ),
            }
        ],
    )
    # Collect all text blocks instead of assuming exactly one at index 0,
    # which raised IndexError on an empty reply and ignored later blocks.
    markdown = ''.join(
        block.text
        for block in response.content
        if getattr(block, 'type', None) == 'text'
    )
    if not markdown:
        raise RuntimeError(
            f'{model} returned no text content '
            f'(stop_reason={response.stop_reason!r}).'
        )
    # A reply cut off at the token limit is still returned, but not silently.
    if response.stop_reason == 'max_tokens':
        print(
            'Warning: output reached max_tokens and may be truncated.',
            file=sys.stderr,
        )
    return markdown
 def main():
    """Command-line entry point: read raw text, convert it, write Markdown.

    Input comes from --text, --input, or stdin (in that order of precedence);
    output goes to --output or stdout. Progress and error messages are written
    to stderr. Exits with an error message when the API key is missing, the
    input is empty or unreadable, the API request fails, or the output file
    cannot be written.
    """
    parser = argparse.ArgumentParser(
        description='Convert raw text to structured Markdown via Claude AI'
    )
    src = parser.add_mutually_exclusive_group()
    src.add_argument('--text', '-t', help='Raw text passed directly as a string')
    src.add_argument('--input', '-i', help='Path to a plain-text input file')
    parser.add_argument('--output', '-o', help='Output .md file (default: stdout)')
    parser.add_argument(
        '--model', '-m',
        default='claude-sonnet-4-6',
        help='Claude model ID (default: claude-sonnet-4-6)',
    )
    args = parser.parse_args()
    if not os.environ.get('ANTHROPIC_API_KEY'):
        sys.exit('Error: ANTHROPIC_API_KEY environment variable is not set.')
    # Resolve input. Compare against None so an explicitly supplied empty
    # --text / --input is not mistaken for "option absent" and silently
    # replaced by a blocking read from stdin.
    if args.text is not None:
        raw = args.text
    elif args.input is not None:
        path = args.input
        try:
            with open(path, encoding='utf-8') as f:
                raw = f.read()
        except OSError as e:
            sys.exit(f'Error reading {path}: {e}')
    else:
        if sys.stdin.isatty():
            print('Paste raw text then press Ctrl+D:', file=sys.stderr)
        raw = sys.stdin.read()
    if not raw.strip():
        sys.exit('Error: input is empty.')
    print(f'Sending to {args.model}…', file=sys.stderr)
    # Report API failures the same way as file errors instead of a traceback.
    try:
        result = convert(raw, args.model)
    except anthropic.APIError as e:
        sys.exit(f'Error: request to {args.model} failed: {e}')
    if args.output:
        try:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(result)
            print(f'Saved → {args.output}', file=sys.stderr)
        except OSError as e:
            sys.exit(f'Error writing {args.output}: {e}')
    else:
        print(result)
 if __name__ == '__main__':
    main()
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
 python-docx>=1.1.0
 anthropic>=0.30.0