Initial commit: md_to_docx and raw_to_md scripts

- md_to_docx.py: converts a directory of .md files into a Word .docx with a title page, auto-updating TOC field, changelog section, and one section per remaining .md file
- raw_to_md.py: sends raw text to Claude AI (claude-sonnet-4-6) and returns a structured, chronological Markdown document

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-01 13:52:03 +02:00 · 2026-06-01 13:52:03 +02:00 · 0eac070dac
commit 0eac070dac
4 changed files with 352 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,9 @@
+__pycache__/
+*.py[cod]
+*.egg-info/
+dist/
+build/
+.env
+*.docx
+venv/
+.venv/
--- a/md_to_docx.py
+++ b/md_to_docx.py
@ -0,0 +1,229 @@
+#!/usr/bin/env python3
+"""
+md_to_docx.py — Convert a directory of Markdown files into a single Word document.
+
+Structure: title page → auto-updating TOC → changelog → one section per .md file.
+
+Usage:
+    python md_to_docx.py [--dir DIR] [--output OUTPUT] [--title TITLE]
+"""
+
+import argparse
+import re
+import sys
+from datetime import datetime
+from pathlib import Path
+
+from docx import Document
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn
+from docx.shared import Pt
+
+
+# ---------------------------------------------------------------------------
+# Word XML helpers
+# ---------------------------------------------------------------------------
+
+def _insert_toc_field(doc):
+    """Insert a Word TOC field (Word updates it on first open)."""
+    para = doc.add_paragraph()
+    run = para.add_run()
+    begin = OxmlElement('w:fldChar')
+    begin.set(qn('w:fldCharType'), 'begin')
+    instr = OxmlElement('w:instrText')
+    instr.set(qn('xml:space'), 'preserve')
+    instr.text = ' TOC \\o "1-3" \\h \\z \\u '
+    sep = OxmlElement('w:fldChar')
+    sep.set(qn('w:fldCharType'), 'separate')
+    end = OxmlElement('w:fldChar')
+    end.set(qn('w:fldCharType'), 'end')
+    run._r.extend([begin, instr, sep, end])
+
+
+# ---------------------------------------------------------------------------
+# Inline markdown → Word runs
+# ---------------------------------------------------------------------------
+
+# Matches a single inline Markdown span. The whole alternation sits in one
+# capture group, so ``_INLINE_RE.split(text)`` returns the matched spans in
+# between the surrounding plain-text pieces rather than discarding them.
+# A span cannot contain ``*`` (or a backtick, for code) and cannot cross a
+# newline, so nested emphasis such as ``**a *b* c**`` is not recognised as a
+# single span.
+_INLINE_RE = re.compile(
+    r'(\*\*\*[^*\n]+\*\*\*'   # bold-italic ***…***
+    r'|\*\*[^*\n]+\*\*'        # bold       **…**
+    r'|\*[^*\n]+\*'            # italic      *…*
+    r'|`[^`\n]+`)'             # inline code `…`
+)
+
+
+def _add_inline(paragraph, text):
+    for part in _INLINE_RE.split(text):
+        if not part:
+            continue
+        if part.startswith('***') and part.endswith('***'):
+            r = paragraph.add_run(part[3:-3])
+            r.bold = r.italic = True
+        elif part.startswith('**') and part.endswith('**'):
+            r = paragraph.add_run(part[2:-2])
+            r.bold = True
+        elif part.startswith('*') and part.endswith('*'):
+            r = paragraph.add_run(part[1:-1])
+            r.italic = True
+        elif part.startswith('`') and part.endswith('`'):
+            r = paragraph.add_run(part[1:-1])
+            r.font.name = 'Courier New'
+            r.font.size = Pt(10)
+        else:
+            paragraph.add_run(part)
+
+
+# ---------------------------------------------------------------------------
+# Markdown block parser → Word paragraphs
+# ---------------------------------------------------------------------------
+
+def _md_to_doc(doc, content):
+    lines = content.splitlines()
+    i = 0
+    in_fence = False
+    fence_buf = []
+
+    while i < len(lines):
+        line = lines[i]
+
+        # Fenced code block
+        if line.startswith('```'):
+            if not in_fence:
+                in_fence = True
+                fence_buf = []
+            else:
+                in_fence = False
+                p = doc.add_paragraph('\n'.join(fence_buf))
+                if p.runs:
+                    p.runs[0].font.name = 'Courier New'
+                    p.runs[0].font.size = Pt(9)
+                p.paragraph_format.space_after = Pt(6)
+            i += 1
+            continue
+
+        if in_fence:
+            fence_buf.append(line)
+            i += 1
+            continue
+
+        # ATX headings
+        m = re.match(r'^(#{1,6})\s+(.*)', line)
+        if m:
+            doc.add_heading(m.group(2).strip(), level=min(len(m.group(1)), 6))
+            i += 1
+            continue
+
+        # Blank line
+        if not line.strip():
+            i += 1
+            continue
+
+        # Horizontal rule
+        if re.match(r'^[*\-_]{3,}\s*$', line):
+            p = doc.add_paragraph('─' * 60)
+            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+            i += 1
+            continue
+
+        # Blockquote
+        if line.startswith('> '):
+            p = doc.add_paragraph(style='Quote')
+            _add_inline(p, line[2:])
+            i += 1
+            continue
+
+        # Unordered list
+        m = re.match(r'^\s*[-*+]\s+(.*)', line)
+        if m:
+            p = doc.add_paragraph(style='List Bullet')
+            _add_inline(p, m.group(1))
+            i += 1
+            continue
+
+        # Ordered list
+        m = re.match(r'^\s*\d+\.\s+(.*)', line)
+        if m:
+            p = doc.add_paragraph(style='List Number')
+            _add_inline(p, m.group(1))
+            i += 1
+            continue
+
+        # Default paragraph
+        p = doc.add_paragraph()
+        _add_inline(p, line)
+        i += 1
+
+
+# ---------------------------------------------------------------------------
+# Document assembly
+# ---------------------------------------------------------------------------
+
+def build_docx(md_dir: Path, output: Path, title: str):
+    doc = Document()
+
+    # Title page
+    heading = doc.add_heading(title, level=0)
+    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    date_para = doc.add_paragraph(f'Generated: {datetime.now().strftime("%Y-%m-%d")}')
+    date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    doc.add_page_break()
+
+    # Table of Contents
+    doc.add_heading('Table of Contents', level=1)
+    _insert_toc_field(doc)
+    doc.add_paragraph('(Open in Word and press F9 or Ctrl+A → F9 to update this field.)')
+    doc.add_page_break()
+
+    # Changelog — always first content section
+    doc.add_heading('Changelog', level=1)
+    changelog = next(
+        (md_dir / name for name in ('CHANGELOG.md', 'changelog.md', 'CHANGES.md')
+         if (md_dir / name).exists()),
+        None
+    )
+    if changelog:
+        _md_to_doc(doc, changelog.read_text(encoding='utf-8'))
+    else:
+        doc.add_paragraph('No changelog file found.')
+    doc.add_page_break()
+
+    # Remaining .md files sorted alphabetically
+    skip = {'CHANGELOG.md', 'changelog.md', 'CHANGES.md'}
+    md_files = sorted(
+        [f for f in md_dir.glob('*.md') if f.name not in skip],
+        key=lambda f: f.name.lower()
+    )
+
+    for md_file in md_files:
+        section_title = md_file.stem.replace('-', ' ').replace('_', ' ').title()
+        doc.add_heading(section_title, level=1)
+        _md_to_doc(doc, md_file.read_text(encoding='utf-8'))
+        doc.add_page_break()
+
+    doc.save(output)
+    print(f'Saved → {output}')
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Convert a folder of .md files into a single Word .docx'
+    )
+    parser.add_argument('--dir', '-d', default='.', help='Directory containing .md files (default: .)')
+    parser.add_argument('--output', '-o', default='documentation.docx', help='Output .docx path')
+    parser.add_argument('--title', '-t', default='Project Documentation', help='Document title')
+    args = parser.parse_args()
+
+    md_dir = Path(args.dir).resolve()
+    if not md_dir.is_dir():
+        sys.exit(f'Error: {md_dir} is not a directory')
+
+    build_docx(md_dir, Path(args.output), args.title)
+
+
+if __name__ == '__main__':
+    main()
--- a/raw_to_md.py
+++ b/raw_to_md.py
@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+"""
+raw_to_md.py — Convert unstructured raw text into a structured, chronological Markdown file
+using Claude AI.
+
+Reads from --text, --input file, or stdin. Writes to --output or stdout.
+
+Usage:
+    python raw_to_md.py --text "raw notes here" --output structured.md
+    python raw_to_md.py --input dump.txt --output structured.md
+    cat dump.txt | python raw_to_md.py --output structured.md
+
+Requires:
+    ANTHROPIC_API_KEY environment variable
+"""
+
+import argparse
+import os
+import sys
+
+import anthropic
+
+# System prompt passed as ``system=`` on the request made by convert().
+# The backslashes at line ends are continuations inside the literal: they join
+# the wrapped source lines into one prompt line, and the first and last ones
+# keep the prompt from starting or ending with a newline.
+SYSTEM_PROMPT = """\
+You are a technical documentation writer. Your task is to convert raw, unstructured text \
+into a clean, well-structured Markdown document.
+
+Guidelines:
+- Add a ## Summary section at the very top with a 2-3 sentence overview.
+- Organise the rest chronologically when dates, versions, or sequence cues are present; \
+  otherwise group by logical topic.
+- Use ## for top-level sections, ### for subsections.
+- Use bullet lists for discrete items or steps.
+- Use **bold** for key terms, dates, names, and version numbers.
+- Use `code spans` for commands, file names, and technical strings.
+- Use fenced code blocks (``` … ```) for multi-line code or config samples.
+- Do NOT invent or embellish information that is not in the input.
+- Output ONLY the Markdown — no preamble, explanation, or trailing commentary.\
+"""
+
+
+def convert(raw_text: str, model: str) -> str:
+    client = anthropic.Anthropic()
+    response = client.messages.create(
+        model=model,
+        max_tokens=8192,
+        system=SYSTEM_PROMPT,
+        messages=[
+            {
+                'role': 'user',
+                'content': (
+                    'Convert the following raw text into a structured, '
+                    'chronological Markdown document:\n\n'
+                    + raw_text
+                ),
+            }
+        ],
+    )
+    return response.content[0].text
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Convert raw text to structured Markdown via Claude AI'
+    )
+    src = parser.add_mutually_exclusive_group()
+    src.add_argument('--text', '-t', help='Raw text passed directly as a string')
+    src.add_argument('--input', '-i', help='Path to a plain-text input file')
+    parser.add_argument('--output', '-o', help='Output .md file (default: stdout)')
+    parser.add_argument(
+        '--model', '-m',
+        default='claude-sonnet-4-6',
+        help='Claude model ID (default: claude-sonnet-4-6)',
+    )
+    args = parser.parse_args()
+
+    if not os.environ.get('ANTHROPIC_API_KEY'):
+        sys.exit('Error: ANTHROPIC_API_KEY environment variable is not set.')
+
+    # Resolve input
+    if args.text:
+        raw = args.text
+    elif args.input:
+        path = args.input
+        try:
+            with open(path, encoding='utf-8') as f:
+                raw = f.read()
+        except OSError as e:
+            sys.exit(f'Error reading {path}: {e}')
+    else:
+        if sys.stdin.isatty():
+            print('Paste raw text then press Ctrl+D:', file=sys.stderr)
+        raw = sys.stdin.read()
+
+    if not raw.strip():
+        sys.exit('Error: input is empty.')
+
+    print(f'Sending to {args.model}…', file=sys.stderr)
+    result = convert(raw, args.model)
+
+    if args.output:
+        try:
+            with open(args.output, 'w', encoding='utf-8') as f:
+                f.write(result)
+            print(f'Saved → {args.output}', file=sys.stderr)
+        except OSError as e:
+            sys.exit(f'Error writing {args.output}: {e}')
+    else:
+        print(result)
+
+
+if __name__ == '__main__':
+    main()
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,2 @@
+python-docx>=1.1.0
+anthropic>=0.30.0