From 0eac070dace18eb04b55858f1b9b4548acd64218 Mon Sep 17 00:00:00 2001
From: The_miro
Date: Mon, 1 Jun 2026 13:52:03 +0200
Subject: [PATCH] Initial commit: md_to_docx and raw_to_md scripts

- md_to_docx.py: converts a directory of .md files into a Word .docx with
  a title page, auto-updating TOC field, changelog section, and one
  section per remaining .md file
- raw_to_md.py: sends raw text to Claude AI (claude-sonnet-4-6) and
  returns a structured, chronological Markdown document

Co-Authored-By: Claude Sonnet 4.6
---
 .gitignore       |   9 ++
 md_to_docx.py    | 258 +++++++++++++++++++++++++++++++++++++++++++++++
 raw_to_md.py     | 145 ++++++++++++++++++++++++++
 requirements.txt |   2 +
 4 files changed, 414 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 md_to_docx.py
 create mode 100644 raw_to_md.py
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4605c96
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,9 @@
+__pycache__/
+*.py[cod]
+*.egg-info/
+dist/
+build/
+.env
+*.docx
+venv/
+.venv/
diff --git a/md_to_docx.py b/md_to_docx.py
new file mode 100644
--- /dev/null
+++ b/md_to_docx.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""
+md_to_docx.py — Convert a directory of Markdown files into a single Word document.
+
+Structure: title page → auto-updating TOC → changelog → one section per .md file.
+
+Usage:
+    python md_to_docx.py [--dir DIR] [--output OUTPUT] [--title TITLE]
+"""
+
+import argparse
+import re
+import sys
+from datetime import datetime
+from pathlib import Path
+
+from docx import Document
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn
+from docx.shared import Pt
+
+# Lower-cased file names treated as the changelog, in priority order.
+_CHANGELOG_NAMES = ('changelog.md', 'changes.md')
+
+
+# ---------------------------------------------------------------------------
+# Word XML helpers
+# ---------------------------------------------------------------------------
+
+def _insert_toc_field(doc):
+    """Append a paragraph holding a Word TOC field to *doc*.
+
+    The begin marker is flagged dirty, which asks Word to refresh the field
+    when the document is opened.
+    """
+    para = doc.add_paragraph()
+    run = para.add_run()
+    begin = OxmlElement('w:fldChar')
+    begin.set(qn('w:fldCharType'), 'begin')
+    begin.set(qn('w:dirty'), 'true')
+    instr = OxmlElement('w:instrText')
+    instr.set(qn('xml:space'), 'preserve')
+    instr.text = ' TOC \\o "1-3" \\h \\z \\u '
+    sep = OxmlElement('w:fldChar')
+    sep.set(qn('w:fldCharType'), 'separate')
+    end = OxmlElement('w:fldChar')
+    end.set(qn('w:fldCharType'), 'end')
+    run._r.extend([begin, instr, sep, end])
+
+
+# ---------------------------------------------------------------------------
+# Inline markdown → Word runs
+# ---------------------------------------------------------------------------
+
+_INLINE_RE = re.compile(
+    r'(\*\*\*[^*\n]+\*\*\*'   # bold-italic ***…***
+    r'|\*\*[^*\n]+\*\*'       # bold **…**
+    r'|\*[^*\n]+\*'           # italic *…*
+    r'|`[^`\n]+`)'            # inline code `…`
+)
+
+
+def _add_inline(paragraph, text):
+    """Append *text* to *paragraph*, turning inline Markdown into styled runs.
+
+    Handles ***bold-italic***, **bold**, *italic* and `code` spans; any other
+    text is added verbatim.
+    """
+    # re.split() with one capturing group alternates literal text (even
+    # indexes) and matched spans (odd indexes). Deciding by position rather
+    # than by the first and last character keeps a literal such as the lone
+    # "*" in "`a`*`b`" from being swallowed as an empty italic span.
+    for index, part in enumerate(_INLINE_RE.split(text)):
+        if not part:
+            continue
+        if index % 2 == 0:
+            paragraph.add_run(part)
+        elif part.startswith('***'):
+            r = paragraph.add_run(part[3:-3])
+            r.bold = r.italic = True
+        elif part.startswith('**'):
+            r = paragraph.add_run(part[2:-2])
+            r.bold = True
+        elif part.startswith('*'):
+            r = paragraph.add_run(part[1:-1])
+            r.italic = True
+        else:
+            r = paragraph.add_run(part[1:-1])
+            r.font.name = 'Courier New'
+            r.font.size = Pt(10)
+
+
+# ---------------------------------------------------------------------------
+# Markdown block parser → Word paragraphs
+# ---------------------------------------------------------------------------
+
+def _add_code_block(doc, code_lines):
+    """Append *code_lines* to *doc* as a single Courier New paragraph."""
+    p = doc.add_paragraph('\n'.join(code_lines))
+    if p.runs:
+        p.runs[0].font.name = 'Courier New'
+        p.runs[0].font.size = Pt(9)
+    p.paragraph_format.space_after = Pt(6)
+
+
+def _md_to_doc(doc, content):
+    """Render the Markdown text *content* into *doc*, line by line.
+
+    Recognised blocks: fenced code, ATX headings, horizontal rules,
+    blockquotes, bullet and numbered list items, and plain paragraphs.
+    Blank lines are skipped.
+    """
+    in_fence = False
+    fence_buf = []
+
+    for line in content.splitlines():
+        # Fenced code block: the ``` marker lines themselves are dropped.
+        if line.startswith('```'):
+            if in_fence:
+                _add_code_block(doc, fence_buf)
+            in_fence = not in_fence
+            fence_buf = []
+            continue
+
+        if in_fence:
+            fence_buf.append(line)
+            continue
+
+        # ATX headings
+        m = re.match(r'^(#{1,6})\s+(.*)', line)
+        if m:
+            doc.add_heading(m.group(2).strip(), level=len(m.group(1)))
+            continue
+
+        # Blank line
+        if not line.strip():
+            continue
+
+        # Horizontal rule
+        if re.match(r'^[*\-_]{3,}\s*$', line):
+            p = doc.add_paragraph('─' * 60)
+            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+            continue
+
+        # Blockquote (the space after ">" is optional)
+        m = re.match(r'^>\s?(.*)', line)
+        if m:
+            p = doc.add_paragraph(style='Quote')
+            _add_inline(p, m.group(1))
+            continue
+
+        # Unordered list
+        m = re.match(r'^\s*[-*+]\s+(.*)', line)
+        if m:
+            p = doc.add_paragraph(style='List Bullet')
+            _add_inline(p, m.group(1))
+            continue
+
+        # Ordered list
+        m = re.match(r'^\s*\d+\.\s+(.*)', line)
+        if m:
+            p = doc.add_paragraph(style='List Number')
+            _add_inline(p, m.group(1))
+            continue
+
+        # Default paragraph
+        p = doc.add_paragraph()
+        _add_inline(p, line)
+
+    # A fence still open at end of input would otherwise lose its contents.
+    if in_fence and fence_buf:
+        _add_code_block(doc, fence_buf)
+
+
+# ---------------------------------------------------------------------------
+# Document assembly
+# ---------------------------------------------------------------------------
+
+def build_docx(md_dir: Path, output: Path, title: str):
+    """Assemble the Word document and save it to *output*.
+
+    Args:
+        md_dir: Directory whose ``*.md`` files are converted.
+        output: Destination path of the ``.docx`` file.
+        title: Text shown on the title page.
+    """
+    doc = Document()
+
+    # Title page
+    heading = doc.add_heading(title, level=0)
+    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    date_para = doc.add_paragraph(f'Generated: {datetime.now().strftime("%Y-%m-%d")}')
+    date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    doc.add_page_break()
+
+    # Table of Contents
+    doc.add_heading('Table of Contents', level=1)
+    _insert_toc_field(doc)
+    doc.add_paragraph('(Open in Word and press F9 or Ctrl+A → F9 to update this field.)')
+    doc.add_page_break()
+
+    # File names are compared case-insensitively so that e.g. "Changelog.md"
+    # on a case-insensitive filesystem is not emitted twice (once as the
+    # changelog and again as an ordinary section).
+    md_files = sorted(
+        (f for f in md_dir.glob('*.md') if f.is_file()),
+        key=lambda f: f.name.lower()
+    )
+    changelogs = sorted(
+        (f for f in md_files if f.name.lower() in _CHANGELOG_NAMES),
+        key=lambda f: (_CHANGELOG_NAMES.index(f.name.lower()), f.name)
+    )
+    sections = [f for f in md_files if f.name.lower() not in _CHANGELOG_NAMES]
+
+    # Changelog — always first content section
+    doc.add_heading('Changelog', level=1)
+    if changelogs:
+        _md_to_doc(doc, changelogs[0].read_text(encoding='utf-8'))
+    else:
+        doc.add_paragraph('No changelog file found.')
+
+    # Remaining .md files sorted alphabetically. The page break goes before
+    # each section rather than after it, so the document does not end on an
+    # empty page.
+    for md_file in sections:
+        doc.add_page_break()
+        section_title = md_file.stem.replace('-', ' ').replace('_', ' ').title()
+        doc.add_heading(section_title, level=1)
+        _md_to_doc(doc, md_file.read_text(encoding='utf-8'))
+
+    doc.save(str(output))
+    print(f'Saved → {output}')
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def main():
+    """Parse the command line and build the document."""
+    parser = argparse.ArgumentParser(
+        description='Convert a folder of .md files into a single Word .docx'
+    )
+    parser.add_argument('--dir', '-d', default='.', help='Directory containing .md files (default: .)')
+    parser.add_argument('--output', '-o', default='documentation.docx', help='Output .docx path')
+    parser.add_argument('--title', '-t', default='Project Documentation', help='Document title')
+    args = parser.parse_args()
+
+    md_dir = Path(args.dir).resolve()
+    if not md_dir.is_dir():
+        sys.exit(f'Error: {md_dir} is not a directory')
+
+    build_docx(md_dir, Path(args.output), args.title)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/raw_to_md.py b/raw_to_md.py
new file mode 100644
--- /dev/null
+++ b/raw_to_md.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+"""
+raw_to_md.py — Convert unstructured raw text into a structured, chronological Markdown file
+using Claude AI.
+
+Reads from --text, --input file, or stdin. Writes to --output or stdout.
+
+Usage:
+    python raw_to_md.py --text "raw notes here" --output structured.md
+    python raw_to_md.py --input dump.txt --output structured.md
+    cat dump.txt | python raw_to_md.py --output structured.md
+
+Requires:
+    ANTHROPIC_API_KEY environment variable
+"""
+
+import argparse
+import os
+import sys
+
+import anthropic
+
+SYSTEM_PROMPT = """\
+You are a technical documentation writer. Your task is to convert raw, unstructured text \
+into a clean, well-structured Markdown document.
+
+Guidelines:
+- Add a ## Summary section at the very top with a 2-3 sentence overview.
+- Organise the rest chronologically when dates, versions, or sequence cues are present; \
+  otherwise group by logical topic.
+- Use ## for top-level sections, ### for subsections.
+- Use bullet lists for discrete items or steps.
+- Use **bold** for key terms, dates, names, and version numbers.
+- Use `code spans` for commands, file names, and technical strings.
+- Use fenced code blocks (``` … ```) for multi-line code or config samples.
+- Do NOT invent or embellish information that is not in the input.
+- Output ONLY the Markdown — no preamble, explanation, or trailing commentary.\
+"""
+
+
+def convert(raw_text: str, model: str, max_tokens: int = 8192) -> str:
+    """Ask Claude to restructure *raw_text* and return the Markdown reply.
+
+    Args:
+        raw_text: Unstructured input text.
+        model: Claude model ID passed to the Messages API.
+        max_tokens: Upper bound on the length of the generated reply.
+
+    Returns:
+        The text blocks of the reply, concatenated.
+
+    Raises:
+        RuntimeError: If the reply contains no text.
+    """
+    client = anthropic.Anthropic()
+    response = client.messages.create(
+        model=model,
+        max_tokens=max_tokens,
+        system=SYSTEM_PROMPT,
+        messages=[
+            {
+                'role': 'user',
+                'content': (
+                    'Convert the following raw text into a structured, '
+                    'chronological Markdown document:\n\n'
+                    + raw_text
+                ),
+            }
+        ],
+    )
+    # Do not assume the reply is exactly one text block: join every text
+    # block and fail with a clear message when there is none.
+    text = ''.join(
+        block.text for block in response.content
+        if getattr(block, 'type', None) == 'text'
+    )
+    if not text:
+        raise RuntimeError(
+            f'{model} returned no text (stop_reason={response.stop_reason})'
+        )
+    if response.stop_reason == 'max_tokens':
+        print(
+            f'Warning: reply hit the {max_tokens}-token limit and is incomplete.',
+            file=sys.stderr,
+        )
+    return text
+
+
+def main():
+    """Read raw text, convert it via Claude and write the Markdown result."""
+    parser = argparse.ArgumentParser(
+        description='Convert raw text to structured Markdown via Claude AI'
+    )
+    src = parser.add_mutually_exclusive_group()
+    src.add_argument('--text', '-t', help='Raw text passed directly as a string')
+    src.add_argument('--input', '-i', help='Path to a plain-text input file')
+    parser.add_argument('--output', '-o', help='Output .md file (default: stdout)')
+    parser.add_argument(
+        '--model', '-m',
+        default='claude-sonnet-4-6',
+        help='Claude model ID (default: claude-sonnet-4-6)',
+    )
+    args = parser.parse_args()
+
+    if not os.environ.get('ANTHROPIC_API_KEY'):
+        sys.exit('Error: ANTHROPIC_API_KEY environment variable is not set.')
+
+    # Resolve input. Compare with None so that an explicit --text "" is
+    # reported as empty input instead of silently falling back to stdin.
+    if args.text is not None:
+        raw = args.text
+    elif args.input is not None:
+        path = args.input
+        try:
+            with open(path, encoding='utf-8') as f:
+                raw = f.read()
+        except OSError as e:
+            sys.exit(f'Error reading {path}: {e}')
+    else:
+        if sys.stdin.isatty():
+            print('Paste raw text then press Ctrl+D:', file=sys.stderr)
+        raw = sys.stdin.read()
+
+    if not raw.strip():
+        sys.exit('Error: input is empty.')
+
+    print(f'Sending to {args.model}…', file=sys.stderr)
+    try:
+        result = convert(raw, args.model)
+    except (anthropic.APIError, RuntimeError) as e:
+        sys.exit(f'Error: conversion failed: {e}')
+
+    if args.output:
+        try:
+            with open(args.output, 'w', encoding='utf-8') as f:
+                f.write(result)
+            print(f'Saved → {args.output}', file=sys.stderr)
+        except OSError as e:
+            sys.exit(f'Error writing {args.output}: {e}')
+    else:
+        print(result)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c7bdf50
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+python-docx>=1.1.0
+anthropic>=0.30.0