Initial commit: md_to_docx and raw_to_md scripts
- md_to_docx.py: converts a directory of .md files into a Word .docx with a
  title page, auto-updating TOC field, changelog section, and one section per
  remaining .md file
- raw_to_md.py: sends raw text to Claude AI (claude-sonnet-4-6) and returns a
  structured, chronological Markdown document

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
main
commit
0eac070dac
|
|
@ -0,0 +1,9 @@
|
|||
__pycache__/
|
||||
*.py[cod]
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
.env
|
||||
*.docx
|
||||
venv/
|
||||
.venv/
|
||||
|
|
@ -0,0 +1,229 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
md_to_docx.py — Convert a directory of Markdown files into a single Word document.
|
||||
|
||||
Structure: title page → auto-updating TOC → changelog → one section per .md file.
|
||||
|
||||
Usage:
|
||||
python md_to_docx.py [--dir DIR] [--output OUTPUT] [--title TITLE]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from docx import Document
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from docx.oxml import OxmlElement
|
||||
from docx.oxml.ns import qn
|
||||
from docx.shared import Pt
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Word XML helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _insert_toc_field(doc):
    """Append a paragraph to *doc* that holds a Word TOC complex field.

    The field is written as begin / instruction / separate / end markers
    inside a single run. No cached result is stored between the separate and
    end markers, so the entries have to be produced by a field update in
    Word.
    """
    def _fld_char(kind):
        # Build one <w:fldChar w:fldCharType="kind"/> marker element.
        marker = OxmlElement('w:fldChar')
        marker.set(qn('w:fldCharType'), kind)
        return marker

    instruction = OxmlElement('w:instrText')
    # Keep the leading/trailing spaces of the field code intact.
    instruction.set(qn('xml:space'), 'preserve')
    # \o "1-3" limits the table to heading levels 1-3; see Word's TOC
    # field-code reference for the remaining switches.
    instruction.text = ' TOC \\o "1-3" \\h \\z \\u '

    toc_run = doc.add_paragraph().add_run()
    toc_run._r.extend([
        _fld_char('begin'),
        instruction,
        _fld_char('separate'),
        _fld_char('end'),
    ])
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Inline markdown → Word runs
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_INLINE_RE = re.compile(
|
||||
r'(\*\*\*[^*\n]+\*\*\*' # bold-italic ***…***
|
||||
r'|\*\*[^*\n]+\*\*' # bold **…**
|
||||
r'|\*[^*\n]+\*' # italic *…*
|
||||
r'|`[^`\n]+`)' # inline code `…`
|
||||
)
|
||||
|
||||
|
||||
def _add_inline(paragraph, text):
|
||||
for part in _INLINE_RE.split(text):
|
||||
if not part:
|
||||
continue
|
||||
if part.startswith('***') and part.endswith('***'):
|
||||
r = paragraph.add_run(part[3:-3])
|
||||
r.bold = r.italic = True
|
||||
elif part.startswith('**') and part.endswith('**'):
|
||||
r = paragraph.add_run(part[2:-2])
|
||||
r.bold = True
|
||||
elif part.startswith('*') and part.endswith('*'):
|
||||
r = paragraph.add_run(part[1:-1])
|
||||
r.italic = True
|
||||
elif part.startswith('`') and part.endswith('`'):
|
||||
r = paragraph.add_run(part[1:-1])
|
||||
r.font.name = 'Courier New'
|
||||
r.font.size = Pt(10)
|
||||
else:
|
||||
paragraph.add_run(part)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Markdown block parser → Word paragraphs
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _add_code_block(doc, code_lines):
    """Append *code_lines* to *doc* as one 9 pt Courier New paragraph."""
    para = doc.add_paragraph('\n'.join(code_lines))
    # Guard before styling: the paragraph may carry no run (e.g. an empty
    # code block).
    if para.runs:
        para.runs[0].font.name = 'Courier New'
        para.runs[0].font.size = Pt(9)
    para.paragraph_format.space_after = Pt(6)


def _md_to_doc(doc, content):
    """Render Markdown *content* into *doc*, one source line at a time.

    Supported block constructs, checked in this order: fenced code blocks
    (```), ATX headings (# … ######), blank lines (skipped), horizontal
    rules, single-line blockquotes (``> ``), unordered list items, ordered
    list items, and plain paragraphs. Inline markup inside quotes, list
    items and paragraphs is delegated to ``_add_inline``.

    Args:
        doc: Document object providing ``add_paragraph`` and ``add_heading``.
        content: Full Markdown source text.
    """
    in_fence = False
    fence_buf = []

    for line in content.splitlines():
        # Fenced code block delimiter: toggles fence state.
        if line.startswith('```'):
            if in_fence:
                _add_code_block(doc, fence_buf)
            fence_buf = []
            in_fence = not in_fence
            continue

        # Inside a fence every line is literal code.
        if in_fence:
            fence_buf.append(line)
            continue

        # ATX headings; the pattern already caps the level at 6.
        m = re.match(r'^(#{1,6})\s+(.*)', line)
        if m:
            doc.add_heading(m.group(2).strip(), level=len(m.group(1)))
            continue

        # Blank line
        if not line.strip():
            continue

        # Horizontal rule
        if re.match(r'^[*\-_]{3,}\s*$', line):
            rule = doc.add_paragraph('─' * 60)
            rule.alignment = WD_ALIGN_PARAGRAPH.CENTER
            continue

        # Blockquote
        if line.startswith('> '):
            _add_inline(doc.add_paragraph(style='Quote'), line[2:])
            continue

        # Unordered list
        m = re.match(r'^\s*[-*+]\s+(.*)', line)
        if m:
            _add_inline(doc.add_paragraph(style='List Bullet'), m.group(1))
            continue

        # Ordered list
        m = re.match(r'^\s*\d+\.\s+(.*)', line)
        if m:
            _add_inline(doc.add_paragraph(style='List Number'), m.group(1))
            continue

        # Default paragraph
        _add_inline(doc.add_paragraph(), line)

    # A fence left open at end of input would otherwise lose everything
    # buffered so far; emit it as a code block instead of dropping it.
    if in_fence and fence_buf:
        _add_code_block(doc, fence_buf)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Document assembly
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_docx(md_dir: Path, output: Path, title: str) -> None:
    """Assemble every ``*.md`` file in *md_dir* into one Word document.

    Layout: title page → table-of-contents field → changelog → one level-1
    section per remaining Markdown file, ordered by case-insensitive file
    name. Each section after the changelog starts on a new page.

    Args:
        md_dir: Directory that is searched (non-recursively) for ``*.md``.
        output: Destination path passed to ``Document.save``.
        title: Text for the title-page heading.
    """
    # Changelog candidates in priority order, compared case-insensitively.
    changelog_names = ('changelog.md', 'changes.md')

    # The raw name is a tie-breaker so that ordering is deterministic and
    # 'CHANGELOG.md' wins over 'changelog.md' when both exist.
    all_md = sorted(
        (f for f in md_dir.glob('*.md') if f.is_file()),
        key=lambda f: (f.name.lower(), f.name),
    )

    # Resolve the changelog from the directory listing itself. Probing fixed
    # spellings with exists() succeeds for any casing on case-insensitive
    # filesystems, while the listing reports the real name; comparing
    # lower-cased names keeps "selected" and "skipped" consistent so the
    # changelog is never emitted twice.
    by_lower_name = {}
    for f in all_md:
        by_lower_name.setdefault(f.name.lower(), f)
    changelog = next(
        (by_lower_name[name] for name in changelog_names if name in by_lower_name),
        None,
    )
    md_files = [f for f in all_md if f.name.lower() not in changelog_names]

    doc = Document()

    # Title page
    heading = doc.add_heading(title, level=0)
    heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    date_para = doc.add_paragraph(f'Generated: {datetime.now().strftime("%Y-%m-%d")}')
    date_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    doc.add_page_break()

    # Table of Contents
    doc.add_heading('Table of Contents', level=1)
    _insert_toc_field(doc)
    doc.add_paragraph('(Open in Word and press F9 or Ctrl+A → F9 to update this field.)')
    doc.add_page_break()

    # Changelog — always first content section
    doc.add_heading('Changelog', level=1)
    if changelog is not None:
        # utf-8-sig also reads plain UTF-8 but strips a leading BOM, which
        # would otherwise stop a first-line heading from being recognised.
        _md_to_doc(doc, changelog.read_text(encoding='utf-8-sig'))
    else:
        doc.add_paragraph('No changelog file found.')

    # Remaining .md files. The page break goes *before* each section rather
    # than after it, so the document does not end with an empty page.
    for md_file in md_files:
        doc.add_page_break()
        section_title = md_file.stem.replace('-', ' ').replace('_', ' ').title()
        doc.add_heading(section_title, level=1)
        _md_to_doc(doc, md_file.read_text(encoding='utf-8-sig'))

    doc.save(output)
    print(f'Saved → {output}')
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Command-line entry point: parse options, validate the source directory, build the .docx."""
    cli = argparse.ArgumentParser(
        description='Convert a folder of .md files into a single Word .docx'
    )
    # (long flag, short flag, default, help text)
    option_table = (
        ('--dir', '-d', '.', 'Directory containing .md files (default: .)'),
        ('--output', '-o', 'documentation.docx', 'Output .docx path'),
        ('--title', '-t', 'Project Documentation', 'Document title'),
    )
    for long_flag, short_flag, default, help_text in option_table:
        cli.add_argument(long_flag, short_flag, default=default, help=help_text)
    opts = cli.parse_args()

    source_dir = Path(opts.dir).resolve()
    if source_dir.is_dir():
        build_docx(source_dir, Path(opts.output), opts.title)
        return

    # sys.exit with a string prints it to stderr and exits with status 1.
    sys.exit(f'Error: {source_dir} is not a directory')


if __name__ == '__main__':
    main()
|
||||
|
|
@ -0,0 +1,112 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
raw_to_md.py — Convert unstructured raw text into a structured, chronological Markdown file
|
||||
using Claude AI.
|
||||
|
||||
Reads from --text, --input file, or stdin. Writes to --output or stdout.
|
||||
|
||||
Usage:
|
||||
python raw_to_md.py --text "raw notes here" --output structured.md
|
||||
python raw_to_md.py --input dump.txt --output structured.md
|
||||
cat dump.txt | python raw_to_md.py --output structured.md
|
||||
|
||||
Requires:
|
||||
ANTHROPIC_API_KEY environment variable
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
|
||||
import anthropic
|
||||
|
||||
SYSTEM_PROMPT = """\
|
||||
You are a technical documentation writer. Your task is to convert raw, unstructured text \
|
||||
into a clean, well-structured Markdown document.
|
||||
|
||||
Guidelines:
|
||||
- Add a ## Summary section at the very top with a 2-3 sentence overview.
|
||||
- Organise the rest chronologically when dates, versions, or sequence cues are present; \
|
||||
otherwise group by logical topic.
|
||||
- Use ## for top-level sections, ### for subsections.
|
||||
- Use bullet lists for discrete items or steps.
|
||||
- Use **bold** for key terms, dates, names, and version numbers.
|
||||
- Use `code spans` for commands, file names, and technical strings.
|
||||
- Use fenced code blocks (``` … ```) for multi-line code or config samples.
|
||||
- Do NOT invent or embellish information that is not in the input.
|
||||
- Output ONLY the Markdown — no preamble, explanation, or trailing commentary.\
|
||||
"""
|
||||
|
||||
|
||||
def convert(raw_text: str, model: str, max_tokens: int = 8192) -> str:
    """Send *raw_text* to Claude and return the structured Markdown it produces.

    Args:
        raw_text: Unstructured input text.
        model: Claude model ID passed to the Messages API.
        max_tokens: Upper bound on generated tokens (default 8192).

    Returns:
        The concatenated text of all text blocks in the response.

    Raises:
        RuntimeError: If the response contains no text block.
    """
    client = anthropic.Anthropic()
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        system=SYSTEM_PROMPT,
        messages=[
            {
                'role': 'user',
                'content': (
                    'Convert the following raw text into a structured, '
                    'chronological Markdown document:\n\n'
                    + raw_text
                ),
            }
        ],
    )

    # Generation that stops at the token limit yields a cut-off document;
    # tell the user instead of passing it off as complete.
    if getattr(response, 'stop_reason', None) == 'max_tokens':
        print(
            f'Warning: output was truncated at max_tokens={max_tokens}.',
            file=sys.stderr,
        )

    # Do not assume the first content block is text, or that there is
    # exactly one: gather every text block in order.
    text_parts = [
        block.text
        for block in response.content
        if getattr(block, 'type', None) == 'text'
    ]
    if not text_parts:
        raise RuntimeError('Claude returned no text content.')
    return ''.join(text_parts)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: read raw text, convert it via Claude, write Markdown.

    Input comes from ``--text``, else ``--input``, else stdin. Output goes to
    ``--output`` or stdout. Progress and error messages go to stderr; every
    failure exits through ``sys.exit`` with a message.
    """
    parser = argparse.ArgumentParser(
        description='Convert raw text to structured Markdown via Claude AI'
    )
    src = parser.add_mutually_exclusive_group()
    src.add_argument('--text', '-t', help='Raw text passed directly as a string')
    src.add_argument('--input', '-i', help='Path to a plain-text input file')
    parser.add_argument('--output', '-o', help='Output .md file (default: stdout)')
    parser.add_argument(
        '--model', '-m',
        default='claude-sonnet-4-6',
        help='Claude model ID (default: claude-sonnet-4-6)',
    )
    args = parser.parse_args()

    if not os.environ.get('ANTHROPIC_API_KEY'):
        sys.exit('Error: ANTHROPIC_API_KEY environment variable is not set.')

    # Resolve input. Compare against None rather than truthiness: an
    # explicitly supplied empty string must be reported as empty input (or
    # as an unreadable path), not silently fall through to a stdin read.
    if args.text is not None:
        raw = args.text
    elif args.input is not None:
        try:
            with open(args.input, encoding='utf-8') as f:
                raw = f.read()
        except (OSError, UnicodeDecodeError) as e:
            sys.exit(f'Error reading {args.input}: {e}')
    else:
        if sys.stdin.isatty():
            print('Paste raw text then press Ctrl+D:', file=sys.stderr)
        raw = sys.stdin.read()

    if not raw.strip():
        sys.exit('Error: input is empty.')

    print(f'Sending to {args.model}…', file=sys.stderr)
    try:
        result = convert(raw, args.model)
    except anthropic.APIError as e:
        # Network, authentication and rate-limit failures end with a
        # one-line message instead of a traceback.
        sys.exit(f'Error: Claude API request failed: {e}')

    if args.output:
        try:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(result)
        except OSError as e:
            sys.exit(f'Error writing {args.output}: {e}')
        print(f'Saved → {args.output}', file=sys.stderr)
    else:
        print(result)


if __name__ == '__main__':
    main()
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
python-docx>=1.1.0
|
||||
anthropic>=0.30.0
|
||||
Loading…
Reference in New Issue