doc-tools/raw_to_md.py

113 lines
3.5 KiB
Python

#!/usr/bin/env python3
"""
raw_to_md.py — Convert unstructured raw text into a structured, chronological Markdown file
using Claude AI.
Reads from --text, --input file, or stdin. Writes to --output or stdout.
Usage:
    python raw_to_md.py --text "raw notes here" --output structured.md
    python raw_to_md.py --input dump.txt --output structured.md
    cat dump.txt | python raw_to_md.py --output structured.md

Requires:
    ANTHROPIC_API_KEY environment variable
"""
import argparse
import os
import sys
import anthropic
# System prompt passed as `system=` on the request built in convert().
# A backslash at the end of a line inside the literal suppresses that newline,
# so long sentences are wrapped in the source without being broken in the
# prompt; the leading `"""\` and trailing `\` likewise keep the prompt free of
# a leading or trailing blank line.
SYSTEM_PROMPT = """\
You are a technical documentation writer. Your task is to convert raw, unstructured text \
into a clean, well-structured Markdown document.
Guidelines:
- Add a ## Summary section at the very top with a 2-3 sentence overview.
- Organise the rest chronologically when dates, versions, or sequence cues are present; \
otherwise group by logical topic.
- Use ## for top-level sections, ### for subsections.
- Use bullet lists for discrete items or steps.
- Use **bold** for key terms, dates, names, and version numbers.
- Use `code spans` for commands, file names, and technical strings.
- Use fenced code blocks (``` … ```) for multi-line code or config samples.
- Do NOT invent or embellish information that is not in the input.
- Output ONLY the Markdown — no preamble, explanation, or trailing commentary.\
"""
def convert(raw_text: str, model: str, max_tokens: int = 8192) -> str:
    """Send *raw_text* to a Claude model and return the Markdown it produces.

    Args:
        raw_text: Unstructured text to convert.
        model: Claude model ID passed to the Messages API.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The text of every text block in the response, concatenated in order.

    Raises:
        RuntimeError: If the response contains no text block at all.
    """
    client = anthropic.Anthropic()
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        system=SYSTEM_PROMPT,
        messages=[
            {
                'role': 'user',
                'content': (
                    'Convert the following raw text into a structured, '
                    'chronological Markdown document:\n\n'
                    + raw_text
                ),
            }
        ],
    )

    # `content` is a list of blocks; it may be empty, and not every block is
    # guaranteed to be a text block, so indexing `[0].text` blindly can raise
    # IndexError/AttributeError or drop later text. Collect all text blocks.
    parts = [
        block.text
        for block in response.content
        if getattr(block, 'type', None) == 'text'
    ]
    if not parts:
        raise RuntimeError(
            f'Model returned no text content (stop_reason={response.stop_reason!r}).'
        )

    # A response cut off at the token limit is otherwise indistinguishable
    # from a complete one; tell the user instead of silently truncating.
    if response.stop_reason == 'max_tokens':
        print(
            f'Warning: output reached the {max_tokens}-token limit and may be truncated.',
            file=sys.stderr,
        )

    return ''.join(parts)
def main():
    """Command-line entry point.

    Parses the arguments, reads the raw text from ``--text``, ``--input`` or
    stdin (in that order of precedence), sends it through ``convert`` and
    writes the result to ``--output`` or stdout. Progress and error messages
    go to stderr; every handled failure ends the process via ``sys.exit``
    with a one-line message.
    """
    parser = argparse.ArgumentParser(
        description='Convert raw text to structured Markdown via Claude AI'
    )
    src = parser.add_mutually_exclusive_group()
    src.add_argument('--text', '-t', help='Raw text passed directly as a string')
    src.add_argument('--input', '-i', help='Path to a plain-text input file')
    parser.add_argument('--output', '-o', help='Output .md file (default: stdout)')
    parser.add_argument(
        '--model', '-m',
        default='claude-sonnet-4-6',
        help='Claude model ID (default: claude-sonnet-4-6)',
    )
    args = parser.parse_args()

    if not os.environ.get('ANTHROPIC_API_KEY'):
        sys.exit('Error: ANTHROPIC_API_KEY environment variable is not set.')

    # Resolve input. Compare against None rather than truthiness so that an
    # explicitly supplied empty value (e.g. --text "") is reported as empty
    # input below instead of silently falling through to a blocking stdin read.
    if args.text is not None:
        raw = args.text
    elif args.input is not None:
        path = args.input
        try:
            with open(path, encoding='utf-8') as f:
                raw = f.read()
        # UnicodeDecodeError is a ValueError, not an OSError, so a non-UTF-8
        # file would otherwise escape as a traceback.
        except (OSError, UnicodeDecodeError) as e:
            sys.exit(f'Error reading {path}: {e}')
    else:
        if sys.stdin.isatty():
            print('Paste raw text then press Ctrl+D:', file=sys.stderr)
        try:
            raw = sys.stdin.read()
        except UnicodeDecodeError as e:
            sys.exit(f'Error reading stdin: {e}')

    if not raw.strip():
        sys.exit('Error: input is empty.')

    print(f'Sending to {args.model}', file=sys.stderr)
    try:
        result = convert(raw, args.model)
    except anthropic.APIError as e:
        # Base class of the SDK's errors (auth, rate limit, connection, ...);
        # report it like the other failures instead of dumping a traceback.
        sys.exit(f'Error calling {args.model}: {e}')

    if args.output:
        # print() terminates the stdout output with a newline; do the same for
        # the file so both destinations end with exactly one line terminator
        # when the model's text has none.
        text = result if result.endswith('\n') else result + '\n'
        try:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(text)
        except OSError as e:
            sys.exit(f'Error writing {args.output}: {e}')
        print(f'Saved → {args.output}', file=sys.stderr)
    else:
        print(result)


if __name__ == '__main__':
    main()