doc-tools/raw_to_md.py

135 lines
4.4 KiB
Python

#!/usr/bin/env python3
"""
raw_to_md.py — Convert unstructured raw text into a structured, chronological
Markdown file using Claude AI.

Reads from --text, an --input file, or stdin. Writes to --output or stdout.

Usage:
    python raw_to_md.py --text "raw notes here" --output structured.md
    python raw_to_md.py --input dump.txt --output structured.md
    cat dump.txt | python raw_to_md.py --output structured.md

Requires:
    ANTHROPIC_API_KEY environment variable
"""
import argparse
import os
import sys
import anthropic
# Formatting rules included in every system prompt. The backslashes at the
# ends of lines inside the literal are line continuations: they suppress the
# newline after the opening quotes, join the two halves of the "Organise ..."
# bullet into one line, and drop the newline before the closing quotes.
_BASE_GUIDELINES = """\
- Add a ## Summary section at the very top with a 2-3 sentence overview.
- Organise the rest chronologically when dates, versions, or sequence cues are present; \
otherwise group by logical topic.
- Use ## for top-level sections, ### for subsections.
- Use bullet lists for discrete items or steps.
- Use **bold** for key terms, dates, names, and version numbers.
- Use `code spans` for commands, file names, and technical strings.
- Use fenced code blocks (``` … ```) for multi-line code or config samples.
- Do NOT invent or embellish information that is not in the input.
- Output ONLY the Markdown — no preamble, explanation, or trailing commentary.\
"""
# Noise-filter bullet appended to the guidelines when no topic is supplied.
_FILTER_GENERIC = (
"- Silently discard any passages that are clearly noise, off-topic, or unrelated "
"to the main subject of the input (e.g. unrelated chat messages, ads, system logs "
"from other contexts). Do not mention the omission."
)
# Noise-filter bullet appended when a topic is supplied; the "{topic}"
# placeholder is filled in with str.format() by _build_system_prompt().
_FILTER_TOPIC = (
"- This document is about: {topic}. "
"Silently discard any passages that are clearly unrelated to this topic. "
"Do not mention the omission."
)
def _build_system_prompt(topic: str | None) -> str:
    """Assemble the system prompt sent with every conversion request.

    The prompt is a fixed role description, the shared formatting
    guidelines, and one noise-filter bullet. A truthy *topic* selects the
    topic-specific filter; ``None`` or an empty string selects the generic
    one.
    """
    if topic:
        noise_rule = _FILTER_TOPIC.format(topic=topic)
    else:
        noise_rule = _FILTER_GENERIC

    role_intro = (
        "You are a technical documentation writer. Your task is to convert raw, "
        "unstructured text into a clean, well-structured Markdown document.\n\n"
    )
    guideline_section = f"Guidelines:\n{_BASE_GUIDELINES}\n{noise_rule}"
    return role_intro + guideline_section
def convert(raw_text: str, model: str, topic: str | None = None) -> str:
    """Send *raw_text* to Claude and return the generated Markdown.

    Args:
        raw_text: Unstructured text to restructure.
        model: Claude model ID passed to the Messages API.
        topic: Optional subject description. When given, the system prompt
            tells the model to discard passages unrelated to it.

    Returns:
        The text of every text block in the response, concatenated in order.

    Raises:
        RuntimeError: If the response contains no text blocks at all.
    """
    # No key is passed explicitly; the SDK client picks up its credentials
    # from the environment.
    client = anthropic.Anthropic()
    response = client.messages.create(
        model=model,
        max_tokens=8192,
        system=_build_system_prompt(topic),
        messages=[
            {
                'role': 'user',
                'content': (
                    'Convert the following raw text into a structured, '
                    'chronological Markdown document:\n\n'
                    + raw_text
                ),
            }
        ],
    )

    # Do not assume content[0] exists or is a text block: collect every text
    # block so an empty or non-text-first response cannot raise IndexError or
    # AttributeError.
    text_parts = [
        block.text
        for block in response.content
        if getattr(block, 'type', None) == 'text'
    ]
    if not text_parts:
        raise RuntimeError(
            'Model returned no text content '
            f'(stop_reason={response.stop_reason!r}).'
        )

    # Hitting the token cap means the document was cut off mid-stream; tell
    # the user instead of silently handing back a partial result.
    if response.stop_reason == 'max_tokens':
        print(
            'Warning: output reached the max_tokens limit and may be truncated.',
            file=sys.stderr,
        )
    return ''.join(text_parts)
def main() -> None:
    """Command-line entry point: read raw text, convert it, emit Markdown.

    Input comes from ``--text``, ``--input`` or stdin, in that order of
    precedence. The result is written to ``--output`` or printed to stdout.
    Progress and error messages go to stderr, and every failure terminates
    via ``sys.exit`` with a one-line message.
    """
    parser = argparse.ArgumentParser(
        description='Convert raw text to structured Markdown via Claude AI'
    )
    src = parser.add_mutually_exclusive_group()
    src.add_argument('--text', '-t', help='Raw text passed directly as a string')
    src.add_argument('--input', '-i', help='Path to a plain-text input file')
    parser.add_argument('--output', '-o', help='Output .md file (default: stdout)')
    parser.add_argument(
        '--model', '-m',
        default='claude-sonnet-4-6',
        help='Claude model ID (default: claude-sonnet-4-6)',
    )
    parser.add_argument(
        '--topic', '-p',
        default=None,
        help='Project topic or description — used to filter out unrelated content',
    )
    args = parser.parse_args()

    # Fail before reading any input if the API call could not succeed anyway.
    if not os.environ.get('ANTHROPIC_API_KEY'):
        sys.exit('Error: ANTHROPIC_API_KEY environment variable is not set.')

    # Resolve input. Compare against None rather than truthiness so that an
    # explicitly supplied empty value (e.g. --text "") is reported as empty
    # input below instead of falling through to a blocking read of stdin.
    if args.text is not None:
        raw = args.text
    elif args.input is not None:
        path = args.input
        try:
            with open(path, encoding='utf-8') as f:
                raw = f.read()
        except (OSError, UnicodeDecodeError) as e:
            # UnicodeDecodeError is not an OSError; without it a non-UTF-8
            # file would escape as a traceback.
            sys.exit(f'Error reading {path}: {e}')
    else:
        if sys.stdin.isatty():
            print('Paste raw text then press Ctrl+D:', file=sys.stderr)
        raw = sys.stdin.read()

    if not raw.strip():
        sys.exit('Error: input is empty.')

    print(f'Sending to {args.model}', file=sys.stderr)
    try:
        result = convert(raw, args.model, topic=args.topic)
    except anthropic.APIError as e:
        # Report API failures in the same one-line style as file errors.
        sys.exit(f'Error: Claude API request failed: {e}')

    if args.output:
        try:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(result)
        except OSError as e:
            sys.exit(f'Error writing {args.output}: {e}')
        print(f'Saved → {args.output}', file=sys.stderr)
    else:
        print(result)
# Run the CLI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()