113 lines
3.5 KiB
Python
113 lines
3.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
raw_to_md.py — Convert unstructured raw text into a structured, chronological Markdown file
using Claude AI.

Reads from --text, --input file, or stdin. Writes to --output or stdout.

Usage:
    python raw_to_md.py --text "raw notes here" --output structured.md
    python raw_to_md.py --input dump.txt --output structured.md
    cat dump.txt | python raw_to_md.py --output structured.md

Requires:
    ANTHROPIC_API_KEY environment variable
    The `anthropic` Python package
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
|
|
import anthropic
|
|
|
|
SYSTEM_PROMPT = """\
|
|
You are a technical documentation writer. Your task is to convert raw, unstructured text \
|
|
into a clean, well-structured Markdown document.
|
|
|
|
Guidelines:
|
|
- Add a ## Summary section at the very top with a 2-3 sentence overview.
|
|
- Organise the rest chronologically when dates, versions, or sequence cues are present; \
|
|
otherwise group by logical topic.
|
|
- Use ## for top-level sections, ### for subsections.
|
|
- Use bullet lists for discrete items or steps.
|
|
- Use **bold** for key terms, dates, names, and version numbers.
|
|
- Use `code spans` for commands, file names, and technical strings.
|
|
- Use fenced code blocks (``` … ```) for multi-line code or config samples.
|
|
- Do NOT invent or embellish information that is not in the input.
|
|
- Output ONLY the Markdown — no preamble, explanation, or trailing commentary.\
|
|
"""
|
|
|
|
|
|
def convert(raw_text: str, model: str) -> str:
    """Send *raw_text* to a Claude model and return the Markdown it produces.

    Args:
        raw_text: The unstructured text to restructure. It is appended
            verbatim to a fixed instruction in a single user message.
        model: Claude model ID, passed unchanged to the Messages API.

    Returns:
        The text of every text block in the response, concatenated in order.

    Raises:
        RuntimeError: If the response contains no text block at all.

    Errors raised by the `anthropic` client itself (authentication, network,
    rate limits, ...) are not caught here and propagate to the caller.
    """
    # NOTE(review): the client is constructed with no arguments, so it
    # presumably picks up ANTHROPIC_API_KEY from the environment — confirm
    # against the SDK documentation.
    client = anthropic.Anthropic()
    response = client.messages.create(
        model=model,
        max_tokens=8192,
        system=SYSTEM_PROMPT,
        messages=[
            {
                'role': 'user',
                'content': (
                    'Convert the following raw text into a structured, '
                    'chronological Markdown document:\n\n'
                    + raw_text
                ),
            }
        ],
    )

    # The response body is a list of content blocks. Do not assume the first
    # one exists or is text: collect every text block so nothing is dropped.
    text_parts = [
        block.text
        for block in response.content
        if getattr(block, 'type', None) == 'text'
    ]
    if not text_parts:
        raise RuntimeError(
            f'Model returned no text content (stop_reason={response.stop_reason!r}).'
        )

    # Hitting the output cap means the Markdown was cut off mid-document;
    # surface that instead of silently returning a partial result.
    if response.stop_reason == 'max_tokens':
        print(
            'Warning: output was truncated at the max_tokens limit; '
            'the Markdown may be incomplete.',
            file=sys.stderr,
        )

    return ''.join(text_parts)
|
|
|
|
|
|
def main():
    """Command-line entry point: read raw text, convert it, write Markdown.

    Input comes from exactly one of --text, --input (a UTF-8 file), or stdin
    when neither option is given. The result goes to --output if set,
    otherwise to stdout. Status and error messages go to stderr.

    Exits via ``sys.exit`` with an error message when:
        * ANTHROPIC_API_KEY is unset or empty,
        * the input file cannot be read or is not valid UTF-8,
        * the resolved input is empty or whitespace-only,
        * the API request fails,
        * the output file cannot be written.
    """
    parser = argparse.ArgumentParser(
        description='Convert raw text to structured Markdown via Claude AI'
    )
    src = parser.add_mutually_exclusive_group()
    src.add_argument('--text', '-t', help='Raw text passed directly as a string')
    src.add_argument('--input', '-i', help='Path to a plain-text input file')
    parser.add_argument('--output', '-o', help='Output .md file (default: stdout)')
    parser.add_argument(
        '--model', '-m',
        default='claude-sonnet-4-6',
        help='Claude model ID (default: claude-sonnet-4-6)',
    )
    args = parser.parse_args()

    # Fail fast, before reading any input, if the credential is missing.
    if not os.environ.get('ANTHROPIC_API_KEY'):
        sys.exit('Error: ANTHROPIC_API_KEY environment variable is not set.')

    # Resolve input. Compare against None rather than truthiness so that an
    # explicitly supplied empty value (--text "") is reported as empty input
    # instead of silently falling through to a blocking read on stdin.
    if args.text is not None:
        raw = args.text
    elif args.input is not None:
        path = args.input
        try:
            with open(path, encoding='utf-8') as f:
                raw = f.read()
        # UnicodeDecodeError is a ValueError, not an OSError, so it must be
        # listed explicitly to turn a non-UTF-8 file into a clean error.
        except (OSError, UnicodeDecodeError) as e:
            sys.exit(f'Error reading {path}: {e}')
    else:
        if sys.stdin.isatty():
            print('Paste raw text then press Ctrl+D:', file=sys.stderr)
        raw = sys.stdin.read()

    if not raw.strip():
        sys.exit('Error: input is empty.')

    print(f'Sending to {args.model}…', file=sys.stderr)
    try:
        result = convert(raw, args.model)
    except anthropic.APIError as e:
        # Report API failures as a one-line error rather than a traceback.
        sys.exit(f'Error: API request failed: {e}')

    if args.output:
        try:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(result)
        except OSError as e:
            sys.exit(f'Error writing {args.output}: {e}')
        else:
            # Only announce success once the file has been written and closed.
            print(f'Saved → {args.output}', file=sys.stderr)
    else:
        print(result)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script; importing it as a
# module defines the functions without triggering any I/O.
if __name__ == '__main__':
    main()
|