135 lines
4.4 KiB
Python
135 lines
4.4 KiB
Python
#!/usr/bin/env python3
"""
raw_to_md.py — Convert unstructured raw text into a structured, chronological Markdown file
using Claude AI.

Reads from --text, --input file, or stdin. Writes to --output or stdout.

Usage:
    python raw_to_md.py --text "raw notes here" --output structured.md
    python raw_to_md.py --input dump.txt --output structured.md
    cat dump.txt | python raw_to_md.py --output structured.md

Requires:
    ANTHROPIC_API_KEY environment variable
"""
|
|
import argparse
|
|
import os
|
|
import sys
|
|
|
|
import anthropic
|
|
|
|
# Formatting rules included in every system prompt. Rendered as a Markdown
# bullet list with no trailing newline, so that one more bullet (a filter
# rule) can be appended after a single "\n".
_BASE_GUIDELINES = "\n".join(
    (
        "- Add a ## Summary section at the very top with a 2-3 sentence overview.",
        "- Organise the rest chronologically when dates, versions, or sequence "
        "cues are present; otherwise group by logical topic.",
        "- Use ## for top-level sections, ### for subsections.",
        "- Use bullet lists for discrete items or steps.",
        "- Use **bold** for key terms, dates, names, and version numbers.",
        "- Use `code spans` for commands, file names, and technical strings.",
        "- Use fenced code blocks (``` … ```) for multi-line code or config samples.",
        "- Do NOT invent or embellish information that is not in the input.",
        "- Output ONLY the Markdown — no preamble, explanation, or trailing commentary.",
    )
)

# Filter bullet used when no topic is known: drop obvious noise.
_FILTER_GENERIC = (
    "- Silently discard any passages that are clearly noise, off-topic, or "
    "unrelated to the main subject of the input (e.g. unrelated chat messages, "
    "ads, system logs from other contexts). Do not mention the omission."
)

# Filter bullet template used when a topic is known; `{topic}` is filled in
# with str.format.
_FILTER_TOPIC = (
    "- This document is about: {topic}. Silently discard any passages that "
    "are clearly unrelated to this topic. Do not mention the omission."
)
|
|
|
|
|
|
def _build_system_prompt(topic: str | None) -> str:
    """Assemble the system prompt for a conversion request.

    Args:
        topic: Optional description of the document's subject. When truthy it
            is interpolated into the topic-specific filter rule; when ``None``
            or empty the generic noise-filter rule is used instead.

    Returns:
        The role description, followed by the shared guidelines and exactly
        one filter bullet.
    """
    if topic:
        filter_rule = _FILTER_TOPIC.format(topic=topic)
    else:
        filter_rule = _FILTER_GENERIC

    role_intro = (
        "You are a technical documentation writer. Your task is to convert raw, "
        "unstructured text into a clean, well-structured Markdown document.\n\n"
    )
    # The guidelines carry no trailing newline, so the filter rule is appended
    # as one more bullet of the same list.
    return role_intro + "Guidelines:\n" + _BASE_GUIDELINES + "\n" + filter_rule
|
|
|
|
|
|
def convert(
    raw_text: str,
    model: str,
    topic: str | None = None,
    *,
    max_tokens: int = 8192,
) -> str:
    """Ask Claude to restructure *raw_text* as a Markdown document.

    Args:
        raw_text: Unstructured text to convert.
        model: Claude model ID passed to the Messages API.
        topic: Optional subject description; selects the topic-specific
            filter rule in the system prompt.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The concatenated text of every text block in the response.

    Raises:
        RuntimeError: If the response contains no text at all.
    """
    client = anthropic.Anthropic()
    response = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        system=_build_system_prompt(topic),
        messages=[
            {
                'role': 'user',
                'content': (
                    'Convert the following raw text into a structured, '
                    'chronological Markdown document:\n\n'
                    + raw_text
                ),
            }
        ],
    )

    # Do not trust content[0]: the list may be empty or start with a non-text
    # block, and a reply may be split over several text blocks.
    text = ''.join(
        part.text
        for part in response.content
        if getattr(part, 'type', None) == 'text'
    )
    if not text:
        raise RuntimeError(
            'Model returned no text content '
            f'(stop_reason={response.stop_reason!r}).'
        )

    # A reply cut off at the token limit is still returned, but the caller is
    # told so the truncated document is not mistaken for a complete one.
    if response.stop_reason == 'max_tokens':
        print(
            f'Warning: output was truncated at max_tokens={max_tokens}.',
            file=sys.stderr,
        )
    return text
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(
|
|
description='Convert raw text to structured Markdown via Claude AI'
|
|
)
|
|
src = parser.add_mutually_exclusive_group()
|
|
src.add_argument('--text', '-t', help='Raw text passed directly as a string')
|
|
src.add_argument('--input', '-i', help='Path to a plain-text input file')
|
|
parser.add_argument('--output', '-o', help='Output .md file (default: stdout)')
|
|
parser.add_argument(
|
|
'--model', '-m',
|
|
default='claude-sonnet-4-6',
|
|
help='Claude model ID (default: claude-sonnet-4-6)',
|
|
)
|
|
parser.add_argument(
|
|
'--topic', '-p',
|
|
default=None,
|
|
help='Project topic or description — used to filter out unrelated content',
|
|
)
|
|
args = parser.parse_args()
|
|
|
|
if not os.environ.get('ANTHROPIC_API_KEY'):
|
|
sys.exit('Error: ANTHROPIC_API_KEY environment variable is not set.')
|
|
|
|
# Resolve input
|
|
if args.text:
|
|
raw = args.text
|
|
elif args.input:
|
|
path = args.input
|
|
try:
|
|
with open(path, encoding='utf-8') as f:
|
|
raw = f.read()
|
|
except OSError as e:
|
|
sys.exit(f'Error reading {path}: {e}')
|
|
else:
|
|
if sys.stdin.isatty():
|
|
print('Paste raw text then press Ctrl+D:', file=sys.stderr)
|
|
raw = sys.stdin.read()
|
|
|
|
if not raw.strip():
|
|
sys.exit('Error: input is empty.')
|
|
|
|
print(f'Sending to {args.model}…', file=sys.stderr)
|
|
result = convert(raw, args.model, topic=args.topic)
|
|
|
|
if args.output:
|
|
try:
|
|
with open(args.output, 'w', encoding='utf-8') as f:
|
|
f.write(result)
|
|
print(f'Saved → {args.output}', file=sys.stderr)
|
|
except OSError as e:
|
|
sys.exit(f'Error writing {args.output}: {e}')
|
|
else:
|
|
print(result)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|