Add --topic flag to raw_to_md for filtering off-topic content

When --topic is provided, Claude silently discards passages unrelated to
that subject. Without it, a generic noise-filter instruction is used.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
main
Amir Alexander Abdelbaki 2026-06-01 14:15:49 +02:00
parent 5909a3ac9b
commit 8ec0d3be41
1 changed files with 30 additions and 8 deletions

View File

@ -20,11 +20,7 @@ import sys
import anthropic import anthropic
SYSTEM_PROMPT = """\ _BASE_GUIDELINES = """\
You are a technical documentation writer. Your task is to convert raw, unstructured text \
into a clean, well-structured Markdown document.
Guidelines:
- Add a ## Summary section at the very top with a 2-3 sentence overview. - Add a ## Summary section at the very top with a 2-3 sentence overview.
- Organise the rest chronologically when dates, versions, or sequence cues are present; \ - Organise the rest chronologically when dates, versions, or sequence cues are present; \
otherwise group by logical topic. otherwise group by logical topic.
@ -37,13 +33,34 @@ Guidelines:
- Output ONLY the Markdown no preamble, explanation, or trailing commentary.\ - Output ONLY the Markdown no preamble, explanation, or trailing commentary.\
""" """
# Fallback filtering bullet for the system prompt, used when no topic is
# supplied: instructs the model to drop obvious noise / off-topic passages
# and not to mention that anything was omitted.
_FILTER_GENERIC = (
"- Silently discard any passages that are clearly noise, off-topic, or unrelated "
"to the main subject of the input (e.g. unrelated chat messages, ads, system logs "
"from other contexts). Do not mention the omission."
)
def convert(raw_text: str, model: str) -> str: _FILTER_TOPIC = (
"- This document is about: {topic}. "
"Silently discard any passages that are clearly unrelated to this topic. "
"Do not mention the omission."
)
def _build_system_prompt(topic: str | None) -> str:
    """Assemble the system prompt sent with the conversion request.

    The prompt consists of the fixed role preamble, the shared guideline
    bullets, and exactly one filtering bullet.

    Args:
        topic: Optional subject of the document. When it contains
            non-whitespace text, the topic-specific filter bullet is used
            (with surrounding whitespace removed); when it is None, empty,
            or blank, the generic noise-filter bullet is used instead.

    Returns:
        The complete system prompt string.
    """
    # A blank topic (e.g. --topic "  ") is truthy and would otherwise render
    # as "This document is about:   ." -- treat it the same as no topic.
    cleaned_topic = topic.strip() if topic else ""
    if cleaned_topic:
        filter_line = _FILTER_TOPIC.format(topic=cleaned_topic)
    else:
        filter_line = _FILTER_GENERIC
    return (
        "You are a technical documentation writer. Your task is to convert raw, "
        "unstructured text into a clean, well-structured Markdown document.\n\n"
        f"Guidelines:\n{_BASE_GUIDELINES}\n{filter_line}"
    )
def convert(raw_text: str, model: str, topic: str | None = None) -> str:
client = anthropic.Anthropic() client = anthropic.Anthropic()
response = client.messages.create( response = client.messages.create(
model=model, model=model,
max_tokens=8192, max_tokens=8192,
system=SYSTEM_PROMPT, system=_build_system_prompt(topic),
messages=[ messages=[
{ {
'role': 'user', 'role': 'user',
@ -71,6 +88,11 @@ def main():
default='claude-sonnet-4-6', default='claude-sonnet-4-6',
help='Claude model ID (default: claude-sonnet-4-6)', help='Claude model ID (default: claude-sonnet-4-6)',
) )
parser.add_argument(
'--topic', '-p',
default=None,
help='Project topic or description — used to filter out unrelated content',
)
args = parser.parse_args() args = parser.parse_args()
if not os.environ.get('ANTHROPIC_API_KEY'): if not os.environ.get('ANTHROPIC_API_KEY'):
@ -95,7 +117,7 @@ def main():
sys.exit('Error: input is empty.') sys.exit('Error: input is empty.')
print(f'Sending to {args.model}', file=sys.stderr) print(f'Sending to {args.model}', file=sys.stderr)
result = convert(raw, args.model) result = convert(raw, args.model, topic=args.topic)
if args.output: if args.output:
try: try: