Add --topic flag to raw_to_md for filtering off-topic content
When --topic is provided, Claude silently discards passages unrelated to that subject. Without it, a generic noise-filter instruction is used.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

main
parent
5909a3ac9b
commit
8ec0d3be41
38
raw_to_md.py
38
raw_to_md.py
|
|
@ -20,11 +20,7 @@ import sys
|
||||||
|
|
||||||
import anthropic
|
import anthropic
|
||||||
|
|
||||||
SYSTEM_PROMPT = """\
|
_BASE_GUIDELINES = """\
|
||||||
You are a technical documentation writer. Your task is to convert raw, unstructured text \
|
|
||||||
into a clean, well-structured Markdown document.
|
|
||||||
|
|
||||||
Guidelines:
|
|
||||||
- Add a ## Summary section at the very top with a 2-3 sentence overview.
|
- Add a ## Summary section at the very top with a 2-3 sentence overview.
|
||||||
- Organise the rest chronologically when dates, versions, or sequence cues are present; \
|
- Organise the rest chronologically when dates, versions, or sequence cues are present; \
|
||||||
otherwise group by logical topic.
|
otherwise group by logical topic.
|
||||||
|
|
@ -37,13 +33,34 @@ Guidelines:
|
||||||
- Output ONLY the Markdown — no preamble, explanation, or trailing commentary.\
|
- Output ONLY the Markdown — no preamble, explanation, or trailing commentary.\
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Fallback filter guideline, used when no explicit topic is supplied: the
# model is told to drop obvious noise and not to comment on what it removed.
# Written as one logical line; the trailing backslashes join the pieces.
_FILTER_GENERIC = """\
- Silently discard any passages that are clearly noise, off-topic, or unrelated \
to the main subject of the input (e.g. unrelated chat messages, ads, system logs \
from other contexts). Do not mention the omission.\
"""
|
||||||
|
|
||||||
def convert(raw_text: str, model: str) -> str:
|
# Topic-specific filter guideline. ``{topic}`` is a ``str.format`` placeholder
# that is filled in with the subject supplied by the caller.
# Written as one logical line; the trailing backslashes join the pieces.
_FILTER_TOPIC = """\
- This document is about: {topic}. \
Silently discard any passages that are clearly unrelated to this topic. \
Do not mention the omission.\
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _build_system_prompt(topic: str | None) -> str:
|
||||||
|
filter_line = _FILTER_TOPIC.format(topic=topic) if topic else _FILTER_GENERIC
|
||||||
|
return (
|
||||||
|
"You are a technical documentation writer. Your task is to convert raw, "
|
||||||
|
"unstructured text into a clean, well-structured Markdown document.\n\n"
|
||||||
|
f"Guidelines:\n{_BASE_GUIDELINES}\n{filter_line}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def convert(raw_text: str, model: str, topic: str | None = None) -> str:
|
||||||
client = anthropic.Anthropic()
|
client = anthropic.Anthropic()
|
||||||
response = client.messages.create(
|
response = client.messages.create(
|
||||||
model=model,
|
model=model,
|
||||||
max_tokens=8192,
|
max_tokens=8192,
|
||||||
system=SYSTEM_PROMPT,
|
system=_build_system_prompt(topic),
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
'role': 'user',
|
'role': 'user',
|
||||||
|
|
@ -71,6 +88,11 @@ def main():
|
||||||
default='claude-sonnet-4-6',
|
default='claude-sonnet-4-6',
|
||||||
help='Claude model ID (default: claude-sonnet-4-6)',
|
help='Claude model ID (default: claude-sonnet-4-6)',
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--topic', '-p',
|
||||||
|
default=None,
|
||||||
|
help='Project topic or description — used to filter out unrelated content',
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if not os.environ.get('ANTHROPIC_API_KEY'):
|
if not os.environ.get('ANTHROPIC_API_KEY'):
|
||||||
|
|
@ -95,7 +117,7 @@ def main():
|
||||||
sys.exit('Error: input is empty.')
|
sys.exit('Error: input is empty.')
|
||||||
|
|
||||||
print(f'Sending to {args.model}…', file=sys.stderr)
|
print(f'Sending to {args.model}…', file=sys.stderr)
|
||||||
result = convert(raw, args.model)
|
result = convert(raw, args.model, topic=args.topic)
|
||||||
|
|
||||||
if args.output:
|
if args.output:
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue