#!/usr/bin/env python3
"""claudemd-hygiene-check.py — auto-load hygiene checker for Claude Code workspaces.

WHAT
  Scans the files Claude Code auto-loads at session start (CLAUDE.md and friends) for
  language that safety classifiers tend to read as cybersecurity / biology signal, plus
  a settings.json "model" sanity check. Prints findings with line numbers and neutral
  rewrite suggestions. SUGGEST-ONLY: it never modifies your files.

WHY
  Claude Fable 5 runs safety classifiers over EVERYTHING the model reads — including the
  workspace context (CLAUDE.md, memory) that is sent with your very first message. A
  repository whose docs have accumulated combat metaphors ("attack the problem", "kill
  targets") or mechanism-level security/biology prose can trip the classifier before you
  type anything, and the session silently continues on Opus. Cleaning your own standing
  language ("workspace hygiene") removes the false-positive surface for legitimate work.

  This tool is for FALSE POSITIVES on legitimate work only. If your work genuinely is
  offensive security or biology, the fallback is expected, documented routing — do not
  use wording changes to misrepresent what your project does.

USAGE
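  python claudemd-hygiene-check.py                # scan ./CLAUDE.md + ~/.claude/CLAUDE.md + */CLAUDE.md (one directory level down)
  python claudemd-hygiene-check.py file1.md ...   # scan specific files
  Either way, the "model" entry of ~/.claude/settings.json is also checked.
  Exit code: 0 = clean, 1 = findings (CI-friendly). Files that cannot be read are reported and skipped; they do not change the exit code.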

  From: https://tagmac.dev/guides/why-claude-fable-falls-back-to-opus  (June 2026)
"""
import json
import re
import sys
from pathlib import Path

# Best-effort: ask stdout to encode as UTF-8 and substitute anything it cannot
# encode, because this script prints non-ASCII characters (e.g. the "—" in its
# report lines). Any failure here — for example a stdout object that has no
# reconfigure() method — is deliberately ignored so the scan still runs.
try:
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")  # Windows console guard
except Exception:
    pass

# ---- pattern groups: (regex, why it reads as signal, neutral suggestion) -------------
COMBAT = [
    (r"\battack(s|ed|ing)?\b", "combat metaphor", 'use "approach / address / tackle"'),
    (r"\bkill(s|ed|ing)?\b(?!\s*(switch|-9))", "combat metaphor", 'use "stop / end / close"'),
    (r"\btargets?\b", "combat metaphor (context-dependent)", 'use "goal / the page / the audience"'),
    (r"\bweapon(s|ize[ds]?)?\b", "combat metaphor", 'use "tool / apply"'),
    (r"\bdestroy(s|ed|ing)?\b", "combat metaphor", 'use "remove / replace"'),
    (r"\bwar\b|\bbattle\b|\bcombat\b", "combat metaphor", "describe the actual activity"),
    (r"\bambush\b|\bassault\b", "combat metaphor", "describe the actual activity"),
]
SECURITY = [
    (r"\bexploit(s|ed|ing|ation)?\b", "offensive-security term", 'use "abuse / misuse" or name the bug class'),
    (r"\bmalware\b|\bransomware\b|\brootkit\b", "offensive-security term", "name the defensive concern instead"),
    (r"\bpayloads?\b", "offensive-security term (context-dependent)", 'use "request body / message content"'),
    (r"\bbackdoors?\b", "offensive-security term", 'use "undocumented access path"'),
    (r"\bbrute[- ]?force\b", "offensive-security term", 'use "repeated-guess / exhaustive try"'),
    (r"\bpenetration test(s|ing)?\b|\bpentest(s|ing)?\b", "offensive-security term", 'use "authorized security assessment"'),
    (r"\bbypass(es|ed|ing)?\b", "evasion-flavored verb", 'use "skip / go around (of a benign step)" or name the step'),
    (r"\bhijack(s|ed|ing)?\b", "offensive-security term", 'use "take over (of a UI element/process)"'),
    (r"\bphishing\b|\bspoof(s|ed|ing)?\b", "offensive-security term", "name the defensive concern instead"),
    (r"\bC2\b|\bcommand[- ]and[- ]control\b", "offensive-security term", "describe the legitimate control flow"),
    (r"\bexfiltrat(e|es|ed|ion)\b", "offensive-security term", 'use "export / copy out"'),
    (r"\battack surface\b", "security-mechanism phrase", 'use "exposure / exposed surface"'),
    (r"\bhoneypots?\b", "security-mechanism term", 'use "decoy endpoint"'),
]
BIOLOGY = [
    (r"\bimmune (system|layer|response|cells?)\b", "biology-mechanism metaphor", 'use "self-guard / self-heal layer"'),
    (r"\bpathogens?\b", "biology-mechanism term", 'use "external fault source"'),
    (r"\bantibod(y|ies)\b|\bT[- ]cells?\b|\blymphocytes?\b", "biology-mechanism term", "use a plain systems word (detector, monitor)"),
    (r"\bapoptosis\b", "biology-mechanism term", 'use "cleanup / self-termination"'),
    (r"\b(viral|virus) (load|replication|vector)\b", "biology-mechanism phrase", "use a plain systems word"),
    (r"\bmolecular mechanism(s)?\b|\blab protocols?\b|\bcell cultures?\b", "life-science mechanism phrase", "keep mechanism detail OUT of auto-loaded docs"),
    (r"\bgene (editing|expression)\b|\bCRISPR\b|\bplasmids?\b", "life-science mechanism term", "keep mechanism detail OUT of auto-loaded docs"),
]
GROUPS = [("combat-metaphor", COMBAT), ("security-mechanism", SECURITY), ("biology-mechanism", BIOLOGY)]
_compiled = [(g, re.compile(p, re.I), why, fix) for g, pats in GROUPS for p, why, fix in pats]

# Spans the scanner must leave alone so identifiers stay intact: fenced code blocks,
# inline code, [[wiki-links]] and http(s) URLs. re.S lets a fenced block span lines.
PROTECT = re.compile(r"(```.*?```|`[^`\n]*`|\[\[[^\]]+\]\]|https?://\S+)", re.S)


def prose_only(text):
    """Return *text* with every protected span overwritten by spaces.

    Newlines inside a span are kept, so the result has the same length and the
    same line breaks as the input and line numbers computed on it stay valid.
    """
    def _spaces_keeping_newlines(match):
        rows = match.group(0).split("\n")
        return "\n".join(" " * len(row) for row in rows)

    return PROTECT.sub(_spaces_keeping_newlines, text)


def scan_file(path):
    """Scan one file's prose for flagged wording.

    The file is read as UTF-8 (undecodable bytes are replaced), protected spans
    are blanked with prose_only(), and each remaining line is matched against
    every compiled pattern.

    Returns:
        A list of (line number, group label, matched text, why, suggestion)
        tuples, ordered by line and then by pattern order; an empty list when
        nothing matched. Returns None, after printing a message, when the file
        cannot be read.
    """
    try:
        raw = Path(path).read_text(encoding="utf-8", errors="replace")
    except OSError as err:
        print(f"  !! cannot read {path}: {err}")
        return None
    return [
        (lineno, group, hit.group(0), why, fix)
        for lineno, row in enumerate(prose_only(raw).splitlines(), start=1)
        for group, rx, why, fix in _compiled
        for hit in rx.finditer(row)
    ]


def check_settings(path):
    """Sanity-check the "model" entry of a settings.json file.

    Escape characters or an unexpected suffix in the saved model id can silently
    break model selection, so both are reported.

    Args:
        path: Location of the settings file (str or Path).

    Returns:
        A list of human-readable notes. Empty when the file does not exist, when
        "model" is absent or not a string, or when nothing looks wrong. A file
        that cannot be read, is not valid JSON, or whose top-level value is not
        an object yields a single note saying so.
    """
    p = Path(path)
    if not p.exists():
        return []
    try:
        data = json.loads(p.read_text(encoding="utf-8"))
    except OSError as e:
        return [f"{path}: cannot read ({e})"]
    except (ValueError, RecursionError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        return [f"{path}: not valid JSON ({e})"]
    if not isinstance(data, dict):
        # e.g. a top-level list: well-formed JSON, but there is no "model" key to look up.
        return [f"{path}: top-level JSON value is not an object"]
    model = data.get("model")
    if not isinstance(model, str):
        return []
    out = []
    if any(ord(c) < 32 for c in model):
        out.append(f'{path}: "model" contains control/escape characters: {model!r} — rewrite it by hand (e.g. "claude-fable-5")')
    # Plain string methods instead of a regex: a "." / "$" based pattern fails to
    # match ids containing an embedded newline, which previously crashed here.
    suffix = "[1m]"
    if model.endswith(suffix) and "fable" in model[: -len(suffix)]:
        out.append(f'{path}: "model" is {model!r} — the [1m] suffix is documented for opus/sonnet; on other ids it may not be recognized and the session can start on your tier default. If sessions are not starting on the model you saved, try the plain id.')
    return out


def main(argv):
    """Run the scan and print a report; return the process exit code.

    Args:
        argv: Paths of files to scan. When empty, the defaults are used:
            ./CLAUDE.md and ~/.claude/CLAUDE.md (each only if it exists), plus
            every CLAUDE.md exactly one directory below the current one.

    Returns:
        1 when there is at least one finding (text findings plus settings
        notes), otherwise 0. Files that scan_file() could not read are skipped
        and do not affect the result.
    """
    paths = [Path(arg) for arg in argv]
    if not paths:
        standing = (Path("CLAUDE.md"), Path.home() / ".claude" / "CLAUDE.md")
        paths = [candidate for candidate in standing if candidate.exists()]
        paths.extend(sorted(Path(".").glob("*/CLAUDE.md")))

    count = 0
    for path in paths:
        hits = scan_file(path)
        if hits is None:
            # Unreadable; scan_file() has already printed why.
            continue
        if hits:
            count += len(hits)
            print(f"\n  {path} — {len(hits)} finding(s):")
            for lineno, group, word, why, fix in hits:
                print(f"    L{lineno:<4} [{group}] \"{word}\" — {why}; {fix}")
        else:
            print(f"  ok  {path}")

    # The user-level settings file is checked whether or not explicit paths were given.
    settings_notes = check_settings(Path.home() / ".claude" / "settings.json")
    for note in settings_notes:
        print(f"\n  settings: {note}")
    count += len(settings_notes)

    if count:
        summary = str(count) + ' finding(s). Suggestions only — edit by hand, keep code/identifiers as they are.'
    else:
        summary = 'CLEAN — auto-load surface looks neutral.'
    print(f"\n{summary}")
    return 1 if count else 0


if __name__ == "__main__":
    # Script entry point: the value returned by main() becomes the process exit status.
    raise SystemExit(main(sys.argv[1:]))