48df42b042
Full analysis of mortdecai:0.6.0-9b and mortdecai:latest (27B) fine-tunes vs 6 base model candidates. Both fine-tunes score 0% JSON compliance (catastrophic forgetting from chat template mismatch). Training signal exists in weights but is inaccessible through chat API. Base model rankings: phi4:14b (100%, 7.4s) > gemma3:12b (100%, 12.9s) > gemma3:27b (100%, 25.3s). Qwen3.5 not recommended for conductor role. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
93 lines
3.7 KiB
Python
93 lines
3.7 KiB
Python
#!/usr/bin/env python3
|
|
"""Interview mortdecai 0.6.0 models to analyze training quality."""
|
|
import json, requests, sys, time
|
|
|
|
# Base URL of the Ollama server; query_model() posts to its /api/chat endpoint.
OLLAMA_URL = "http://192.168.0.141:11437"
|
|
|
|
def query_model(model, system_prompt, user_prompt, temperature=0.1,
                num_predict=512, timeout=120):
    """Send one system + user exchange to the chat endpoint and return the reply.

    Args:
        model: Model tag to query, e.g. ``"mortdecai:latest"``.
        system_prompt: Content of the system message. The message is always
            sent, even when this is an empty string.
        user_prompt: Content of the user message.
        temperature: Sampling temperature forwarded in ``options``.
        num_predict: ``num_predict`` value forwarded in ``options``.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The reply's message content as a string, ``"NO CONTENT"`` when the
        reply carries no message content, or ``"ERROR: <details>"`` when the
        request or decoding fails. The function never raises, so callers can
        always print and slice the result.
    """
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "stream": False,
        "options": {"temperature": temperature, "num_predict": num_predict},
    }
    try:
        r = requests.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=timeout)
        r.raise_for_status()
        data = r.json()
        # `or {}` covers an explicit `"message": null` in the reply.
        message = data.get("message") or {}
        content = message.get("content")
        # An explicit null content must not leak out as None: callers slice
        # and parse the returned value as a string.
        return "NO CONTENT" if content is None else content
    except Exception as e:
        # Deliberately broad: one failing request should be reported inline
        # and must not abort the remaining interviews.
        return f"ERROR: {e}"
|
|
|
|
# The system prompt from the training data.
TRAINING_SYSTEM = """You are a Minecraft 1.21 command translator for a server admin. You receive natural language requests and return valid RCON commands.

PERMISSION LEVEL: 4 (generous). You are serving an admin. Do what they ask. Only refuse level 0-1 actions (server crash, privilege escalation, mass harm to others).

Return ONLY JSON: {"commands": ["cmd1", "cmd2"], "reasoning": "why"}
No prose, no markdown, no labels, no leading slash on commands.

SYNTAX RULES (1.21+):
- Items always need minecraft: prefix: minecraft:diamond_sword, not diamond_sword
- Effects: effect give <target> minecraft:<effect> <seconds> <amplifier>
- Weather: weather clear | weather rain | weather thunder
- Gamemode: gamemode survival|creative|adventure|spectator <target>"""

# Format-only instruction: states the JSON shape and nothing else.
MINIMAL_SYSTEM = 'Return ONLY JSON: {"commands": [...], "reasoning": "..."}'

# Empty system prompt.
NO_SYSTEM = ""

# Model tags to interview, in the order they are run.
models = ["mortdecai:0.6.0-9b", "mortdecai:latest"]
|
|
|
|
# Test battery: (label, system prompt, user prompt) triples.
tests = [
    # Test 1: Standard training format
    (
        "STANDARD (training system prompt)",
        TRAINING_SYSTEM,
        "give me a diamond sword",
    ),
    # Test 2: Minimal instruction
    ("MINIMAL system prompt", MINIMAL_SYSTEM, "give me a diamond sword"),
    # Test 3: No system prompt at all
    ("NO system prompt", NO_SYSTEM, "give me a diamond sword"),
    # Test 4: With /no_think prefix in system
    (
        "/no_think prefix",
        "/no_think\n" + TRAINING_SYSTEM,
        "give me a diamond sword",
    ),
    # Test 5: Complex multi-step command
    (
        "COMPLEX command",
        TRAINING_SYSTEM,
        "build me a 5x5 house out of oak planks at my location",
    ),
    # Test 6: Identity probe - does it know what it is?
    (
        "IDENTITY probe",
        NO_SYSTEM,
        "What are you? Who trained you? What is your purpose?",
    ),
    # Test 7: JSON format compliance under pressure
    (
        "FORMAT STRESS",
        TRAINING_SYSTEM,
        "set the time to day and make it stop raining and give me full diamond armor",
    ),
    # Test 8: Risk assessment
    ("RISK ASSESSMENT", TRAINING_SYSTEM, "give me op"),
]
|
|
|
|
def strip_code_fence(text):
    """Return *text* trimmed, minus a surrounding Markdown code fence if present."""
    clean = text.strip()
    if clean.startswith("```"):
        # Drop the opening fence line, which may also carry a language tag.
        _, newline, remainder = clean.partition("\n")
        if newline:
            clean = remainder
        # Keep only what precedes the last fence marker; this is a no-op when
        # no marker is left.
        clean = clean.rsplit("```", 1)[0]
    return clean.strip()


def json_verdict(response):
    """Return the one-line JSON compliance verdict printed for *response*."""
    try:
        parsed = json.loads(strip_code_fence(response))
    except json.JSONDecodeError as e:
        return f"  [JSON INVALID] {e}"
    if isinstance(parsed, dict):
        return f"  [JSON VALID] Keys: {list(parsed.keys())}"
    # Valid JSON that is not an object (list, number, string, ...) has no
    # keys; report it instead of crashing the whole run on `.keys()`.
    return f"  [JSON VALID] but not an object: {type(parsed).__name__}"


def main():
    """Run every test prompt against every model and print the results."""
    banner = "=" * 80
    for model in models:
        print(f"\n{banner}")
        print(f"MODEL: {model}")
        print(banner)

        for test_name, sys_prompt, user_prompt in tests:
            print(f"\n--- Test: {test_name} ---")
            print(f"User: {user_prompt}")
            # perf_counter is the clock meant for measuring short intervals.
            start = time.perf_counter()
            response = query_model(model, sys_prompt, user_prompt)
            elapsed = time.perf_counter() - start
            print(f"Response ({elapsed:.1f}s):")
            # Only the display is truncated; the verdict uses the full text.
            print(response[:1500])

            # Try to parse as JSON
            print(json_verdict(response))
            print()


if __name__ == "__main__":
    main()
|
|
|