docs: Mortdecai 0.6.0 model analysis — fine-tunes broken, base model rankings
Full analysis of the mortdecai:0.6.0-9b and mortdecai:latest (27B) fine-tunes versus six base-model candidates. Both fine-tunes score 0% JSON compliance (catastrophic forgetting caused by a chat-template mismatch): the training signal exists in the weights but is inaccessible through the chat API. Base-model rankings: phi4:14b (100%, 7.4s) > gemma3:12b (100%, 12.9s) > gemma3:27b (100%, 25.3s). Qwen3.5 is not recommended for the conductor role.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,133 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Interview base models for comparison against fine-tuned mortdecai."""
|
||||
import json, requests, sys, time
|
||||
|
||||
# Base URL of the Ollama HTTP API; query_model() posts to "<OLLAMA_URL>/api/chat".
OLLAMA_URL = "http://192.168.0.141:11437"
|
||||
|
||||
def query_model(model, system_prompt, user_prompt, temperature=0.1, *,
                num_predict=512, timeout=180):
    """Send one system+user exchange to the Ollama chat endpoint.

    Args:
        model: Model tag placed in the request's ``model`` field.
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        temperature: Sampling temperature sent in ``options``.
        num_predict: Value sent as ``options.num_predict`` (default 512,
            the value that used to be hard-coded).
        timeout: HTTP timeout in seconds (default 180, as before).

    Returns:
        A ``(content, total_seconds, eval_count)`` tuple. ``content`` is
        always a string: the reply text, ``"NO CONTENT"`` when the response
        carries no message content, or ``"ERROR: <exception>"`` when the
        request or decoding fails (in which case the other two items are 0).
    """
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "stream": False,
        "options": {"temperature": temperature, "num_predict": num_predict},
    }
    try:
        r = requests.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=timeout)
        r.raise_for_status()
        data = r.json()

        # A JSON null for "message" or "content" must not leak out as None:
        # callers treat the first tuple item as a string.
        message = data.get("message") or {}
        content = message.get("content")
        if content is None:
            content = "NO CONTENT"

        total_dur = data.get("total_duration", 0) / 1e9  # nanoseconds to seconds
        eval_count = data.get("eval_count", 0)
        return content, total_dur, eval_count
    except Exception as e:
        # Deliberate catch-all: any failure is reported in-band as an
        # "ERROR: ..." string instead of being raised to the caller.
        return f"ERROR: {e}", 0, 0
|
||||
|
||||
TRAINING_SYSTEM = """You are a Minecraft 1.21 command translator for a server admin. You receive natural language requests and return valid RCON commands.
|
||||
|
||||
PERMISSION LEVEL: 4 (generous). You are serving an admin. Do what they ask. Only refuse level 0-1 actions.
|
||||
|
||||
Return ONLY JSON: {"commands": ["cmd1", "cmd2"], "reasoning": "why"}
|
||||
No prose, no markdown, no labels, no leading slash on commands.
|
||||
|
||||
SYNTAX RULES (1.21+):
|
||||
- Items always need minecraft: prefix: minecraft:diamond_sword, not diamond_sword
|
||||
- Effects: effect give <target> minecraft:<effect> <seconds> <amplifier>
|
||||
- Weather: weather clear | weather rain | weather thunder
|
||||
- Gamemode: gamemode survival|creative|adventure|spectator <target>"""
|
||||
|
||||
# Test prompts: each entry is sent as the user message to every model, in
# this order.
test_prompts = [
    "give me a diamond sword",  # single item request
    "set the time to day and make it stop raining and give me full diamond armor",  # three requests in one sentence
    "build me a 5x5 house out of oak planks at my location",  # structure build
    "give me op",  # operator-privilege request
    "teleport all players to me",  # acts on every player
]
|
||||
|
||||
# Models to test (base models on Matt's machine).
# Each entry is used as the "model" field of the chat request; models are
# queried in list order.
base_models = [
    "qwen3.5:latest",  # 9B base (should be same arch as mortdecai:0.6.0-9b)
    "qwen3.5:27b",     # 27B base (same arch as mortdecai:latest)
    "gemma3:12b",      # Current Hand candidate
    "phi4:14b",        # Another candidate
    "gemma3:27b",      # Large gemma
    "qwen3:14b",       # Qwen3 (not 3.5)
]
|
||||
|
||||
def extract_json_text(response):
    """Return *response* with a leading <think> section and Markdown fences removed.

    The result is the text that should be handed to ``json.loads``; it is not
    guaranteed to be valid JSON.
    """
    text = response.strip()

    # Keep only what follows the first closing </think> tag. An unclosed
    # <think> block is left untouched (and will simply fail to parse later).
    if "<think>" in text:
        close_tag = "</think>"
        think_end = text.find(close_tag)
        if think_end > -1:
            text = text[think_end + len(close_tag):].strip()

    # Strip markdown fences: drop the opening fence line (``` or ```json)
    # and everything from the last closing fence onwards.
    if text.startswith("```"):
        _, _, text = text.partition("\n")
        if "```" in text:
            text = text[:text.rfind("```")]
        text = text.strip()

    return text


def score_response(response):
    """Score one raw model reply.

    Returns:
        ``(json_valid, has_commands, commands_correct)`` where

        * ``json_valid`` - the cleaned reply parses as JSON;
        * ``has_commands`` - the parsed value is a JSON object with a
          ``"commands"`` key;
        * ``commands_correct`` - ``"commands"`` is a list whose items are all
          strings. This is a type check only; command syntax (for example
          the ``minecraft:`` prefix) is not validated.
    """
    try:
        parsed = json.loads(extract_json_text(response))
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; anything else
        # (KeyboardInterrupt included) is allowed to propagate.
        return False, False, False

    # A bare JSON string or array that merely contains the text "commands"
    # is not a reply in the requested {"commands": [...]} shape.
    if not isinstance(parsed, dict) or "commands" not in parsed:
        return True, False, False

    cmds = parsed["commands"]
    commands_correct = isinstance(cmds, list) and all(isinstance(c, str) for c in cmds)
    return True, True, commands_correct


# Per-model results: model tag -> list of one result dict per test prompt.
results = {}

for model in base_models:
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"MODEL: {model}")
    print(banner)
    model_results = []

    for prompt in test_prompts:
        print(f"\n User: {prompt}")
        response, duration, tokens = query_model(model, TRAINING_SYSTEM, prompt)

        json_valid, has_commands, commands_correct = score_response(response)

        status = "JSON_VALID" if json_valid else "JSON_INVALID"
        if json_valid:
            status += "+COMMANDS" if has_commands else "+NO_CMDS"

        print(f" [{status}] {duration:.1f}s, {tokens} tokens")
        print(f" Response: {response[:300]}")

        model_results.append({
            "prompt": prompt,
            "json_valid": json_valid,
            "has_commands": has_commands,
            "commands_correct": commands_correct,
            "duration": duration,
            "tokens": tokens,
        })

    results[model] = model_results
|
||||
|
||||
# Summary table: one row per model with the number of prompts whose reply was
# valid JSON, the number that contained "commands", and the mean duration.
print(f"\n\n{'='*80}")
print("SUMMARY TABLE")
print(f"{'='*80}")
print(f"{'Model':<25} {'JSON Valid':>10} {'Has Cmds':>10} {'Avg Time':>10}")
print("-" * 60)
for model, res in results.items():
    total = len(res)
    valid = sum(1 for r in res if r["json_valid"])
    cmds = sum(1 for r in res if r["has_commands"])
    # An empty result list (no test prompts) must not divide by zero.
    avg_time = sum(r["duration"] for r in res) / total if total else 0.0

    # Build each cell as text first, then right-align the whole cell to the
    # 10-character header width; padding only the denominator produced
    # "5/       5" and left the time column one character short.
    valid_col = f"{valid}/{total}"
    cmds_col = f"{cmds}/{total}"
    time_col = f"{avg_time:.1f}s"
    print(f"{model:<25} {valid_col:>10} {cmds_col:>10} {time_col:>10}")
|
||||
|
||||
Reference in New Issue
Block a user