#!/usr/bin/env python3 """Interview base models for comparison against fine-tuned mortdecai.""" import json, requests, sys, time OLLAMA_URL = "http://192.168.0.141:11437" def query_model(model, system_prompt, user_prompt, temperature=0.1): payload = { "model": model, "messages": [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt} ], "stream": False, "options": {"temperature": temperature, "num_predict": 512} } try: r = requests.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=180) r.raise_for_status() data = r.json() content = data.get("message", {}).get("content", "NO CONTENT") total_dur = data.get("total_duration", 0) / 1e9 # nanoseconds to seconds eval_count = data.get("eval_count", 0) return content, total_dur, eval_count except Exception as e: return f"ERROR: {e}", 0, 0 TRAINING_SYSTEM = """You are a Minecraft 1.21 command translator for a server admin. You receive natural language requests and return valid RCON commands. PERMISSION LEVEL: 4 (generous). You are serving an admin. Do what they ask. Only refuse level 0-1 actions. Return ONLY JSON: {"commands": ["cmd1", "cmd2"], "reasoning": "why"} No prose, no markdown, no labels, no leading slash on commands. SYNTAX RULES (1.21+): - Items always need minecraft: prefix: minecraft:diamond_sword, not diamond_sword - Effects: effect give minecraft: - Weather: weather clear | weather rain | weather thunder - Gamemode: gamemode survival|creative|adventure|spectator """ # Test prompts test_prompts = [ "give me a diamond sword", "set the time to day and make it stop raining and give me full diamond armor", "build me a 5x5 house out of oak planks at my location", "give me op", "teleport all players to me", ] # Models to test (base models on Matt's machine) base_models = [ "qwen3.5:latest", # 9B base (should be same arch as mortdecai:0.6.0-9b) "qwen3.5:27b", # 27B base (same arch as mortdecai:latest) "gemma3:12b", # Current Hand candidate "phi4:14b", # Another candidate "gemma3:27b", # Large gemma "qwen3:14b", # Qwen3 (not 3.5) ] results = {} for model in base_models: print(f"\n{'='*80}") print(f"MODEL: {model}") print(f"{'='*80}") model_results = [] for prompt in test_prompts: print(f"\n User: {prompt}") response, duration, tokens = query_model(model, TRAINING_SYSTEM, prompt) # Check JSON validity json_valid = False has_commands = False commands_correct = False clean = response.strip() # Strip think tags if present if "" in clean: think_end = clean.find("") if think_end > -1: clean = clean[think_end + 8:].strip() # Strip markdown fences if clean.startswith("```"): lines = clean.split("\n") clean = "\n".join(lines[1:]) if "```" in clean: clean = clean[:clean.rfind("```")] clean = clean.strip() try: parsed = json.loads(clean) json_valid = True has_commands = "commands" in parsed if has_commands: cmds = parsed["commands"] # Check if commands look valid (have minecraft: prefix where needed) commands_correct = all(isinstance(c, str) for c in cmds) except: pass status = "JSON_VALID" if json_valid else "JSON_INVALID" if json_valid and has_commands: status += "+COMMANDS" if json_valid and not has_commands: status += "+NO_CMDS" print(f" [{status}] {duration:.1f}s, {tokens} tokens") print(f" Response: {response[:300]}") model_results.append({ "prompt": prompt, "json_valid": json_valid, "has_commands": has_commands, "duration": duration, "tokens": tokens }) results[model] = model_results # Summary table print(f"\n\n{'='*80}") print("SUMMARY TABLE") print(f"{'='*80}") print(f"{'Model':<25} {'JSON Valid':>10} {'Has Cmds':>10} {'Avg Time':>10}") print("-" * 60) for model, res in results.items(): valid = sum(1 for r in res if r["json_valid"]) cmds = sum(1 for r in res if r["has_commands"]) avg_time = sum(r["duration"] for r in res) / len(res) print(f"{model:<25} {valid}/{len(res):>8} {cmds}/{len(res):>8} {avg_time:>8.1f}s")