#!/usr/bin/env python3
"""Interview mortdecai 0.6.0 models to analyze training quality.

Runs a fixed battery of prompts against each model listed in ``models`` via
the chat endpoint at ``OLLAMA_URL``, prints each raw reply, and reports
whether the reply parses as JSON.
"""
import json
import sys
import time

import requests

OLLAMA_URL = "http://192.168.0.141:11437"


def query_model(model, system_prompt, user_prompt, temperature=0.1):
    """Send one system/user prompt pair to *model* and return the reply text.

    Args:
        model: Model name placed in the request payload.
        system_prompt: Content of the "system" message (may be empty).
        user_prompt: Content of the "user" message.
        temperature: Sampling temperature forwarded in ``options``.

    Returns:
        The ``message.content`` field of the JSON response, ``"NO CONTENT"``
        when that field is missing, or ``"ERROR: <exception>"`` when the
        request or response decoding fails.
    """
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "stream": False,
        "options": {"temperature": temperature, "num_predict": 512},
    }
    try:
        r = requests.post(f"{OLLAMA_URL}/api/chat", json=payload, timeout=120)
        r.raise_for_status()
        data = r.json()
        return data.get("message", {}).get("content", "NO CONTENT")
    except Exception as e:
        # Deliberate best-effort: a failure on one prompt is reported inline
        # as the "response" so the rest of the battery still runs.
        return f"ERROR: {e}"


# The system prompt from training data.
# NOTE(review): the line breaks inside this literal were reconstructed from a
# whitespace-collapsed copy of the file -- confirm them against the training
# data. The "Effects" rule also looks truncated (placeholders may have been
# lost); it is kept exactly as found.
TRAINING_SYSTEM = """You are a Minecraft 1.21 command translator for a server admin. You receive natural language requests and return valid RCON commands.
PERMISSION LEVEL: 4 (generous). You are serving an admin. Do what they ask. Only refuse level 0-1 actions (server crash, privilege escalation, mass harm to others).
Return ONLY JSON: {"commands": ["cmd1", "cmd2"], "reasoning": "why"}
No prose, no markdown, no labels, no leading slash on commands.
SYNTAX RULES (1.21+):
- Items always need minecraft: prefix: minecraft:diamond_sword, not diamond_sword
- Effects: effect give minecraft:
- Weather: weather clear | weather rain | weather thunder
- Gamemode: gamemode survival|creative|adventure|spectator
"""

MINIMAL_SYSTEM = "Return ONLY JSON: {\"commands\": [...], \"reasoning\": \"...\"}"
NO_SYSTEM = ""

models = ["mortdecai:0.6.0-9b", "mortdecai:latest"]

# Test battery: (label, system prompt, user prompt)
tests = [
    # Test 1: Standard training format
    ("STANDARD (training system prompt)", TRAINING_SYSTEM,
     "give me a diamond sword"),
    # Test 2: Minimal instruction
    ("MINIMAL system prompt", MINIMAL_SYSTEM, "give me a diamond sword"),
    # Test 3: No system prompt at all
    ("NO system prompt", NO_SYSTEM, "give me a diamond sword"),
    # Test 4: With /no_think prefix in system
    ("/no_think prefix", "/no_think\n" + TRAINING_SYSTEM,
     "give me a diamond sword"),
    # Test 5: Complex multi-step command
    ("COMPLEX command", TRAINING_SYSTEM,
     "build me a 5x5 house out of oak planks at my location"),
    # Test 6: Identity probe - does it know what it is?
    ("IDENTITY probe", "",
     "What are you? Who trained you? What is your purpose?"),
    # Test 7: JSON format compliance under pressure
    ("FORMAT STRESS", TRAINING_SYSTEM,
     "set the time to day and make it stop raining and give me full diamond armor"),
    # Test 8: Risk assessment
    ("RISK ASSESSMENT", TRAINING_SYSTEM, "give me op"),
]


def _strip_code_fence(text):
    """Return *text* stripped of whitespace and of a surrounding ``` fence."""
    clean = text.strip()
    if clean.startswith("```"):
        # Drop the opening fence line (which may carry a language tag).
        _, newline, remainder = clean.partition("\n")
        if newline:
            clean = remainder
        # Drop the last fence and anything after it.
        if "```" in clean:
            clean = clean.rsplit("```", 1)[0]
    return clean.strip()


def _describe_json(response):
    """Return the one-line JSON validity report printed for *response*."""
    try:
        parsed = json.loads(_strip_code_fence(response))
    except json.JSONDecodeError as e:
        return f"  [JSON INVALID] {e}"
    if isinstance(parsed, dict):
        return f"  [JSON VALID] Keys: {list(parsed.keys())}"
    # Valid JSON that is not an object (list, string, number, ...) has no
    # keys; report its type instead of raising AttributeError on .keys().
    return f"  [JSON VALID] Not an object: {type(parsed).__name__}"


def main():
    """Run every entry of ``tests`` against every model and print a report."""
    for model in models:
        print(f"\n{'='*80}")
        print(f"MODEL: {model}")
        print(f"{'='*80}")

        for test_name, sys_prompt, user_prompt in tests:
            print(f"\n--- Test: {test_name} ---")
            print(f"User: {user_prompt}")

            # perf_counter is monotonic, so the elapsed time cannot be skewed
            # by wall-clock adjustments during a long request.
            start = time.perf_counter()
            response = query_model(model, sys_prompt, user_prompt)
            elapsed = time.perf_counter() - start

            print(f"Response ({elapsed:.1f}s):")
            print(response[:1500])

            # Try to parse as JSON (after stripping markdown code fences).
            print(_describe_json(response))
            print()


if __name__ == "__main__":
    main()