Add LoRA training scripts and fix bake-off token budget
- training/scripts/train_lora.py: Unsloth QLoRA trainer for qwen3:8b
- training/scripts/train_lora.sh: Launch script for steel141 RTX 3090 Ti
- eval/bakeoff.py: Raised the default token budget (400 -> 1500), because the old limit caused qwen3 models to exhaust their tokens on thinking; added a --no-think flag
- agent/serve.py: Default model changed to gemma3n:e4b

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+15
-4
@@ -31,7 +31,8 @@ RESULTS_DIR = ROOT / "eval" / "results"
|
||||
|
||||
|
||||
def ollama_chat(model: str, messages: list, ollama_url: str,
|
||||
temperature: float = 0.2, max_tokens: int = 400) -> dict:
|
||||
temperature: float = 0.2, max_tokens: int = 1500,
|
||||
no_think: bool = False) -> dict:
|
||||
"""Call Ollama and return response + timing."""
|
||||
payload = {
|
||||
"model": model,
|
||||
@@ -43,6 +44,12 @@ def ollama_chat(model: str, messages: list, ollama_url: str,
|
||||
"num_predict": max_tokens,
|
||||
},
|
||||
}
|
||||
if no_think:
|
||||
# Prepend /no_think to the last user message to disable thinking tokens
|
||||
for msg in reversed(payload["messages"]):
|
||||
if msg["role"] == "user":
|
||||
msg["content"] = "/no_think\n" + msg["content"]
|
||||
break
|
||||
start = time.time()
|
||||
r = requests.post(f"{ollama_url}/api/chat", json=payload, timeout=180)
|
||||
r.raise_for_status()
|
||||
@@ -157,7 +164,7 @@ def score_result(example: dict, actual_cmds: list, parsed: dict) -> dict:
|
||||
}
|
||||
|
||||
|
||||
def run_bakeoff(models: list, ollama_url: str):
|
||||
def run_bakeoff(models: list, ollama_url: str, no_think: bool = False):
|
||||
"""Run all models against the dataset and compare."""
|
||||
# Load dataset
|
||||
with open(DATASET) as f:
|
||||
@@ -166,6 +173,8 @@ def run_bakeoff(models: list, ollama_url: str):
|
||||
print(f"Bake-off: {len(examples)} examples × {len(models)} models")
|
||||
print(f"Ollama: {ollama_url}")
|
||||
print(f"Models: {', '.join(models)}")
|
||||
if no_think:
|
||||
print("Mode: /no_think (thinking tokens disabled)")
|
||||
print("=" * 70)
|
||||
|
||||
all_results = {}
|
||||
@@ -208,7 +217,7 @@ def run_bakeoff(models: list, ollama_url: str):
|
||||
|
||||
# Call LLM
|
||||
try:
|
||||
resp = ollama_chat(model, messages, ollama_url)
|
||||
resp = ollama_chat(model, messages, ollama_url, no_think=no_think)
|
||||
except Exception as e:
|
||||
print(f" [{i+1}/{len(examples)}] ERROR: {e}")
|
||||
results.append({"id": eid, "error": str(e)})
|
||||
@@ -311,9 +320,11 @@ def main():
|
||||
parser.add_argument("--ollama-url", default="http://192.168.0.179:11434")
|
||||
parser.add_argument("--models", nargs="+",
|
||||
default=["qwen3-coder:30b", "gemma3n:e4b"])
|
||||
parser.add_argument("--no-think", action="store_true",
|
||||
help="Prepend /no_think to disable thinking tokens (helps Qwen models)")
|
||||
args = parser.parse_args()
|
||||
|
||||
run_bakeoff(args.models, args.ollama_url)
|
||||
run_bakeoff(args.models, args.ollama_url, no_think=args.no_think)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user