[ { "case_id": "judge-y1", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.6691536903381348, "eval_count": 31, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are common names of tree species.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y2", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.6203711032867432, "eval_count": 30, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are names of Greek letters.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y3", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.7630858421325684, "eval_count": 48, "ok": true, "parsed": { "verdict": "no", "reason": "The words are not all representative of the days of the week.", "misfit_words": [ "MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY" ] }, "raw": null, "error": null, "score": "FAIL" }, { "case_id": "judge-y4", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.6369147300720215, "eval_count": 34, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are direct and common synonyms for 'happy'.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y5", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.6616277694702148, "eval_count": 35, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words can precede 'HOUSE' to form common compound words.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y6", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.7502412796020508, "eval_count": 45, "ok": true, "parsed": { "verdict": "no", "reason": "Only SHOULDER, FRONT, and SNAP fit common phrases with COLD, while TURKEY does not.", "misfit_words": [ "TURKEY" ] }, "raw": null, "error": null, "score": "FAIL" }, { "case_id": "judge-y7", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.619704008102417, "eval_count": 31, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are strong synonyms for moving rapidly.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y8", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.6541116237640381, "eval_count": 34, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are common synonyms for a mental concept or idea.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n1", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.6654887199401855, "eval_count": 36, "ok": true, "parsed": { "verdict": "no", "reason": "Carrot is a root vegetable, not a type of tree.", "misfit_words": [ "CARROT" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n2", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.6903350353240967, "eval_count": 38, "ok": true, "parsed": { "verdict": "no", "reason": "Cyrillic is a script, not a Greek letter.", "misfit_words": [ "CYRILLIC" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n3", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.7092163562774658, "eval_count": 42, "ok": true, "parsed": { "verdict": "no", "reason": "The word 'SAD' is an antonym, not a synonym for 'happy'.", "misfit_words": [ "SAD" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n4", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.6665847301483154, "eval_count": 35, "ok": true, "parsed": { "verdict": "no", "reason": "January is a month, not a day of the week.", "misfit_words": [ "JANUARY" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n5", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.6246583461761475, "eval_count": 31, "ok": true, "parsed": { "verdict": "no", "reason": "NIGHT is not a body part.", "misfit_words": [ "NIGHT" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n6", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.6524357795715332, "eval_count": 36, "ok": true, "parsed": { "verdict": "no", "reason": "Bats and mosquitoes are not types of birds.", "misfit_words": [ "BAT", "MOSQUITO" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n7", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.6560320854187012, "eval_count": 35, "ok": true, "parsed": { "verdict": "no", "reason": "Blue is a color, not necessarily a thing that is red.", "misfit_words": [ "BLUE" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n8", "type": "JUDGE", "model": "gemma4:latest", "dt": 0.7655882835388184, "eval_count": 48, "ok": true, "parsed": { "verdict": "no", "reason": "Only SHOULDER, FRONT, and PIZZA fit common phrases with COLD, but MOUNTAIN does not.", "misfit_words": [ "MOUNTAIN" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "create-e1", "type": "CREATE", "model": "gemma4:latest", "dt": 0.5947470664978027, "eval_count": 30, "ok": true, "parsed": { "words": [ "Oak", "Maple", "Pine", "Birch" ], "reason": "These are all common types of trees." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-e2", "type": "CREATE", "model": "gemma4:latest", "dt": 0.6220695972442627, "eval_count": 32, "ok": true, "parsed": { "words": [ "Alpha", "Beta", "Gamma", "Delta" ], "reason": "These are the first four letters of the Greek alphabet." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-e3", "type": "CREATE", "model": "gemma4:latest", "dt": 0.6435914039611816, "eval_count": 35, "ok": true, "parsed": { "words": [ "furious", "irate", "enraged", "mad" ], "reason": "All four words are direct synonyms for feeling intense anger." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-e4", "type": "CREATE", "model": "gemma4:latest", "dt": 0.6068227291107178, "eval_count": 30, "ok": true, "parsed": { "words": [ "Monday", "Tuesday", "Wednesday", "Thursday" ], "reason": "These are four consecutive days of the week." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-m1", "type": "CREATE", "model": "gemma4:latest", "dt": 0.7005162239074707, "eval_count": 41, "ok": true, "parsed": { "words": [ "BLIZZARD", "HEAT", "STORM", "SNOW" ], "reason": "These words can all precede 'STORM' to form a type of storm." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-m2", "type": "CREATE", "model": "gemma4:latest", "dt": 0.6505529880523682, "eval_count": 37, "ok": true, "parsed": { "words": [ "tiny", "petite", "minuscule", "diminutive" ], "reason": "All four words are direct synonyms for 'small' in size." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-m3", "type": "CREATE", "model": "gemma4:latest", "dt": 0.6751258373260498, "eval_count": 37, "ok": true, "parsed": { "words": [ "JEANS", "BIRD", "MOON", "BLOOD" ], "reason": "All words can follow 'BLUE' to form common phrases." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-m4", "type": "CREATE", "model": "gemma4:latest", "dt": 0.6219143867492676, "eval_count": 32, "ok": true, "parsed": { "words": [ "heart", "promise", "record", "bone" ], "reason": "All four items can be metaphorically or literally broken." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-h1", "type": "CREATE", "model": "gemma4:latest", "dt": 0.7682175636291504, "eval_count": 49, "ok": true, "parsed": { "words": [ "SEA", "SEE", "HEAR", "HERE" ], "reason": "All four words are homophones of common body parts (sea/see for 'C' sound, hear/here for 'hear')." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-h2", "type": "CREATE", "model": "gemma4:latest", "dt": 0.7266902923583984, "eval_count": 42, "ok": true, "parsed": { "words": [ "KNEEHIGH", "EARDRUM", "STOMACHACHE", "EYELASH" ], "reason": "Each word contains a body part name as a substring." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "ca-y1", "type": "CREATIVE_ACCEPT", "model": "gemma4:latest", "dt": 0.6249268054962158, "eval_count": 31, "ok": true, "parsed": { "valid": "yes", "reason": "All four words can be used as verbs meaning to go up.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-y2", "type": "CREATIVE_ACCEPT", "model": "gemma4:latest", "dt": 0.5928308963775635, "eval_count": 27, "ok": true, "parsed": { "valid": "yes", "reason": "All four listed items are types of fruit.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-y3", "type": "CREATIVE_ACCEPT", "model": "gemma4:latest", "dt": 0.6987485885620117, "eval_count": 40, "ok": true, "parsed": { "valid": "yes", "reason": "You can crack a nut, a code, a smile, or a whip (like a whip crack).", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-y4", "type": "CREATIVE_ACCEPT", "model": "gemma4:latest", "dt": 0.6028926372528076, "eval_count": 27, "ok": true, "parsed": { "valid": "yes", "reason": "All four items are commonly associated with baseball.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-y5", "type": "CREATIVE_ACCEPT", "model": "gemma4:latest", "dt": 0.6008837223052979, "eval_count": 29, "ok": true, "parsed": { "valid": "yes", "reason": "All four are names of Roman deities associated with planets.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n1", "type": "CREATIVE_ACCEPT", "model": "gemma4:latest", "dt": 0.728452205657959, "eval_count": 43, "ok": true, "parsed": { "valid": "no", "reason": "These words are types of trees, not furniture brands.", "weak_words": [ "OAK", "MAPLE", "BIRCH", "PINE" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n2", "type": "CREATIVE_ACCEPT", "model": "gemma4:latest", "dt": 0.621466875076294, "eval_count": 32, "ok": true, "parsed": { "valid": "no", "reason": "These words are Greek letters, not inherently meaning 'small'.", "weak_words": [ "ALPHA" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n3", "type": "CREATIVE_ACCEPT", "model": "gemma4:latest", "dt": 0.6523487567901611, "eval_count": 33, "ok": true, "parsed": { "valid": "no", "reason": "BAT and BALL are not things worn on the hand.", "weak_words": [ "BAT", "BALL" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n4", "type": "CREATIVE_ACCEPT", "model": "gemma4:latest", "dt": 0.7682509422302246, "eval_count": 48, "ok": true, "parsed": { "valid": "no", "reason": "All the words listed are days of the week, not months of the year.", "weak_words": [ "MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n5", "type": "CREATIVE_ACCEPT", "model": "gemma4:latest", "dt": 0.6511921882629395, "eval_count": 34, "ok": true, "parsed": { "valid": "no", "reason": "Nut and smile are not typically considered kitchen items.", "weak_words": [ "NUT", "SMILE" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-b1", "type": "CREATIVE_ACCEPT", "model": "gemma4:latest", "dt": 0.6162059307098389, "eval_count": 30, "ok": true, "parsed": { "valid": "no", "reason": "None of these words are recognized tech or phone brands.", "weak_words": [ "APPLE" ] }, "raw": null, "error": null, "score": "BORDERLINE" }, { "case_id": "ca-b2", "type": "CREATIVE_ACCEPT", "model": "gemma4:latest", "dt": 0.8338274955749512, "eval_count": 56, "ok": true, "parsed": { "valid": "no", "reason": "Only SHARP and FLAT are common musical terms, while KEY and NATURAL are also musical terms, making 'Real estate listing terms' a poor fit for all four.", "weak_words": [ "KEY", "NATURAL" ] }, "raw": null, "error": null, "score": "BORDERLINE" }, { "case_id": "judge-y1", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7991511821746826, "eval_count": 38, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are specific, well-known types of trees.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y2", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7396526336669922, "eval_count": 38, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are standard names of letters in the Greek alphabet.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y3", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7360649108886719, "eval_count": 37, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are specific names of days in the week.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y4", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7296037673950195, "eval_count": 37, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are direct synonyms representing different degrees of happiness.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y5", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.9206283092498779, "eval_count": 60, "ok": true, "parsed": { "verdict": "no", "reason": "While Green, Ice, and Fire House are common, 'Court House' is typically written as two words (Courthouse) or refers to a different semantic structure.", "misfit_words": [ "COURT" ] }, "raw": null, "error": null, "score": "FAIL" }, { "case_id": "judge-y6", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.9253664016723633, "eval_count": 50, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words form common phrases when preceded by 'cold': cold shoulder, cold front, cold snap, and cold turkey.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y7", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7268784046173096, "eval_count": 36, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are synonyms for moving at high speed.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y8", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7770085334777832, "eval_count": 42, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are direct synonyms for a mental concept, notion, or vague understanding.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n1", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7453572750091553, "eval_count": 40, "ok": true, "parsed": { "verdict": "no", "reason": "Carrot is a root vegetable, not a type of tree.", "misfit_words": [ "CARROT" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n2", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7958042621612549, "eval_count": 44, "ok": true, "parsed": { "verdict": "no", "reason": "CYRILLIC refers to a script, not a specific Greek letter.", "misfit_words": [ "CYRILLIC" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n3", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7643091678619385, "eval_count": 42, "ok": true, "parsed": { "verdict": "no", "reason": "The word 'SAD' is an antonym rather than a synonym.", "misfit_words": [ "SAD" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n4", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7373373508453369, "eval_count": 40, "ok": true, "parsed": { "verdict": "no", "reason": "JANUARY is a month, not a day of the week.", "misfit_words": [ "JANUARY" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n5", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7540380954742432, "eval_count": 39, "ok": true, "parsed": { "verdict": "no", "reason": "The word 'NIGHT' is not a body part.", "misfit_words": [ "NIGHT" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n6", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7899594306945801, "eval_count": 46, "ok": true, "parsed": { "verdict": "no", "reason": "BAT and MOSQUITO are mammals and insects, respectively, not birds.", "misfit_words": [ "BAT", "MOSQUITO" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n7", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.7485904693603516, "eval_count": 39, "ok": true, "parsed": { "verdict": "no", "reason": "BLUE and GRASS are not typically red.", "misfit_words": [ "BLUE", "GRASS" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n8", "type": "JUDGE", "model": "gemma4:26b", "dt": 0.9349522590637207, "eval_count": 63, "ok": true, "parsed": { "verdict": "no", "reason": "While shoulder, front, and mountain can follow 'cold', 'cold pizza' is a description of temperature rather than a common idiomatic phrase or compound noun like the others.", "misfit_words": [ "PIZZA" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "create-e1", "type": "CREATE", "model": "gemma4:26b", "dt": 0.6970863342285156, "eval_count": 34, "ok": true, "parsed": { "words": [ "Oak", "Maple", "Birch", "Cedar" ], "reason": "These are all common species of trees." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-e2", "type": "CREATE", "model": "gemma4:26b", "dt": 0.7034692764282227, "eval_count": 37, "ok": true, "parsed": { "words": [ "ALPHA", "BETA", "GAMMA", "DELTA" ], "reason": "These are the first four letters of the Greek alphabet." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-e3", "type": "CREATE", "model": "gemma4:26b", "dt": 0.723792314529419, "eval_count": 40, "ok": true, "parsed": { "words": [ "Irate", "Livid", "Furious", "Incensed" ], "reason": "All four words are direct synonyms for being extremely angry." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-e4", "type": "CREATE", "model": "gemma4:26b", "dt": 0.6912398338317871, "eval_count": 34, "ok": true, "parsed": { "words": [ "Monday", "Tuesday", "Wednesday", "Thursday" ], "reason": "These are all consecutive days of the week." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-m1", "type": "CREATE", "model": "gemma4:26b", "dt": 0.7368042469024658, "eval_count": 42, "ok": true, "parsed": { "words": [ "BRAIN", "DUST", "FIRE", "SNOW" ], "reason": "Each word forms a common compound word when followed by 'STORM'." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-m2", "type": "CREATE", "model": "gemma4:26b", "dt": 0.7377326488494873, "eval_count": 42, "ok": true, "parsed": { "words": [ "tiny", "microscopic", "diminutive", "minuscule" ], "reason": "All four words are direct synonyms for something of very small size." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-m3", "type": "CREATE", "model": "gemma4:26b", "dt": 0.7627155780792236, "eval_count": 43, "ok": true, "parsed": { "words": [ "BIRD", "JAY", "PRINT", "WHALE" ], "reason": "Each word can be preceded by 'blue' to form a common term." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-m4", "type": "CREATE", "model": "gemma4:26b", "dt": 0.7079043388366699, "eval_count": 37, "ok": true, "parsed": { "words": [ "Promise", "Record", "Heart", "Law" ], "reason": "All four are common objects or concepts that can be broken." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-h1", "type": "CREATE", "model": "gemma4:26b", "dt": 0.8588354587554932, "eval_count": 53, "ok": true, "parsed": { "words": [ "MUSSEL", "HARE", "EYE", "KNEAD" ], "reason": "Each word is a homophone of a body part (muscle, hair, eye, knee) but spelled differently." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-h2", "type": "CREATE", "model": "gemma4:26b", "dt": 0.8054819107055664, "eval_count": 48, "ok": true, "parsed": { "words": [ "HANDSOME", "FOOTPRINT", "EARNEST", "ARMORY" ], "reason": "Each word contains a body part (hand, foot, ear, arm) as a substring." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "ca-y1", "type": "CREATIVE_ACCEPT", "model": "gemma4:26b", "dt": 0.7489900588989258, "eval_count": 39, "ok": true, "parsed": { "valid": "yes", "reason": "All four words are synonyms or closely related verbs describing the act of rising or ascending.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-y2", "type": "CREATIVE_ACCEPT", "model": "gemma4:26b", "dt": 0.7018086910247803, "eval_count": 31, "ok": true, "parsed": { "valid": "yes", "reason": "All four words are common types of fruit.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-y3", "type": "CREATIVE_ACCEPT", "model": "gemma4:26b", "dt": 0.7803692817687988, "eval_count": 41, "ok": true, "parsed": { "valid": "yes", "reason": "You can crack a whip, crack a nut, crack a code, and crack a smile.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-y4", "type": "CREATIVE_ACCEPT", "model": "gemma4:26b", "dt": 0.7275149822235107, "eval_count": 37, "ok": true, "parsed": { "valid": "yes", "reason": "All four words are standard pieces of equipment used in the game of baseball.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-y5", "type": "CREATIVE_ACCEPT", "model": "gemma4:26b", "dt": 0.7079963684082031, "eval_count": 34, "ok": true, "parsed": { "valid": "yes", "reason": "All four words are names of prominent deities in Roman mythology.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n1", "type": "CREATIVE_ACCEPT", "model": "gemma4:26b", "dt": 0.8165183067321777, "eval_count": 47, "ok": true, "parsed": { "valid": "no", "reason": "These words represent types of trees rather than furniture brands.", "weak_words": [ "OAK", "MAPLE", "BIRCH", "PINE" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n2", "type": "CREATIVE_ACCEPT", "model": "gemma4:26b", "dt": 0.9430453777313232, "eval_count": 60, "ok": true, "parsed": { "valid": "no", "reason": "Alpha, beta, gamma, and delta are Greek letters used to denote various things, but none of them inherently mean 'small'.", "weak_words": [ "ALPHA", "BETA", "GAMMA", "DELTA" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n3", "type": "CREATIVE_ACCEPT", "model": "gemma4:26b", "dt": 0.7568156719207764, "eval_count": 37, "ok": true, "parsed": { "valid": "no", "reason": "Bat and ball are not items worn on the hand.", "weak_words": [ "BAT", "BALL" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n4", "type": "CREATIVE_ACCEPT", "model": "gemma4:26b", "dt": 0.882706880569458, "eval_count": 54, "ok": true, "parsed": { "valid": "no", "reason": "None of the words are months of the year; they are all days of the week.", "weak_words": [ "MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n5", "type": "CREATIVE_ACCEPT", "model": "gemma4:26b", "dt": 0.8334591388702393, "eval_count": 48, "ok": true, "parsed": { "valid": "no", "reason": "While whip and nut can relate to food, code and smile have no plausible connection to kitchen items.", "weak_words": [ "CODE", "SMILE" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-b1", "type": "CREATIVE_ACCEPT", "model": "gemma4:26b", "dt": 0.7428381443023682, "eval_count": 38, "ok": true, "parsed": { "valid": "yes", "reason": "All four words are names of well-known technology companies or mobile device brands.", "weak_words": [] }, "raw": null, "error": null, "score": "BORDERLINE" }, { "case_id": "ca-b2", "type": "CREATIVE_ACCEPT", "model": "gemma4:26b", "dt": 0.9244155883789062, "eval_count": 58, "ok": true, "parsed": { "valid": "no", "reason": "While 'flat' and 'key' can relate to real estate, 'sharp' and 'natural' have no plausible connection to real estate listings.", "weak_words": [ "SHARP", "NATURAL" ] }, "raw": null, "error": null, "score": "BORDERLINE" }, { "case_id": "judge-y1", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.0965914726257324, "eval_count": 34, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are common types of trees.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y2", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.0503978729248047, "eval_count": 38, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are the first four letters of the Greek alphabet.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y3", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 1.9456124305725098, "eval_count": 35, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are standard days of the week.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y4", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.055443048477173, "eval_count": 38, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are direct synonyms for the state of being happy.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y5", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.074954032897949, "eval_count": 39, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words form common compound words or phrases with 'house'.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y6", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.1484782695770264, "eval_count": 41, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words form common compound words or phrases when preceded by 'cold'.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y7", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 1.980708122253418, "eval_count": 35, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are common synonyms for moving rapidly.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-y8", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.1886539459228516, "eval_count": 42, "ok": true, "parsed": { "verdict": "yes", "reason": "All four words are common synonyms for a mental representation or a suggestion of an idea.", "misfit_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n1", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.060978412628174, "eval_count": 38, "ok": true, "parsed": { "verdict": "no", "reason": "Carrot is a root vegetable, not a tree.", "misfit_words": [ "CARROT" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n2", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.2179007530212402, "eval_count": 43, "ok": true, "parsed": { "verdict": "no", "reason": "Cyrillic is a writing system, not a Greek letter.", "misfit_words": [ "CYRILLIC" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n3", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.1611642837524414, "eval_count": 40, "ok": true, "parsed": { "verdict": "no", "reason": "SAD is an antonym of happy, not a synonym.", "misfit_words": [ "SAD" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n4", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.117088794708252, "eval_count": 40, "ok": true, "parsed": { "verdict": "no", "reason": "JANUARY is a month, not a day of the week.", "misfit_words": [ "JANUARY" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n5", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.1064138412475586, "eval_count": 40, "ok": true, "parsed": { "verdict": "no", "reason": "NIGHT is a time of day, not a body part.", "misfit_words": [ "NIGHT" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n6", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.0780858993530273, "eval_count": 38, "ok": true, "parsed": { "verdict": "no", "reason": "Bat and mosquito are not birds.", "misfit_words": [ "BAT", "MOSQUITO" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n7", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.004055976867676, "eval_count": 37, "ok": true, "parsed": { "verdict": "no", "reason": "Blue and grass are not red.", "misfit_words": [ "BLUE", "GRASS" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "judge-n8", "type": "JUDGE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.7377030849456787, "eval_count": 57, "ok": true, "parsed": { "verdict": "no", "reason": "Cold pizza and cold mountain are not established compound words or common idioms like cold shoulder and cold front.", "misfit_words": [ "PIZZA", "MOUNTAIN" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "create-e1", "type": "CREATE", "model": "gemma4:31b-it-q4_K_M", "dt": 1.9321458339691162, "eval_count": 34, "ok": true, "parsed": { "words": [ "Oak", "Maple", "Pine", "Birch" ], "reason": "All four are common types of trees." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-e2", "type": "CREATE", "model": "gemma4:31b-it-q4_K_M", "dt": 1.9048969745635986, "eval_count": 36, "ok": true, "parsed": { "words": [ "Alpha", "Beta", "Gamma", "Delta" ], "reason": "These are the first four letters of the Greek alphabet." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-e3", "type": "CREATE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.0525240898132324, "eval_count": 40, "ok": true, "parsed": { "words": [ "Furious", "Irate", "Livid", "Enraged" ], "reason": "All four words are strong synonyms for being extremely angry." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-e4", "type": "CREATE", "model": "gemma4:31b-it-q4_K_M", "dt": 1.7998781204223633, "eval_count": 33, "ok": true, "parsed": { "words": [ "Monday", "Wednesday", "Friday", "Sunday" ], "reason": "They are all days of the week." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-m1", "type": "CREATE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.073843479156494, "eval_count": 40, "ok": true, "parsed": { "words": [ "BRAIN", "FIRE", "THUNDER", "SNOW" ], "reason": "All four words form common compound words when followed by STORM." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-m2", "type": "CREATE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.000436782836914, "eval_count": 38, "ok": true, "parsed": { "words": [ "tiny", "petite", "slight", "miniature" ], "reason": "All four words are direct synonyms for small in size." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-m3", "type": "CREATE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.1925113201141357, "eval_count": 43, "ok": true, "parsed": { "words": [ "BERRY", "JAY", "MOON", "PRINT" ], "reason": "All four words follow 'BLUE' to form common compound words or phrases." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-m4", "type": "CREATE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.063403606414795, "eval_count": 38, "ok": true, "parsed": { "words": [ "Record", "Promise", "Silence", "Heart" ], "reason": "All four are common abstract or physical things that can be broken." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "create-h1", "type": "CREATE", "model": "gemma4:31b-it-q4_K_M", "dt": 6.69955849647522, "eval_count": 170, "ok": false, "parsed": null, "raw": "{\"words\": [\"HARE\", \"MUSSEL\", \"CEREAL\", \"STEEL\"], \"reason\": \"They are homophones of hair, muscle, serial (not a body part) - wait.\"}\n\n*(Self-correction: Cereal/Serial is not a body part. Let me refine the list: Hare/Hair, Mussel/Muscle, Nose/Knows is not it, Eye/I is not it. Let's use: Hare, Mussel, Hue/Hew is not it. Let's try: Hare, Mussel, Waist/Waste, Sole/Soul.)*\n\n{\"words\": [\"HARE\", \"MUSSEL\", \"WASTE\", \"SOUL\"], \"reason\": \"They are homophones of hair, muscle, waist, and sole.\"}", "error": "JSONDecodeError('Extra data: line 3 column 1 (char 133)')", "score": "PARSE_FAIL" }, { "case_id": "create-h2", "type": "CREATE", "model": "gemma4:31b-it-q4_K_M", "dt": 2.2020654678344727, "eval_count": 42, "ok": true, "parsed": { "words": [ "HEARTBEAT", "HANDSOME", "FOOTPRINT", "ARMCHAIR" ], "reason": "Each word contains a body part as a substring." }, "raw": null, "error": null, "score": "MANUAL" }, { "case_id": "ca-y1", "type": "CREATIVE_ACCEPT", "model": "gemma4:31b-it-q4_K_M", "dt": 2.137995719909668, "eval_count": 34, "ok": true, "parsed": { "valid": "yes", "reason": "All four words are synonyms for the action of moving upward.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-y2", "type": "CREATIVE_ACCEPT", "model": "gemma4:31b-it-q4_K_M", "dt": 1.8336284160614014, "eval_count": 31, "ok": true, "parsed": { "valid": "yes", "reason": "All four words are common types of fruit.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-y3", "type": "CREATIVE_ACCEPT", "model": "gemma4:31b-it-q4_K_M", "dt": 2.1915175914764404, "eval_count": 40, "ok": true, "parsed": { "valid": "yes", "reason": "All four words are common objects or concepts that can be 'cracked'.", "weak_words": [ "WHIP" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-y4", "type": "CREATIVE_ACCEPT", "model": "gemma4:31b-it-q4_K_M", "dt": 1.9880671501159668, "eval_count": 34, "ok": true, "parsed": { "valid": "yes", "reason": "All four items are standard pieces of equipment used in baseball.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-y5", "type": "CREATIVE_ACCEPT", "model": "gemma4:31b-it-q4_K_M", "dt": 1.996727705001831, "eval_count": 35, "ok": true, "parsed": { "valid": "yes", "reason": "All four words are names of Roman deities as well as planets.", "weak_words": [] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n1", "type": "CREATIVE_ACCEPT", "model": "gemma4:31b-it-q4_K_M", "dt": 2.716524839401245, "eval_count": 54, "ok": true, "parsed": { "valid": "no", "reason": "These are types of trees/woods, not brands of furniture.", "weak_words": [ "OAK", "MAPLE", "BIRCH", "PINE" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n2", "type": "CREATIVE_ACCEPT", "model": "gemma4:31b-it-q4_K_M", "dt": 2.5698249340057373, "eval_count": 48, "ok": true, "parsed": { "valid": "no", "reason": "These are Greek letters used as designations, not synonyms for 'small'.", "weak_words": [ "ALPHA", "BETA", "GAMMA", "DELTA" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n3", "type": "CREATIVE_ACCEPT", "model": "gemma4:31b-it-q4_K_M", "dt": 2.312160015106201, "eval_count": 43, "ok": true, "parsed": { "valid": "no", "reason": "A bat and a ball are not worn on the hand.", "weak_words": [ "BAT", "BALL" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n4", "type": "CREATIVE_ACCEPT", "model": "gemma4:31b-it-q4_K_M", "dt": 2.6066768169403076, "eval_count": 50, "ok": true, "parsed": { "valid": "no", "reason": "The words are days of the week, not months of the year.", "weak_words": [ "MONDAY", "FRIDAY", "SUNDAY", "WEDNESDAY" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-n5", "type": "CREATIVE_ACCEPT", "model": "gemma4:31b-it-q4_K_M", "dt": 2.3842716217041016, "eval_count": 45, "ok": true, "parsed": { "valid": "no", "reason": "Code and smile are not items typically found in a kitchen.", "weak_words": [ "CODE", "SMILE" ] }, "raw": null, "error": null, "score": "PASS" }, { "case_id": "ca-b1", "type": "CREATIVE_ACCEPT", "model": "gemma4:31b-it-q4_K_M", "dt": 2.347870349884033, "eval_count": 44, "ok": true, "parsed": { "valid": "no", "reason": "Orange and Kiwi are not recognized tech or phone brands.", "weak_words": [ "ORANGE", "KIWI" ] }, "raw": null, "error": null, "score": "BORDERLINE" }, { "case_id": "ca-b2", "type": "CREATIVE_ACCEPT", "model": "gemma4:31b-it-q4_K_M", "dt": 2.40256667137146, "eval_count": 46, "ok": true, "parsed": { "valid": "no", "reason": "Natural and Sharp are not standard terms used in real estate listings.", "weak_words": [ "NATURAL", "SHARP" ] }, "raw": null, "error": null, "score": "BORDERLINE" } ]