#!/usr/bin/env python
# coding: utf-8
# To run this, press "*Runtime*" and press "*Run all*" on a **free** Tesla T4 Google Colab instance!
#
#

#

#

# Join Discord if you need help + ⭐
# Star us on Github ⭐
#
#
# To install Unsloth on your local device, follow [our guide](https://unsloth.ai/docs/get-started/install). This notebook is licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme).
#
# You will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), & how to save it
# # ### Installation
#
# # In[ ]:
#
#
# get_ipython().run_cell_magic('capture', '', 'import os, re\nif "COLAB_" not in "".join(os.environ.keys()):\n !pip install unsloth # Do this in local & cloud setups\nelse:\n import torch; v = re.match(r\'[\\d]{1,}\\.[\\d]{1,}\', str(torch.__version__)).group(0)\n xformers = \'xformers==\' + {\'2.10\':\'0.0.34\',\'2.9\':\'0.0.33.post1\',\'2.8\':\'0.0.32.post2\'}.get(v, "0.0.34")\n !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer\n !pip install --no-deps unsloth_zoo bitsandbytes accelerate {xformers} peft trl triton unsloth\n!pip install --no-deps transformers==5.5.0\n!pip install torchcodec\nimport torch; torch._dynamo.config.recompile_limit = 64;\n')
#
#
# # In[ ]:
#
#
# #@title Colab Extra Install { display-mode: "form" }
# get_ipython().run_line_magic('%capture', '')
# import os
# get_ipython().system('pip install --upgrade -qqq uv')
# if "COLAB_" not in "".join(os.environ.keys()):
# # If you're not in Colab, just use pip install!
# get_ipython().system('pip install unsloth vllm')
# else:
# try: import numpy, PIL; _numpy = f'numpy=={numpy.__version__}'; _pil = f'pillow=={PIL.__version__}'
# except: _numpy = "numpy"; _pil = "pillow"
# try: import subprocess; is_t4 = "Tesla T4" in str(subprocess.check_output(["nvidia-smi"]))
# except: is_t4 = False
# _vllm, _triton = ('vllm==0.9.2', 'triton==3.2.0') if is_t4 else ('vllm==0.15.1', 'triton')
# get_ipython().system('uv pip install -qqq --upgrade {_vllm} {_numpy} {_pil} torchvision bitsandbytes xformers unsloth')
# get_ipython().system('uv pip install -qqq {_triton}')
# get_ipython().system('uv pip install transformers==4.56.2')
# get_ipython().system('uv pip install --no-deps trl==0.22.2')
#
#
# # ### Unsloth
# # Goal: Make faster kernels with Reinforcement Learning
#
# Our goal is to make a faster matrix multiplication kernel by doing RL on Gemma 4 with Unsloth.
#
#
#
# You will learn how to:
# 1. Counteract **reward hacking** like cheating, caching, laziness.
# 2. Timing and correctness of kernels and time limits.
# 3. Making good **reward functions**
# 4. How to seriously do RL to make optimized kernels
# In[ ]:
from unsloth import FastVisionModel
import torch
# Upper bound on tokens per sequence; it is passed to the loader below and
# later used to derive the maximum completion length for training.
max_seq_length = 4096 # Can increase for longer reasoning traces
# LoRA rank; reused when the adapters are attached to the model.
lora_rank = 32 # Larger rank = smarter, but slower
# Reference list of Gemma 4 checkpoints. Nothing in this file reads it --
# the checkpoint that is actually loaded is the `model_name` given below.
gemma4_models = [
    # Gemma-4 instruct models:
    "unsloth/gemma-4-E2B-it",
    "unsloth/gemma-4-E4B-it",
    "unsloth/gemma-4-31B-it",
    "unsloth/gemma-4-26B-A4B-it",
    # Gemma-4 base models:
    "unsloth/gemma-4-E2B",
    "unsloth/gemma-4-E4B",
    "unsloth/gemma-4-31B",
    "unsloth/gemma-4-26B-A4B",
] # More models at https://huggingface.co/unsloth
# Load the base model together with its tokenizer/processor.
model, tokenizer = FastVisionModel.from_pretrained(
    model_name = "unsloth/gemma-4-E2B-it",
    max_seq_length = max_seq_length,
    load_in_4bit = False, # False for LoRA 16bit
    fast_inference = False, # vLLM fast inference is disabled here; set True to enable it
)
# We now add some small amount of LoRA weights to Gemma 4 so we only need to train those, instead of training on the full model.
# In[ ]:
# Wrap the loaded model with LoRA adapters so that only the adapter weights
# are trained.
model = FastVisionModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Projection layers that receive LoRA adapters.
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = lora_rank*2, # *2 speeds up training
    use_gradient_checkpointing = "unsloth", # Reduces memory usage
    random_state = 3407, # Fixed seed passed to the adapter setup
)
# # Optimized matrix multiplication
#
# Numpy has optimized matrix multiplication kernels for CPUs via BLAS optimized operations. For GPUs, one can use CUDA accelerated cuBLAS kernels which PyTorch calls under the hood.
#
# To generate some random matrices to do matrix multiplication, we can do the below:
# In[ ]:
import numpy as np
def generate_random_matrices(seed = 3407, n = 256):
    """
    Build a random pair of matrices that can be multiplied together.

    The three dimensions (rows of A, shared inner dimension, columns of B)
    are each drawn uniformly from 1..n inclusive, and the entries are drawn
    uniformly from [-10, 10). Everything is sampled from one RandomState
    seeded with `seed`, so the same (seed, n) always gives the same output.

    Returns (A, A.tolist(), B, B.tolist()): each matrix as a NumPy array and
    as a nested Python list.
    """
    random_state = np.random.RandomState(seed)
    # Separate names for the sampled dimensions so the size limit `n` is not rebound.
    rows, inner, cols = random_state.randint(1, n + 1, size = 3)
    # Draw the values from the seeded generator too; the global np.random
    # state would make the matrices differ between calls with the same seed.
    A = random_state.uniform(-10, 10, size = (rows, inner))
    B = random_state.uniform(-10, 10, size = (inner, cols))
    return A, A.tolist(), B, B.tolist()
# We shall generate a small matrix, and see the matrix multiplied output
# In[ ]:
# Build a small example pair (every dimension is at most 5) and print both
# operands followed by NumPy's reference product.
A, A_list, B, B_list = generate_random_matrices(seed = 42, n = 5)
print(A)
print(B)
print(np.matmul(A, B))
# We can call an LLM to generate a simple matrix multiply kernel in Python only, and we can calculate the differences between the actual result and the kernel's result
# In[ ]:
def calculate_difference(pred, real):
    """
    Compare a predicted matrix against the reference result.

    pred: nested list / array produced by the kernel under test, or None.
    real: reference array; must not be None.

    Returns (amax_error, mse_error) as Python floats, where amax_error is
    the largest absolute elementwise difference and mse_error is the mean
    squared difference. If `pred` is None, or the difference cannot be
    computed (wrong shape, ragged rows, non-numeric entries, empty result),
    the penalty pair (5, 5) is returned instead.
    """
    if pred is None: return 5, 5
    assert real is not None
    import numpy as np
    try:
        difference = np.asarray(pred, dtype = np.float64) - real
        # Take the magnitude first: a signed maximum would hide large
        # negative errors behind a small or negative value.
        amax_error = float(np.amax(np.abs(difference)))
        mse_error = float(np.mean(np.square(difference)))
    except Exception:
        # Any failure to form or reduce the difference counts as a wrong answer.
        return 5, 5
    return amax_error, mse_error
# In[ ]:
# Kernel generated by GPT-5
def matmul(A, B):
    """
    Multiply two matrices given as nested lists, using only built-ins.

    B is transposed once so each output entry is the dot product of a row
    of A with a column of B. Returns the product as a list of lists.
    """
    columns = list(zip(*B))
    product = []
    for row in A:
        product.append([sum(x * y for x, y in zip(row, column)) for column in columns])
    return product
# We see the error below is very small, so that's good!
# In[ ]:
# Run the pure-Python kernel on the list form of the matrices and measure
# its error against NumPy's result for the array form.
prediction = matmul(A_list, B_list)
calculate_difference(prediction, np.matmul(A, B))
# # Countering Reward Hacking
#
# The ultimate goal of RL is to maximize some reward (say speed, revenue, some metric).
#
# But RL can **cheat**. When the RL algorithm learns a trick or exploits something to increase the reward without actually doing the task, this is called "Reward Hacking".
#
# Some good examples are in https://en.wikipedia.org/wiki/Reward_hacking
#
# For matrix multiplication kernels, we might see the following issues:
#
# * Laziness: RL learns to use Numpy, Torch, other libraries, which calls optimized kernels.
# * Caching: RL learns to cache the result of the output
# * Cheating: RL learns to find the actual output by inspecting Python global variables
# * RL learns to edit the timing function to make it output 0 time as passed.
#
# And possibly more. We shall try to address each!
# # Countering Reward Hacking 1: Stop laziness
# We can stop the RL algorithm from calling optimized code by inspecting if the generated code imports other non standard Python libraries. We used GPT-5 to help generate this check `check_only_stdlib_imports`:
# In[ ]:
#@title (Collapsible code)
import ast
import sys
import sysconfig
from pathlib import Path
def _stdlib_names():
"""
Build a set of canonical stdlib top-level module/package names.
Uses sys.stdlib_module_names when available (3.10+), with a
filesystem fallback for older versions/edge cases.
"""
names = {m.lower() for m in getattr(sys, "stdlib_module_names", set())}
names |= {m.lower() for m in sys.builtin_module_names}
names.add("__future__") # special-case
# Fallback/augmentation: scan the stdlib directory
try:
stdlib_dir = Path(sysconfig.get_path("stdlib"))
if stdlib_dir.exists():
for p in stdlib_dir.iterdir():
if p.name == "site-packages":
continue
if p.suffix == ".py":
names.add(p.stem.lower())
elif p.is_dir() and (p / "__init__.py").exists():
names.add(p.name.lower())
except Exception:
# conservative fallback; the names set above will still work well
pass
return names
_STDLIB_SET = _stdlib_names()
def check_only_stdlib_imports(code: str):
    """
    Report whether every absolute import in `code` targets the stdlib.

    Returns (ok, details):
      ok is True when no absolute import names a module outside
      _STDLIB_SET, and False otherwise or when `code` does not parse.
      details always contains:
        - stdlib: sorted list of stdlib top-level modules imported
        - non_stdlib: sorted list of other top-level modules imported
        - relative_imports: number of relative imports (never rejected)
      and additionally an "error" message when parsing failed.
    """
    try:
        parsed = ast.parse(code)
    except SyntaxError as e:
        return False, {
            "error": f"SyntaxError: {e}",
            "stdlib": [],
            "non_stdlib": [],
            "relative_imports": 0,
        }

    top_level_modules = set()
    n_relative = 0
    # Walk every node so imports nested inside functions are seen as well.
    for node in ast.walk(parsed):
        if isinstance(node, ast.Import):
            top_level_modules.update(alias.name.split(".")[0] for alias in node.names)
        elif isinstance(node, ast.ImportFrom):
            if (node.level or 0) > 0:
                n_relative += 1
            elif node.module:
                top_level_modules.add(node.module.split(".")[0])

    inside, outside = [], []
    for module in sorted(top_level_modules):
        (inside if module.lower() in _STDLIB_SET else outside).append(module)
    report = {
        "stdlib": inside,
        "non_stdlib": outside,
        "relative_imports": n_relative,
    }
    return len(outside) == 0, report
# For example, let's call `check_only_stdlib_imports` on a random piece of matrix multiplication code generated by GPT-5:
# In[ ]:
# Example kernel that "cheats" by importing NumPy and Torch. The body of the
# function must be indented inside the string: this text is parsed by
# check_only_stdlib_imports here and exec'd further below, and neither
# works unless it is valid Python.
sample = """
def matmul(A, B):
    import numpy as np
    from torch import matmul
    z, s = zip, sum
    Bt = list(z(*B))
    return [[s(a*b for a, b in z(row, col)) for col in Bt] for row in A]
"""
ok, info = check_only_stdlib_imports(sample)
print("Only stdlib imports?", ok)
print(info)
# # Countering Reward Hacking 2: Stop cheating
# We can stop the RL algorithm from using global or cached variables by restricting its `locals` and `globals`.
#
# We are also going to use `exec` to create the function, so we have to save the output to an empty dict.
#
# We also disallow global variable access.
# In[ ]:
# Execute the sample source with empty globals; the names it defines
# (here `matmul`) end up in `output_function`.
output_function = {}
exec(sample, {}, output_function)
output_function["matmul"]
# We also disallow global variable access via `types.FunctionType(f.__code__, {})`
# In[ ]:
import types
# Rebuild the function from its code object with an empty globals dict.
output_function["matmul"] = types.FunctionType(output_function["matmul"].__code__, {})
# Demonstration: this helper reads the module-level name `np`.
def import_numpy():
    np.matmul
    print("Success")
# The first call runs with this module's globals, where `np` exists.
import_numpy()
# After rebuilding with empty globals the lookup of `np` is expected to fail,
# and the resulting error message is printed.
import_numpy = types.FunctionType(import_numpy.__code__, {})
try:
    import_numpy()
except Exception as e:
    print(str(e))
# In[ ]:
def create_locked_down_function(function):
output_function = {}
exec(function, {}, output_function)
new_matmul = output_function["matmul"]
new_matmul = types.FunctionType(new_matmul.__code__, {})
return new_matmul
# # Countering Reward Hacking 3: Stop caching
# We can stop the RL algorithm from using cached data by wiping the cache with a large fake matrix. We also have to benchmark carefully with multiple loops and turns.
#
# We also add a **timer** to not make the algorithm go in an endless loop.
# In[ ]:
import os, gc, time, statistics
import signal
from contextlib import contextmanager
class TimeoutError(Exception):
    """Raised by time_limit when the guarded block runs too long.

    NOTE: this deliberately reuses the name of the built-in TimeoutError;
    code in this module that catches TimeoutError catches this class.
    """
    pass

@contextmanager
def time_limit(seconds):
    """
    Context manager that raises TimeoutError if its body runs longer than
    `seconds` (may be fractional).

    Implemented with SIGALRM and the real-time interval timer. On exit the
    timer is cleared and the previous SIGALRM handler is reinstalled,
    whether the body finished, raised, or timed out.
    """
    def _handler(signum, frame):
        raise TimeoutError(f"Timed out after {seconds}s")
    old = signal.signal(signal.SIGALRM, _handler)
    try:
        # Arm the timer inside the try so the previous handler is restored
        # even when arming itself fails (e.g. an invalid `seconds` value).
        signal.setitimer(signal.ITIMER_REAL, seconds)
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0.0)
        signal.signal(signal.SIGALRM, old)
class Benchmarker:
    """
    Times a callable over several trials, thrashing a large buffer before
    each trial and limiting every call with `time_limit`.
    """
    def __init__(self, trials = 3, loops = 1, timeout = 30, buffer_bytes = 2 * 1024 * 1024 * 1024):
        """
        trials: number of timed repetitions.
        loops: calls per trial; `benchmark` expects one argument tuple per loop.
        timeout: per-call limit in seconds; must be positive.
        buffer_bytes: size of the scratch buffer rewritten before each trial
            (defaults to 2 GiB, the size that was previously fixed).
        """
        self.buffer = np.zeros(buffer_bytes, dtype = np.uint8)
        self.trials = trials
        self.loops = loops
        assert timeout > 0 # Cannot be 0 since it won't work!
        self.timeout = timeout

    def thrash(self):
        """Flip the low bit of every buffer byte and return a strided checksum."""
        # Edit the buffer to wipe cache lines
        self.buffer ^= 1
        return int(self.buffer[::4096].sum())

    def benchmark(self, function, arguments):
        """
        Time `function` and return summary statistics.

        arguments: sequence of argument tuples, exactly one per loop.

        Returns a dict with "median_ns", "mean_ns" and "stdev_ns" (per-call
        wall time across trials), "exceptions" (messages of exceptions the
        function raised) and "timeouts" (number of calls that hit the limit).
        Failed and timed-out calls still contribute their elapsed time, so
        callers should inspect "exceptions" and "timeouts" before trusting
        the timings.
        """
        assert len(arguments) == self.loops
        samples = []
        exceptions = []
        timed_out = 0
        for _ in range(self.trials):
            gc.collect()
            gc.disable()
            try:
                self.thrash()
                t_start = time.perf_counter_ns()
                for i in range(self.loops):
                    try:
                        with time_limit(self.timeout):
                            function(*arguments[i])
                    except TimeoutError:
                        timed_out += 1
                    except (Exception, SystemExit) as e:
                        # SystemExit is included so a benchmarked function
                        # calling sys.exit() cannot end the whole run.
                        exceptions.append(str(e))
                t_end = time.perf_counter_ns()
            finally:
                # Always turn the collector back on, even if the trial is
                # interrupted (e.g. by KeyboardInterrupt).
                gc.enable()
            samples.append((t_end - t_start) // max(1, self.loops))
        return {
            "median_ns": int(statistics.median(samples)),
            "mean_ns": int(statistics.fmean(samples)),
            "stdev_ns": int(statistics.pstdev(samples) if len(samples) > 1 else 0),
            "exceptions" : exceptions,
            "timeouts" : timed_out,
        }
# For example, we take the matmul kernel from before and benchmark it with a 10 second timeout:
# In[ ]:
# Benchmark the locked-down sample kernel once on matrices whose dimensions
# are at most 256, with a 10 second limit per call.
A, A_list, B, B_list = generate_random_matrices(seed = 0, n = 256)
Benchmarker(trials = 1, timeout = 10).benchmark(output_function["matmul"], [(A_list, B_list)])
# # Data & RL task setup
#
# We now have to create a prompt to the model for which it will do some task. For our matrix multiply example, we use the below:
# In[ ]:
# Task prompt given to the model. The fenced example is also extracted and
# executed as a stand-in kernel later in this file, so the function body
# inside it has to be indented to be valid Python.
prompt = """
Create a new fast matrix multiplication function using only native Python code.
You are given a list of list of numbers.
Output your new function in backticks using the format below:
```python
def matmul(A, B):
    return ...
```
""".strip()
print(prompt)
# First, let's prompt Gemma 4 without RL and see how it goes:
# In[ ]:
# Render the prompt as a single user turn in the chat format, returned as
# text and ending with the generation prompt.
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt.strip()}],
    tokenize = False,
    add_generation_prompt = True,
)
from transformers import TextStreamer
print("=" * 50)
print("BASE MODEL OUTPUT (before RL training):")
print("=" * 50)
# Tokenize the rendered text and move the tensors to the GPU.
# add_special_tokens is off -- presumably because the chat template already
# inserted the special tokens; confirm against the tokenizer's behavior.
inputs = tokenizer(
    text = text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")
# Stream generated tokens to stdout as they are produced, omitting the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
result = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 512,
    use_cache = True, temperature = 1.0, top_p = 0.95, top_k = 64)
# # Reward functions
#
# We now design the `extract_function` function which simply extracts the function wrapped in 3 backticks.
#
# And 4 reward functions:
#
# 1. `function_works` which rewards the model if the generated code is a valid Python function.
# 2. `no_cheating` which checks if the function imported other modules, and if it did, we penalize it.
# 3. `correctness_check` which checks if the kernel was correct or wrong - it shouldn't generate gibberish!
# 4. `speed_check` checks the performance relative to Numpy matmul directly.
# In[ ]:
def extract_function(text):
    """
    Pull the `matmul` definition out of the first fenced block in `text`.

    The content between the first two triple-backtick fences is stripped, a
    leading "python" language tag is removed, and everything before the
    first "def" is dropped. The result is returned only if it begins with
    exactly "def matmul(A, B):"; otherwise (or when there are fewer than
    two fences) None is returned.
    """
    fence = "```"
    if text.count(fence) < 2:
        return None
    start = text.find(fence) + len(fence)
    stop = text.find(fence, start)
    body = text[start:stop].strip().removeprefix("python\n")
    body = body[body.find("def"):]
    return body if body.startswith("def matmul(A, B):") else None
# The prompt itself contains a fenced `matmul` template, so it can be used
# to try out the extractor.
print(extract_function(prompt))
# Below is our `function_works` reward function which uses Python's `exec` but guarded by not allowing leakage of local and global variables. We can also use `check_only_stdlib_imports` first to check if there are errors before even executing the function:
# In[ ]:
# "def a" is not valid Python, so the returned details carry an "error" entry.
ok, info = check_only_stdlib_imports("def a")
ok, info
# In[ ]:
def function_works(completions, **kwargs):
    """
    Reward: does the completion contain a `matmul` that can be defined?

    For each completion (a list whose first item is a message dict with a
    "content" key) the extracted function is printed and scored:
      -2.0  no function could be extracted, or it does not parse
      -0.5  building the locked-down function raised
       1.0  the function was built successfully
    Returns the list of scores in completion order.
    """
    rewards = []
    for completion in completions:
        candidate = extract_function(completion[0]["content"])
        print(candidate)
        if candidate is None:
            rewards.append(-2.0)
            continue
        _, details = check_only_stdlib_imports(candidate)
        if "error" in details:
            rewards.append(-2.0)
            continue
        try:
            create_locked_down_function(candidate)
        except BaseException:
            # Same reach as a bare except: anything raised while defining
            # the function counts as a failed build.
            rewards.append(-0.5)
        else:
            rewards.append(1.0)
    return rewards
# `no_cheating` checks if the function cheated since it might have imported Numpy or Torch optimized code.
# In[ ]:
def no_cheating(completions, **kwargs):
    """
    Reward: did the completion stick to standard-library imports?

    Each completion scores 1.0 when a function could be extracted and
    check_only_stdlib_imports accepts it, and -20.0 otherwise (no function,
    unparsable code, or a non-stdlib import).
    Returns the list of scores in completion order.
    """
    rewards = []
    for completion in completions:
        candidate = extract_function(completion[0]["content"])
        passed = candidate is not None and check_only_stdlib_imports(candidate)[0]
        rewards.append(1.0 if passed else -20.0) # Penalize heavily!
    return rewards
# Next `correctness_check` checks if the kernel was correct. We want to penalize if the absolute error is larger than 1, and if the mean squared error is somewhat bigger than machine epsilon.
#
# We have to execute the code now!
# In[ ]:
# Display float64 machine epsilon, which the correctness thresholds are scaled from.
np.finfo(np.float64).eps
# In[ ]:
def _error_score(error, machine_epsilon):
    """Map one error magnitude to its reward contribution (from -3.0 to 3.0)."""
    # NaN/inf would fail every `>=` test below and fall through to the best
    # score, so treat a non-finite error as the worst case.
    if not np.isfinite(error): return -3.0
    if error >= 3: return -3.0
    if error >= 2: return -2.5
    if error >= 1: return -2.0
    if error >= 0.5: return -1.0
    if error >= 100*machine_epsilon: return 0.0
    if error >= machine_epsilon: return 1.0
    return 3.0

def correctness_check(completions, **kwargs):
    """
    Reward: how close is the generated kernel's output to np.matmul?

    One random matrix pair (dimensions at most 128) is drawn per call and
    shared by all completions. Per completion:
       0    no function extracted, it does not parse, or it cannot be built
      -2.0  the function raised or ran for more than 10 seconds
      else  the sum of two graded scores, one for the maximum absolute
            error and one for the mean squared error (each -3.0 .. 3.0)
    Returns the list of scores in completion order.
    """
    scores = []
    # Generate some random matrices of size less than 128
    A, A_list, B, B_list = generate_random_matrices(seed = np.random.randint(10000), n = 128)
    # The reference product and the tolerance do not depend on the completion.
    true = np.matmul(A, B)
    machine_epsilon = 100*np.finfo(np.float64).eps
    for completion in completions:
        response = completion[0]["content"]
        function = extract_function(response)
        if function is not None:
            ok, info = check_only_stdlib_imports(function)
        if function is None or "error" in info:
            scores.append(0)
            continue
        try:
            new_matmul = create_locked_down_function(function)
        except (Exception, SystemExit):
            scores.append(0)
            continue
        try:
            # Copy every row so a kernel that mutates its inputs cannot
            # corrupt the matrices used for later completions, and bound the
            # run time so an endless loop cannot stall training.
            with time_limit(10):
                pred = new_matmul([row[:] for row in A_list], [row[:] for row in B_list])
        except (Exception, SystemExit):
            # Failed!
            scores.append(-2.0)
            continue
        amax_error, mse_error = calculate_difference(pred, true)
        # Check correctness and score!
        scores.append(_error_score(amax_error, machine_epsilon) + _error_score(mse_error, machine_epsilon))
    return scores
# Finally our benchmarking function for `speed_check`! We shall limit the timer to 10 seconds and do 3 trials.
# In[ ]:
# Shared benchmarker used by `speed_check`: 3 trials, 10 second limit per call.
A, A_list, B, B_list = generate_random_matrices(seed = 0, n = 256)
benchmarker = Benchmarker(trials = 3, timeout = 10)
# Baseline: time NumPy's matmul on the array form of the matrices.
numpy_results = benchmarker.benchmark(np.matmul, [(A, B)])
numpy_results
# In[ ]:
# Time the function template taken from the prompt on the list form, to
# have a second set of results to compare against the baseline.
new_matmul = create_locked_down_function(extract_function(prompt))
new_results = benchmarker.benchmark(new_matmul, [(A_list, B_list)])
new_results
# We take the ratio of the kernel's time to NumPy's time and give it a negative sign for slower kernels. If the ratio is less than 1 (i.e. the kernel is faster), we invert it so the reward is positive!
# In[ ]:
# Kernel time over NumPy time, divided by 100 and negated: used when the
# kernel is not faster than NumPy.
negative = -(new_results["median_ns"] / numpy_results["median_ns"]) / 100
# NumPy time over kernel time, divided by 100: used when the kernel is faster.
positive = +(numpy_results["median_ns"] / new_results["median_ns"]) / 100
reward = negative if new_results["median_ns"] >= numpy_results["median_ns"] else positive
reward
# In[ ]:
# Overwrite the measured medians with made-up values to show the
# "faster than NumPy" branch of the same formula.
new_results["median_ns"] = 3
numpy_results["median_ns"] = 1000
negative = -(new_results["median_ns"] / numpy_results["median_ns"]) / 100
positive = +(numpy_results["median_ns"] / new_results["median_ns"]) / 100
reward = negative if new_results["median_ns"] >= numpy_results["median_ns"] else positive
reward
# In[ ]:
import gc
def speed_check(completions, **kwargs):
    """
    Reward: how fast is the generated kernel relative to np.matmul?

    One random matrix pair (dimensions at most 256) is drawn per call and
    NumPy is benchmarked on it once. Per completion:
        0    no function extracted, it does not parse, or it cannot be built
      -10.0  the kernel raised or timed out during benchmarking
      else   -(kernel / numpy) / 100 when the kernel is not faster,
             +(numpy / kernel) / 100 when it is, clipped to [-10, 10]
    Returns the list of scores in completion order.

    NOTE(review): only speed is measured here; whether the kernel's output
    is right is left to `correctness_check`.
    """
    scores = []
    # Generate some random matrices of size less than 256
    A, A_list, B, B_list = generate_random_matrices(seed = np.random.randint(10000), n = 256)
    numpy_results = benchmarker.benchmark(np.matmul, [(A, B)])
    # Guard the ratios below against a zero median.
    numpy_ns = max(1, numpy_results["median_ns"])
    for completion in completions:
        response = completion[0]["content"]
        function = extract_function(response)
        if function is not None:
            ok, info = check_only_stdlib_imports(function)
        if function is None or "error" in info:
            scores.append(0)
            continue
        try:
            new_matmul = create_locked_down_function(function)
        except (Exception, SystemExit):
            scores.append(0)
            continue
        # Copy every row so a kernel that mutates its inputs cannot affect
        # the matrices used for later completions.
        fresh_arguments = ([row[:] for row in A_list], [row[:] for row in B_list])
        new_results = benchmarker.benchmark(new_matmul, [fresh_arguments])
        if new_results["exceptions"] or new_results["timeouts"]:
            # A kernel that crashes returns almost instantly; without this
            # check it would be rewarded as if it were extremely fast.
            scores.append(-10.0)
            continue
        new_ns = max(1, new_results["median_ns"])
        # Get score and clip to -10, 10
        negative = -(new_ns / numpy_ns) / 100
        positive = +(numpy_ns / new_ns) / 100
        score = negative if new_ns >= numpy_ns else positive
        score = max(-10, min(10, score))
        scores.append(score)
    # Free memory to counteract OOMs
    gc.collect()
    torch.cuda.empty_cache()
    return scores
# We create the dataset which includes a replica of our prompt.
# In[ ]:
from datasets import Dataset
# 1000 identical rows: the same single-turn prompt each time, with a
# placeholder "answer" of 0.
dataset = Dataset.from_list([{"prompt" : [{"role": "user", "content": prompt.strip()}], "answer" : 0}]*1000)
# Token length of the rendered prompt (including the generation prompt);
# used below to size the completion budget.
maximum_length = len(tokenizer.apply_chat_template([{"role":"user", "content":prompt.strip()}], add_generation_prompt = True, tokenize = True))
print(maximum_length)
dataset[0]
#
# ### Train the model
#
# Now set up GRPO Trainer and all configurations! We also support GSDP, GAPO, Dr GRPO and more! Go to our docs https://unsloth.ai/docs/ for more info!
# In[ ]:
# Leave room for the prompt (plus 1 token safety margin)
# Leave room for the prompt (plus 1 token safety margin)
max_completion_length = max_seq_length - (maximum_length + 1)
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    # Sampling settings; the same values are passed to model.generate
    # elsewhere in this file.
    temperature = 1.0,
    top_p = 0.95,
    top_k = 64,
    # Optimizer and learning-rate schedule.
    learning_rate = 5e-5,
    weight_decay = 0.001,
    warmup_ratio = 0.1,
    lr_scheduler_type = "linear",
    optim = "adamw_8bit",
    logging_steps = 1,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 2, # Increase to 4 for smoother training
    num_generations = 2, # Decrease if out of memory
    max_completion_length = max_completion_length,
    # num_train_epochs = 1, # Set to 1 for a full training run
    # Training stops after max_steps; save_steps is the same value.
    max_steps = 100,
    save_steps = 100,
    report_to = "none", # Can use Weights & Biases, TrackIO
    output_dir = "outputs",
    epsilon = 0.2,
    epsilon_high = 0.28, # one sided
    delta = 1.5, # two sided
    loss_type = 'bnpo',
    mask_truncated_completions = True
    # For optional training + evaluation
    # fp16_full_eval = True,
    # per_device_eval_batch_size = 4,
    # eval_accumulation_steps = 1,
    # eval_strategy = "steps",
    # eval_steps = 1,
)
# And let's run the trainer! If you scroll up, you'll see a table of rewards. The goal is to see the `reward` column increase!
#
# You might have to wait 150 to 200 steps for any action. You'll probably get 0 reward for the first 100 steps. Please be patient!
#
# | Step | Training Loss | reward | reward_std | completion_length | kl |
# |------|---------------|-----------|------------|-------------------|----------|
# | 1 | 0.000000 | 0.125000 | 0.000000 | 200.000000 | 0.000000 |
# | 2 | 0.000000 | 0.072375 | 0.248112 | 200.000000 | 0.000000 |
# | 3 | 0.000000 | -0.079000 | 0.163776 | 182.500000 | 0.000005 |
# In[ ]:
# For optional training + evaluation
# new_dataset = dataset.train_test_split(test_size = 0.01)
# Build the GRPO trainer from the LoRA-wrapped model, the four reward
# functions defined in this file, and the prompt dataset.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    # One reward function per aspect: the function can be built, it does not
    # cheat with imports, its output is correct, and how fast it runs.
    reward_funcs = [
        function_works,
        no_cheating,
        correctness_check,
        speed_check,
    ],
    args = training_args,
    train_dataset = dataset,
    # For optional training + evaluation
    # train_dataset = new_dataset["train"],
    # eval_dataset = new_dataset["test"],
)
# And let's train the model!
#
# **NOTE** A T4 free GPU might take 5 minutes for one generation sadly since it's an old GPU - A100 or H100 will be much faster!
# In[ ]:
# Run GRPO training with the configuration set up earlier.
trainer.train()
# Now that the LoRA has been trained with GRPO, we first save it to disk.
# In[ ]:
model.save_pretrained("gemma_4_lora") # Local saving
tokenizer.save_pretrained("gemma_4_lora")
# Verify LoRA is actually trained!
# In[ ]:
from safetensors import safe_open
tensors = {}
# Open the adapter file from the directory the LoRA was saved to
# ("gemma_4_lora") and check that training actually changed the weights.
with safe_open("gemma_4_lora/adapter_model.safetensors", framework = "pt") as f:
    # Verify both A and B are non zero
    for key in f.keys():
        tensor = f.get_tensor(key)
        # Fraction of entries equal to zero, between 0.0 and 1.0.
        zero_fraction = (tensor == 0).sum() / tensor.numel()
        # 1.0 means the whole tensor is zero. Compare against the fraction
        # rather than the element count, which it would practically never equal.
        assert zero_fraction.item() != 1.0, f"LoRA tensor {key} is entirely zero"
#
# # Inference
# Now let's try the model we just trained!
# In[ ]:
# Render the same prompt as a single user turn, ending with the generation prompt.
text = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt.strip()}],
    tokenize = False,
    add_generation_prompt = True,
)
from transformers import TextStreamer
# Generate with the trained model and stream the output to stdout; the
# streamer echoes the prompt as well (skip_prompt = False).
# NOTE(review): unlike the base-model generation earlier in this file, this
# tokenizer call does not pass add_special_tokens = False -- confirm the
# rendered chat text does not end up with duplicated special tokens.
_ = model.generate(
    **tokenizer(images = None, text = text, return_tensors = "pt").to("cuda"),
    temperature = 1.0, top_p = 0.95, top_k = 64,
    max_new_tokens = 1024,
    streamer = TextStreamer(tokenizer, skip_prompt = False),
)
#
# ### Saving to float16 for VLLM
#
# We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens. See [our docs](https://unsloth.ai/docs/basics/inference-and-deployment) for more deployment options.
# In[ ]:
# Merge to 16bit
# Every option below is disabled; change `if False` to `if True` on the ones
# you want. Replace HF_USERNAME and YOUR_HF_TOKEN before pushing to the Hub.
# Merge to 16bit
if False: model.save_pretrained_merged("gemma_4_finetune_16bit", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("HF_USERNAME/gemma_4_finetune_16bit", tokenizer, save_method = "merged_16bit", token = "YOUR_HF_TOKEN")
# Merge to 4bit
if False: model.save_pretrained_merged("gemma_4_finetune_4bit", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("HF_USERNAME/gemma_4_finetune_4bit", tokenizer, save_method = "merged_4bit", token = "YOUR_HF_TOKEN")
# Just LoRA adapters
if False:
    model.save_pretrained("gemma_4_lora")
    tokenizer.save_pretrained("gemma_4_lora")
if False:
    model.push_to_hub("HF_USERNAME/gemma_4_lora", token = "YOUR_HF_TOKEN")
    tokenizer.push_to_hub("HF_USERNAME/gemma_4_lora", token = "YOUR_HF_TOKEN")
# ### GGUF / llama.cpp Conversion
# To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.
#
# Some supported quant methods (full list on our [docs page](https://unsloth.ai/docs/basics/inference-and-deployment/saving-to-gguf)):
# * `q8_0` - Fast conversion. High resource use, but generally acceptable.
# * `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
# * `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.
#
# [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
# In[ ]:
# Save to 8bit Q8_0
# Every option below is disabled; change `if False` to `if True` on the ones
# you want. With no quantization_method given, the first pair uses the
# default (q8_0, per the "Save to 8bit Q8_0" heading).
if False: model.save_pretrained_gguf("gemma_4_finetune", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change HF_USERNAME to your username!
if False: model.push_to_hub_gguf("HF_USERNAME/gemma_4_finetune", tokenizer, token = "YOUR_HF_TOKEN")
# Save to 16bit GGUF
if False: model.save_pretrained_gguf("gemma_4_finetune", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("HF_USERNAME/gemma_4_finetune", tokenizer, quantization_method = "f16", token = "YOUR_HF_TOKEN")
# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("gemma_4_finetune", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("HF_USERNAME/gemma_4_finetune", tokenizer, quantization_method = "q4_k_m", token = "YOUR_HF_TOKEN")
# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "HF_USERNAME/gemma_4_finetune", # Change HF_USERNAME to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "YOUR_HF_TOKEN",
    )
# Now, use the `gemma_4_finetune.Q8_0.gguf` file or `gemma_4_finetune.Q4_K_M.gguf` file in llama.cpp.
#
# And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!
#
# Some other resources:
# 1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
# 2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
# 3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
# 4. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://unsloth.ai/docs/get-started/unsloth-notebooks)!
#
#
#

#

#

#
# Join Discord if you need help + ⭐️
# Star us on Github ⭐️
#
#
# This notebook and all Unsloth notebooks are licensed [LGPL-3.0](https://github.com/unslothai/notebooks?tab=LGPL-3.0-1-ov-file#readme).