eecebe7ef5
Five-lane parallel research pass. Each subdir under tooling/ has its own README indexing downloaded files with verified upstream sources.

- google-official/: deepmind-gemma JAX examples, gemma_pytorch scripts, gemma.cpp API server docs, google-gemma/cookbook notebooks, ai.google.dev HTML snapshots, Gemma 3 tech report
- huggingface/: 8 gemma-4-* model cards, chat-template .jinja files, tokenizer_config.json, transformers gemma4/ source, launch blog posts, official HF Spaces app.py
- inference-frameworks/: vLLM/llama.cpp/MLX/Keras-hub/TGI/Gemini API/Vertex AI comparison, run_commands.sh with 8 working launches, 9 code snippets
- gemma-family/: 12 per-variant briefs (ShieldGemma 2, CodeGemma, PaliGemma 2, Recurrent/Data/Med/TxGemma, Embedding/Translate/Function/Dolphin/SignGemma)
- fine-tuning/: Unsloth Gemma 4 notebooks, Axolotl YAMLs (incl. 26B-A4B MoE), TRL scripts, Google cookbook fine-tune notebooks, recipe-recommendation.md

Findings that update earlier CORPUS_* docs are flagged in tooling/README.md (not applied) — notably the new <|turn>/<turn|> prompt format, gemma_pytorch abandonment, gemma.cpp Gemini-API server, transformers AutoModelForMultimodalLM, FA2 head_dim=512 break, 26B-A4B MoE quantization rules, no Gemma 4 tech report PDF yet, no Gemma-4-generation specialized siblings yet.

Pre-commit secrets hook bypassed per user authorization — flagged "secrets" are base64 notebook cell outputs and example Ed25519 keys in the HDP agentic-security demo, not real credentials.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
176 lines
8.1 KiB
Python
176 lines
8.1 KiB
Python
@ModelBase.register("Gemma4ForConditionalGeneration")
class Gemma4Model(Gemma3Model):
    """Converter for the language-model part of a Gemma4ForConditionalGeneration checkpoint.

    Writes the "gemma4" tokenizer, hyper-parameters that differ between
    full-attention and sliding-window layers, a ROPE_FREQS tensor for the
    full-attention layers, and remaps a few tensor names before delegating to
    the parent class.
    """

    model_arch = gguf.MODEL_ARCH.GEMMA4

    def norm_shift(self, name: str) -> float:
        """Return the norm shift for tensor ``name``; always 0.0 here."""
        del name  # unused
        return 0.0

    def set_vocab(self):
        """Write the token list, scores, types and special-token metadata.

        Tokens in ``always_rendered`` are re-typed as USER_DEFINED; every
        other token keeps the type reported by the HF vocab.
        """
        vocab = gguf.LlamaHfVocab(self.dir_model)
        writer = self.gguf_writer

        # always render these tokens, so that the chat parser can read them
        always_rendered = {
            "<|channel>",
            "<channel|>",
            "<|tool_call>",
            "<tool_call|>",
            "<|tool_response>",
            "<tool_response|>",
            "<|\"|>",
        }

        token_list = []
        score_list = []
        type_list = []
        for text, score, toktype in vocab.all_tokens():
            token_list.append(text)
            score_list.append(score)
            text_str = text.decode()
            if text_str not in always_rendered:
                type_list.append(toktype)
                continue
            type_list.append(gguf.TokenType.USER_DEFINED)
            logger.info(f"Token '{text_str}' is set to USER_DEFINED")

        assert len(token_list) == vocab.vocab_size

        writer.add_tokenizer_model("gemma4")
        writer.add_token_list(token_list)
        writer.add_token_scores(score_list)
        writer.add_token_types(type_list)

        gguf.SpecialVocab(self.dir_model, load_merges=True).add_to_gguf(writer)
        writer.add_add_space_prefix(False)
        writer.add_add_bos_token(True)

    def set_gguf_parameters(self):
        """Write Gemma4-specific hyper-parameters after the parent-class ones.

        Raises:
            KeyError: if a required hparam ("num_kv_shared_layers",
                "layer_types", "global_head_dim", "head_dim", or
                "intermediate_size" when use_double_wide_mlp is set) is missing.
        """
        super().set_gguf_parameters()

        hp = self.hparams
        writer = self.gguf_writer

        shared_kv_count = hp["num_kv_shared_layers"]
        writer.add_shared_kv_layers(shared_kv_count)

        # per-layer embedding is optional
        per_layer_embd = hp.get("hidden_size_per_layer_input") or 0
        writer.add_embedding_length_per_layer_input(per_layer_embd)

        # one boolean per layer: True for sliding-window layers
        is_swa_layer = [kind == "sliding_attention" for kind in hp["layer_types"]]
        writer.add_sliding_window_pattern(is_swa_layer)

        # correct the head dim for global/swa layers
        full_head_dim = hp["global_head_dim"]
        swa_head_dim = hp["head_dim"]
        writer.add_key_length(full_head_dim)
        writer.add_value_length(full_head_dim)
        writer.add_key_length_swa(swa_head_dim)
        writer.add_value_length_swa(swa_head_dim)

        expert_ff = self.find_hparam(["expert_intermediate_size", "moe_intermediate_size"])
        if expert_ff is not None:
            writer.add_expert_feed_forward_length(expert_ff)

        # if use_double_wide_mlp is set, we need to adjust the value for kv shared layers
        double_wide = hp.get("use_double_wide_mlp", False)
        first_shared_layer = self.block_count - shared_kv_count
        if double_wide:
            base_ff = hp["intermediate_size"]
            # layers from first_shared_layer onwards get twice the feed-forward width
            per_layer_ff = [
                base_ff * 2 if il >= first_shared_layer else base_ff
                for il in range(self.block_count)
            ]
            writer.add_feed_forward_length(per_layer_ff)

        # handle num_global_key_value_heads
        kv_heads_full = hp.get("num_global_key_value_heads")
        kv_heads_swa = hp.get("num_key_value_heads")
        if kv_heads_full is not None and kv_heads_swa is not None:
            per_layer_kv_heads = [kv_heads_swa if swa else kv_heads_full for swa in is_swa_layer]
            writer.add_head_count_kv(per_layer_kv_heads)

        # handle n_rot differently for global vs swa layers
        swa_rotary_factor = hp.get("partial_rotary_factor", 1.0)
        rot_dims_full = int(full_head_dim)  # "proportional" is used, see generate_extra_tensors
        rot_dims_swa = int(swa_head_dim * swa_rotary_factor)
        writer.add_rope_dimension_count(rot_dims_full)
        writer.add_rope_dimension_count_swa(rot_dims_swa)

    def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
        """Yield the ROPE_FREQS tensor for the full_attention layers.

        The tensor holds 1.0 for each rotated frequency and 1e30 for each
        unrotated one, as float32.

        Raises:
            AssertionError: if the full-attention rope_type is not "proportional".
        """
        # full layer uses "proportional" rope with partial_rotary_factor=0.25
        # the expected ordering is cc000000ss000000 (c = cos, s = sin, 0 = unrotated),
        # but ggml neox only supports ccss000000000000, and we cannot rearrange the head because that will break use_alternative_attention
        # solution is to set specific freq_factors for the unrotated dims

        # IMPORTANT: this ROPE_FREQS tensor is ONLY used by the full_attention layers
        full_rope = self.hparams["rope_parameters"]["full_attention"]
        assert full_rope["rope_type"] == "proportional"
        full_head_dim = self.hparams["global_head_dim"]
        rotary_fraction = full_rope["partial_rotary_factor"]

        # counts are in units of frequency pairs, hence the division by 2
        rotated = int(full_head_dim * rotary_fraction / 2)
        unrotated = int(full_head_dim / 2) - rotated

        freq_factors = torch.tensor([1.0] * rotated + [1e30] * unrotated, dtype=torch.float32)
        yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), freq_factors)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Rename or drop a checkpoint tensor and yield the (name, tensor) pairs to write.

        Tensors whose name contains neither "language_model." nor "rope_freqs"
        are skipped. Router and per-expert scales are mapped directly; all
        other tensors go through the parent implementation.
        """
        if name.endswith(("per_dim_scale", "layer_scalar")):
            name += ".weight"

        if not ("language_model." in name or "rope_freqs" in name):
            return  # skip non-language model tensors

        name = name.replace("language_model.", "")

        if name.endswith("router.scale"):
            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, ".scale"), data_torch)
            return

        if ".per_expert_scale" in name:
            # convert per-expert scale to FFN down scale
            yield (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN_EXP, bid, ".scale"), data_torch)
            return

        if ".experts." in name and not name.endswith(".weight"):
            name = name + ".weight"

        yield from super().modify_tensors(data_torch, name, bid)
|
|
|
|
|
|
@ModelBase.register("Gemma4ForConditionalGeneration")
|
|
class Gemma4VisionAudioModel(MmprojModel):
|
|
has_audio_encoder = True
|
|
has_vision_encoder = True
|
|
|
|
def __init__(self, *args, **kwargs):
    """Initialise the parent converter, then patch the vision and audio hparams.

    Raises:
        AssertionError: if no vision hparams are available.
    """
    super().__init__(*args, **kwargs)
    assert self.hparams_vision is not None
    self.hparams_vision["image_size"] = 224  # unused, but set to avoid error

    if not self.hparams_audio:
        # no audio hparams available: mark the audio encoder as absent
        self.has_audio_encoder = False
        return

    # remap audio hparams
    mel_bins = self.hparams_audio.get("input_feat_size", 128)
    self.hparams_audio["feat_in"] = mel_bins
    ffn_width = self.hparams_audio["hidden_size"] * 4
    self.hparams_audio["intermediate_size"] = ffn_width
|
|
|
|
def set_gguf_parameters(self):
    """Write the vision projector metadata and, when audio hparams exist, the audio metadata."""
    super().set_gguf_parameters()
    writer = self.gguf_writer

    # vision params
    writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA4V)
    writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))

    if not self.hparams_audio:
        return

    # audio params
    writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA4A)
    writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
    writer.add_audio_attention_layernorm_eps(1e-5)
|
|
|
|
def is_audio_tensor(self, name: str) -> bool:
    """Return True when ``name`` contains "audio_tower" or "embed_audio"."""
    return any(marker in name for marker in ("audio_tower", "embed_audio"))
|
|
|
|
# Choose a forced storage type for selected tensors; any tensor not matched
# below is delegated to the parent implementation with the same arguments.
# Returns gguf.GGMLQuantizationType.F32 for the matched names.
# NOTE(review): indentation was lost in this copy of the file, so it is not
# possible to tell here whether the "position_embedding_table" check is nested
# inside the is_audio_tensor branch or applies to every tensor - confirm
# against the original file before restructuring.
def tensor_force_quant(self, name, new_name, bid, n_dims):
|
|
if self.is_audio_tensor(name):
|
|
# `and` binds tighter than `or`, so this reads as:
#   ".conv" in name or ("_conv" in name and ".weight" in name)
# NOTE(review): confirm this grouping is intended; add explicit parentheses either way.
if ".conv" in name or "_conv" in name and ".weight" in name:
|
|
return gguf.GGMLQuantizationType.F32
|
|
if "position_embedding_table" in name:
|
|
return gguf.GGMLQuantizationType.F32
|
|
# no special case matched: defer to the parent class
return super().tensor_force_quant(name, new_name, bid, n_dims)
|
|
|
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
|
del bid # unused
|
|
|
|
if name.startswith("model.language_model."):
|
|
return # skip
|
|
|
|
if len(data_torch.shape) == 0:
|
|
# convert scalar tensors (input/output_mix/max) to 1D tensors
|
|
data_torch = data_torch.unsqueeze(0)
|