Text Generation
Transformers
mistral3
image-text-to-text
neuralmagic
redhat
llmcompressor
quantized
FP4
conversational
compressed-tensors
Instructions to use RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4 with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoProcessor, AutoModelForImageTextToText

processor = AutoProcessor.from_pretrained("RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4")
model = AutoModelForImageTextToText.from_pretrained("RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4 with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker
docker model run hf.co/RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4
- SGLang
How to use RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4 with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
- Docker Model Runner
How to use RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4 with Docker Model Runner:
docker model run hf.co/RedHatAI/Ministral-3-14B-Instruct-2512-NVFP4
| # coding=utf-8 | |
| # Copyright 2025 HuggingFace Inc. team. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import argparse | |
| import gc | |
| import json | |
| import os | |
| import re | |
| import torch | |
| from safetensors.torch import save_file | |
| from safetensors.torch import safe_open | |
| from huggingface_hub import snapshot_download | |
| from transformers import Mistral3Config, Mistral3ForConditionalGeneration | |
# fmt: off
# Regex pattern -> replacement template used to rename HF checkpoint tensor
# keys to their Mistral-format names. `map_hf_key_to_mistral` walks this table
# in insertion order and applies the first pattern that matches.
# Every literal dot is escaped so that `.` only ever matches a real dot.
STATE_DICT_MAPPING = {
    # Text decoder
    r"^language_model\.lm_head":                                           r"output",
    r"^language_model\.model\.norm":                                       r"norm",
    r"^language_model\.model\.embed_tokens":                               r"tok_embeddings",
    r"^language_model\.model\.layers\.(\d+)\.input_layernorm":             r"layers.\1.attention_norm",
    r"^language_model\.model\.layers\.(\d+)\.post_attention_layernorm":    r"layers.\1.ffn_norm",
    r"^language_model\.model\.layers\.(\d+)\.self_attn\.(q|k|v|o)_proj":   r"layers.\1.attention.w\2",
    r"^language_model\.model\.layers\.(\d+)\.mlp\.gate_proj":              r"layers.\1.feed_forward.w1",
    r"^language_model\.model\.layers\.(\d+)\.mlp\.down_proj":              r"layers.\1.feed_forward.w2",
    r"^language_model\.model\.layers\.(\d+)\.mlp\.up_proj":                r"layers.\1.feed_forward.w3",
    # Multimodal projector (left unanchored, as in the original table)
    r"multi_modal_projector\.patch_merger\.merging_layer\.weight":         r"patch_merger.merging_layer.weight",
    r"multi_modal_projector\.norm\.weight":                                r"pre_mm_projector_norm.weight",
    r"multi_modal_projector\.linear_1\.weight":                            r"vision_language_adapter.w_in.weight",
    r"multi_modal_projector\.linear_2\.weight":                            r"vision_language_adapter.w_out.weight",
    # Vision encoder
    r"vision_tower\.ln_pre\.weight":                                       r"vision_encoder.ln_pre.weight",
    r"vision_tower\.patch_conv\.weight":                                   r"vision_encoder.patch_conv.weight",
    r"^vision_tower\.transformer\.layers\.(\d+)\.attention_norm":          r"vision_encoder.transformer.layers.\1.attention_norm",
    r"^vision_tower\.transformer\.layers\.(\d+)\.ffn_norm":                r"vision_encoder.transformer.layers.\1.ffn_norm",
    r"^vision_tower\.transformer\.layers\.(\d+)\.attention\.(q|k|v|o)_proj": r"vision_encoder.transformer.layers.\1.attention.w\2",
    r"^vision_tower\.transformer\.layers\.(\d+)\.feed_forward\.gate_proj": r"vision_encoder.transformer.layers.\1.feed_forward.w1",
    r"^vision_tower\.transformer\.layers\.(\d+)\.feed_forward\.down_proj": r"vision_encoder.transformer.layers.\1.feed_forward.w2",
    r"^vision_tower\.transformer\.layers\.(\d+)\.feed_forward\.up_proj":   r"vision_encoder.transformer.layers.\1.feed_forward.w3",
}
# fmt: on

# Rename table applied to the entries of the quantization config's `ignore`
# list (see `add_quantization_config`).
IGNORE_STATE_DICT_MAPPING = {
    r"^model\.language_model": r"language_model.model",
}

# HF tensor keys that `convert_state_dict` drops instead of converting.
SKIP_KEYS = []
def add_quantization_config(config, hf_config: "Mistral3ForConditionalGeneration"):
    """Attach the HF model's quantization settings to a Mistral params dict.

    Args:
        config: Mistral-format parameters (the parsed ``params.json``). It is
            updated in place and also returned.
        hf_config: Loaded HF model. Despite the parameter name this is the
            model object, not a config; its ``hf_quantizer.quantization_config``
            is read.

    Returns:
        ``config`` with a ``"quantization_config"`` entry added.

    Side effects:
        The ``ignore`` list on the model's quantization config is replaced by
        its renamed version.
    """
    quantization_config = hf_config.hf_quantizer.quantization_config
    inner_config = quantization_config.quantization_config

    # Rename the ignored module names with the ignore-specific mapping table.
    inner_config.ignore = [
        map_hf_key_to_mistral(hf_key, state_dict_mapping=IGNORE_STATE_DICT_MAPPING)
        for hf_key in (inner_config.ignore or [])
    ]

    quant_config_dict = quantization_config.to_dict()

    # Drop the scale/zero-point dtype fields from every config group. The
    # fields are removed only when present, and a section may be absent or
    # None (e.g. weight-only schemes have no input_activations), so neither
    # case raises.
    for group in quant_config_dict["config_groups"].values():
        for section in ("input_activations", "weights"):
            section_args = group.get(section)
            if not isinstance(section_args, dict):
                continue
            section_args.pop("scale_dtype", None)
            section_args.pop("zp_dtype", None)

    config["quantization_config"] = quant_config_dict
    return config
def map_hf_key_to_mistral(hf_key, state_dict_mapping = STATE_DICT_MAPPING):
    """Translate one HF-format key into its Mistral-format name.

    The patterns in ``state_dict_mapping`` are tried in iteration order and
    only the first one that matches is applied. A key that matches nothing is
    kept as is. In both cases every ``weight_scale`` substring is then
    rewritten to ``qscale_weight``.
    """
    translated = hf_key
    for hf_pattern, mistral_template in state_dict_mapping.items():
        candidate, hit_count = re.subn(hf_pattern, mistral_template, hf_key)
        if hit_count:
            translated = candidate
            break
    return translated.replace("weight_scale", "qscale_weight")
def permute_for_mistral_rope(tensor, n_heads, dim1, dim2):
    """Reverse the ROPE permutation to get back to Mistral format.

    The ``dim1`` rows are treated as ``n_heads`` groups. Within each group the
    rows are viewed as two halves, and the halves are interleaved (row 0 of
    the first half, row 0 of the second half, row 1 of the first half, ...).

    Args:
        tensor: Tensor holding ``dim1 * dim2`` elements.
        n_heads: Number of head groups along the first dimension.
        dim1: Size of the first dimension of the result.
        dim2: Size of the second dimension of the result.

    Returns:
        A ``(dim1, dim2)`` tensor with the rows reordered.
    """
    rows_per_half = dim1 // n_heads // 2
    return (
        tensor.view(n_heads, 2, rows_per_half, dim2)
        .transpose(1, 2)
        .reshape(dim1, dim2)
    )
def _permute_attention_projection(hf_key, tensor, n_heads, n_kv_heads, query_dim, kv_dim, hidden_size):
    """Apply the RoPE row permutation to a q/k projection tensor, if ``hf_key`` names one."""
    if hf_key.endswith("q_proj.weight"):
        return permute_for_mistral_rope(tensor, n_heads, query_dim, hidden_size)
    # NOTE(review): scales are only permuted when their first dimension equals
    # the head count - confirm this matches the intended scale layout.
    if hf_key.endswith("q_proj.weight_scale") and tensor.size(0) == n_heads:
        return permute_for_mistral_rope(tensor, n_heads, query_dim, 1)
    if hf_key.endswith("k_proj.weight"):
        return permute_for_mistral_rope(tensor, n_kv_heads, kv_dim, hidden_size)
    if hf_key.endswith("k_proj.weight_scale") and tensor.size(0) == n_kv_heads:
        return permute_for_mistral_rope(tensor, n_kv_heads, kv_dim, 1)
    return tensor


def convert_state_dict(hf_state_dict, config):
    """Convert HF Ministral state dict to Mistral format.

    Args:
        hf_state_dict: Mapping of HF tensor names to tensors.
        config: Dict with ``"text_config"`` and ``"vision_config"`` sub-dicts
            providing ``num_attention_heads``, ``hidden_size`` and ``head_dim``
            (plus ``num_key_value_heads`` for the text side).

    Returns:
        A new dict keyed by Mistral-format names. Keys listed in ``SKIP_KEYS``
        are omitted; q/k projection tensors are row-permuted.
    """
    text_cfg = config["text_config"]
    vision_cfg = config["vision_config"]

    text_heads = text_cfg["num_attention_heads"]
    text_hidden = text_cfg["hidden_size"]
    text_head_dim = text_cfg["head_dim"]
    text_kv_heads = text_cfg["num_key_value_heads"]

    vision_heads = vision_cfg["num_attention_heads"]
    vision_hidden = vision_cfg["hidden_size"]
    vision_head_dim = vision_cfg["head_dim"]
    # The vision side uses as many key/value heads as attention heads.
    vision_kv_heads = vision_heads

    # (substring identifying the sub-model, positional arguments for the permute helper)
    permutation_plans = (
        (
            "language_model",
            (text_heads, text_kv_heads, text_head_dim * text_heads, text_head_dim * text_kv_heads, text_hidden),
        ),
        (
            "vision_tower",
            (vision_heads, vision_kv_heads, vision_head_dim * vision_heads, vision_head_dim * vision_kv_heads, vision_hidden),
        ),
    )

    converted = {}
    for hf_key, tensor in hf_state_dict.items():
        if hf_key in SKIP_KEYS:
            continue

        mistral_key = map_hf_key_to_mistral(hf_key)
        for marker, dims in permutation_plans:
            if marker in hf_key:
                tensor = _permute_attention_projection(hf_key, tensor, *dims)
        converted[mistral_key] = tensor

    return converted
def write_model(
    input_path_or_repo,
    output_dir,
    unquantized_model_path=None,
):
    """Convert a quantized HF Ministral checkpoint into Mistral format.

    Writes two files into ``output_dir``:

    * ``params.json`` - the unquantized model's ``params.json`` with the HF
      model's quantization config added.
    * ``consolidated.safetensors`` - every tensor found in the checkpoint's
      ``*.safetensors`` files, renamed (and q/k projections permuted).

    Args:
        input_path_or_repo: Local directory or Hub repo id of the HF model.
        output_dir: Directory to write into; created if missing.
        unquantized_model_path: Local directory or Hub repo id that contains
            the Mistral-format ``params.json``. Required.

    Raises:
        ValueError: If ``unquantized_model_path`` is None.
    """
    # Validate before the expensive model load so a bad call fails immediately.
    if unquantized_model_path is None:
        raise ValueError(f"Unquantized model config not found for {unquantized_model_path}")

    print("Converting HF Ministral model to Mistral format.")
    os.makedirs(output_dir, exist_ok=True)

    # Load the HF Ministral model
    print(f"Loading HF Ministral model from {input_path_or_repo}...")
    hf_model = Mistral3ForConditionalGeneration.from_pretrained(input_path_or_repo)

    if os.path.exists(input_path_or_repo):
        local_path = input_path_or_repo
    else:
        local_path = snapshot_download(input_path_or_repo)

    # Convert config
    if not os.path.exists(unquantized_model_path):
        # Only params.json is read from this repo, so skip downloading weights.
        unquantized_model_path = snapshot_download(
            unquantized_model_path, allow_patterns=["params.json"]
        )
    with open(os.path.join(unquantized_model_path, "params.json"), "r", encoding="utf-8") as f:
        config = json.load(f)
    config = add_quantization_config(config, hf_model)
    with open(os.path.join(output_dir, "params.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    # Use the loaded checkpoint's own dimensions for the q/k permutation; a
    # default-constructed config only matches models with default sizes.
    model_config = hf_model.config.to_dict()

    # The model object is no longer needed; release it before the tensors are
    # loaded a second time from disk.
    del hf_model
    gc.collect()

    # Convert state dict
    print("Converting state dict...")
    # Prefer the GPU when there is one, but do not require it.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tensor_files = sorted(name for name in os.listdir(local_path) if name.endswith(".safetensors"))
    hf_state_dict = {}
    for file_name in tensor_files:
        file_path = os.path.join(local_path, file_name)
        with safe_open(file_path, framework="pt", device=device) as f:
            for key in f.keys():
                hf_state_dict[key] = f.get_tensor(key)

    mistral_state_dict = convert_state_dict(hf_state_dict, model_config)

    # save the state dict
    save_file(mistral_state_dict, os.path.join(output_dir, "consolidated.safetensors"))

    del hf_state_dict, mistral_state_dict
    gc.collect()
    print("Model converted successfully.")
def write_tokenizer(input_path_or_repo: str, output_dir: str):
    """Load the tokenizer of the Ministral model and save it to ``output_dir``.

    Args:
        input_path_or_repo: Path or repo passed to
            ``MistralCommonBackend.from_pretrained``.
        output_dir: Destination passed to ``save_pretrained``.
    """
    # Imported at call time rather than at module import.
    from transformers import MistralCommonBackend

    print("Extracting tokenizer...")
    MistralCommonBackend.from_pretrained(input_path_or_repo).save_pretrained(output_dir)
    print("Tokenizer saved successfully.")
def main():
    """Parse the command line, convert the model, then (optionally) the tokenizer."""
    cli = argparse.ArgumentParser(description="Convert HF Ministral weights to Mistral format")
    cli.add_argument(
        "--input_path_or_repo",
        type=str,
        default="Ministral-3-14B-Instruct-2512-QUANTIZED",
        help="Path or repo containing HF Ministral model",
    )
    cli.add_argument(
        "--output_dir",
        type=str,
        default="Ministral-3-14B-Instruct-2512-QUANTIZED-CONVERTED",
        help="Location to write Mistral model and tokenizer",
    )
    cli.add_argument("--skip_tokenizer", action="store_true", help="Skip tokenizer conversion")
    cli.add_argument(
        "--unquantized_model_path",
        type=str,
        default="mistralai/Ministral-3-14B-Instruct-2512-BF16",
        help="Path to the unquantized model",
    )
    options = cli.parse_args()

    source, destination = options.input_path_or_repo, options.output_dir

    write_model(source, destination, unquantized_model_path=options.unquantized_model_path)

    # The tokenizer is written unless the user opted out.
    if options.skip_tokenizer:
        return
    write_tokenizer(source, destination)


# Run the conversion only when executed as a script.
if __name__ == "__main__":
    main()