INT8 W8A8-quantized version of Qwen/Qwen3-8B, produced with SmoothQuant and GPTQ. This is a preliminary version and is subject to change.

Creation script

# Requirements: pip install llmcompressor transformers datasets

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.modifiers.quantization import GPTQModifier

# Source checkpoint to quantize.
MODEL_ID = "Qwen/Qwen3-8B"
# Output directory for the quantized model and its tokenizer.
SAVE_PATH = "./Qwen3-8B-W8A8-INT8"
# Number of calibration examples drawn from the calibration dataset.
NUM_SAMPLES = 1024
# Maximum token length of a calibration sequence; longer ones are truncated.
MAX_SEQ_LEN = 8192

Load model and tokenizer

# Load the unquantized checkpoint; torch_dtype="auto" defers the dtype choice
# to the library instead of forcing one here.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
# The tokenizer supplies the chat template and tokenization used for calibration.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Calibration data

# Load the calibration set and take a reproducible random subset of
# NUM_SAMPLES examples (fixed shuffle seed).
ds = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
ds = ds.shuffle(seed=42).select(range(NUM_SAMPLES))

# Render each example's "messages" into a single string with the tokenizer's
# chat template; no generation prompt is appended.
ds = ds.map(
    lambda x: {
        "text": tokenizer.apply_chat_template(
            x["messages"],
            tokenize=False,
            add_generation_prompt=False,
        )
    }
)

# Tokenize the rendered text without padding, truncating to MAX_SEQ_LEN.
# add_special_tokens=False: presumably because the chat template already
# inserted the special tokens -- TODO confirm.
# remove_columns drops every pre-existing column so only the tokenizer
# output is kept.
ds = ds.map(
    lambda x: tokenizer(
        x["text"],
        padding=False,
        max_length=MAX_SEQ_LEN,
        truncation=True,
        add_special_tokens=False,
    ),
    remove_columns=ds.column_names,
)

Recipe

# Quantization recipe: a SmoothQuant modifier followed by a GPTQ modifier.
recipe = [
    SmoothQuantModifier(smoothing_strength=0.5),
    GPTQModifier(
        targets="Linear",
        scheme="W8A8",
        ignore=["lm_head"],  # lm_head is excluded from quantization
        dampening_frac=0.05,
        actorder="weight",
        block_size=64,
    ),
]

Quantize

# Run one-shot quantization of `model` with the recipe, using the tokenized
# calibration dataset and the same length/sample limits it was built with.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQ_LEN,
    num_calibration_samples=NUM_SAMPLES,
)

Save

# Write the quantized model (save_compressed=True) and its tokenizer to the
# same directory.
model.save_pretrained(SAVE_PATH, save_compressed=True)
tokenizer.save_pretrained(SAVE_PATH)

Wikitext Evals

lm_eval --model=vllm --model_args pretrained=Qwen/Qwen3-8B,add_bos_token=true,dtype=auto,max_model_len=8192,gpu_memory_utilization=0.8 --tasks wikitext --batch_size=64

Tasks	Version	Filter	Metric		Value		Stderr
wikitext	2	none	bits_per_byte	↓	0.6765	±	N/A
		none	byte_perplexity	↓	1.5983	±	N/A
		none	word_perplexity	↓	12.2761	±	N/A

lm_eval --model=vllm --model_args pretrained=nm-testing/Qwen3-8B-W8A8-INT8,add_bos_token=true,dtype=auto,max_model_len=8192,gpu_memory_utilization=0.8 --tasks wikitext --batch_size=64