Qwen3-0.6B quantized generation on Emily
# Notebook setup: install the inference stack and route Nx through Emily.
Mix.install(
  [
    # Emily provides the Nx backend, the defn compiler and the quantization layers used below.
    {:emily, "~> 0.4"},
    # Model, tokenizer and generation-config loading plus the text-generation serving.
    {:bumblebee, "~> 0.7"},
    {:tokenizers, "~> 0.5"},
    {:nx, "~> 0.12"},
    # Needed directly by the graph-rewriting transform defined in this notebook.
    {:axon, "~> 0.8"},
    {:kino, "~> 0.14"}
  ],
  # Make Emily.Backend the default Nx backend for the whole notebook.
  config: [nx: [default_backend: Emily.Backend]]
)
Overview
This notebook loads Qwen/Qwen3-0.6B through Bumblebee, quantizes
every dense layer’s kernel to int4 via MLX affine group-wise
quantization, and greedy-decodes a completion. It also demonstrates
Emily.Stream for concurrent serving on a shared model.
The checkpoint is ~1.5 GB on first fetch. Budget several minutes for the cold run.
Dense baseline
# Hugging Face repository shared by the model, tokenizer and generation config.
repo = {:hf, "Qwen/Qwen3-0.6B"}
prompt = "The quick brown fox jumps over the lazy dog."

{:ok, %{model: model, params: params, spec: spec}} = Bumblebee.load_model(repo)
{:ok, tokenizer} = Bumblebee.load_tokenizer(repo)
{:ok, generation_config} = Bumblebee.load_generation_config(repo)

model_info = %{model: model, params: params, spec: spec}

# Greedy search, capped at 32 newly generated tokens.
generation_opts = [max_new_tokens: 32, strategy: %{type: :greedy_search}]
config = Bumblebee.configure(generation_config, generation_opts)

# Compile the serving's defn code with Emily's compiler.
serving_opts = [defn_options: [compiler: Emily.Compiler]]
serving = Bumblebee.Text.generation(model_info, tokenizer, config, serving_opts)

%{results: [%{text: dense_text}]} = Nx.Serving.run(serving, prompt)

# Last expression of the cell: the dense (unquantized) completion.
dense_text
Quantization transform
Rewriting the Axon graph so every :dense node runs through
Emily.Quantization.Layers.quantized_dense/4 needs Axon, which is a
test-only dep on the Emily package — Emily itself stays Axon-free so
library consumers don’t pull it transitively. The transform below is
the recommended starting point: copy it into your own project
alongside an Axon dep, and extend it as you need (a per-layer
:except filter, stricter opts validation, different group sizes
per layer).
Two steps:
- Graph rewrite — Axon.rewrite_nodes/2 replaces each :dense node with a :quantized_dense sub-graph built with Axon.layer/3.
- Parameter quantization — after init (or Bumblebee load), walk the Axon.ModelState and swap each dense kernel for a %QuantizedWeight{}.
defmodule DenseTransform do
  @moduledoc """
  Quantizes the `:dense` layers of an Axon model.

  The transform has two halves that must be applied together:

    * the graph is rewritten so every `:dense` node becomes a
      `:quantized_dense` layer whose forward pass is
      `Emily.Quantization.Layers.quantized_dense`;
    * the model state is walked and the `"kernel"` of every such layer is
      replaced with the result of `Emily.QuantizedWeight.from_dense/2`.
  """

  alias Emily.Quantization.Layers
  alias Emily.QuantizedWeight

  @default_opts [bits: 4, group_size: 128, transpose: true]

  # Bit widths accepted by this transform. 3 and 6 are rejected up front
  # because the defn-native dequantize path used by the quantized layers
  # does not cover them.
  @supported_bits [2, 4, 8]

  @doc """
  Returns `{quantized_model, quantized_state}` for `model` / `model_state`.

  ## Options

    * `:bits` - quantization bit width, one of `#{inspect(@supported_bits)}`.
      Defaults to `4`.
    * `:group_size` - group size forwarded to
      `Emily.QuantizedWeight.from_dense/2`. Defaults to `128`.
    * `:transpose` - when `true` (the default) each `[in, out]` kernel is
      transposed to `[out, in]` before quantization and `transpose: true`
      is forwarded to `from_dense/2`.

  Raises `ArgumentError` when `:bits` is not a supported width.
  """
  @spec quantize(Axon.t(), map(), keyword()) :: {Axon.t(), map()}
  def quantize(model, model_state, opts \\ []) do
    opts = Keyword.merge(@default_opts, opts)
    validate_bits!(opts[:bits])

    {rewrite_graph(model), quantize_state(model, model_state, opts)}
  end

  # Fail fast on unsupported bit widths instead of surfacing an obscure
  # error later, once the quantized forward pass is compiled or run.
  defp validate_bits!(bits) when bits in @supported_bits, do: :ok

  defp validate_bits!(bits) do
    raise ArgumentError,
          "expected :bits to be one of #{inspect(@supported_bits)}, got: #{inspect(bits)}"
  end

  # Replace every :dense node with a :quantized_dense layer whose
  # forward pass dispatches through Emily.Quantization.Layers.quantized_dense.
  # The original node's name function is reused so parameter names in the
  # model state keep lining up with the rewritten graph.
  defp rewrite_graph(model) do
    Axon.rewrite_nodes(model, fn
      %Axon.Node{op: :dense, meta: meta, name: name_fn} ->
        fn [x], _output ->
          quantized_dense_layer(x, meta[:units],
            use_bias: meta[:use_bias],
            name: name_fn
          )
        end

      _ ->
        :skip
    end)
  end

  # Builds the replacement layer for a single dense node.
  defp quantized_dense_layer(x, units, opts) do
    # Axon 0.8 dropped `Axon.Shape.dense_kernel/dense_bias`; derive
    # the shapes inline from the input — `{batch, …, in_features}` →
    # kernel `{in_features, units}`, bias `{units}`.
    kernel_shape = fn input_shape ->
      in_features = elem(input_shape, tuple_size(input_shape) - 1)
      {in_features, units}
    end

    bias_shape = fn _input_shape -> {units} end

    kernel = Axon.param("kernel", kernel_shape, initializer: :glorot_uniform)

    # The arity of the layer function follows the number of inputs:
    # (x, kernel, bias, opts) with a bias, (x, kernel, opts) without.
    {inputs, op} =
      if opts[:use_bias] do
        bias = Axon.param("bias", bias_shape, initializer: :zeros)
        {[x, kernel, bias], &Layers.quantized_dense/4}
      else
        {[x, kernel], &Layers.quantized_dense/3}
      end

    Axon.layer(op, inputs,
      name: opts[:name],
      meta: %{units: units, use_bias: opts[:use_bias]},
      op_name: :quantized_dense
    )
  end

  # Walk the ModelState, replacing each dense kernel tensor with a
  # %QuantizedWeight{}. For the default transpose: true (AWQ / MLX
  # convention, groups along the reduction axis) we pre-transpose the
  # [in, out] kernel to [out, in] before calling from_dense/2.
  #
  # Layer names are collected from the ORIGINAL model, because after the
  # rewrite those nodes no longer carry the :dense op.
  defp quantize_state(model, state, opts) do
    transpose = opts[:transpose]

    dense_names = for {name, :dense} <- Axon.properties(model), do: name

    Enum.reduce(dense_names, state, fn name, acc ->
      update_in(acc, [Access.key!(:data), name, "kernel"], fn kernel ->
        source = if transpose, do: Nx.transpose(kernel), else: kernel

        QuantizedWeight.from_dense(source,
          group_size: opts[:group_size],
          bits: opts[:bits],
          transpose: transpose
        )
      end)
    end)
  end
end
> Notes. The transform only rewrites top-level :dense nodes; a
> model with dense layers nested inside other Axon ops needs a
> recursive rewriter. The transpose: true default stores weights as
> [out, in] (MLX / AWQ convention, groups along the reduction axis);
> set false if you’re feeding a checkpoint that’s already laid out
> the other way. Bits must be one of [2, 4, 8] — the defn-native
> dequantize_defn/1 path doesn’t cover {3, 6}.
Quantized inference
# Quantize the dense model loaded earlier; these values match the transform's defaults
# and are spelled out here for visibility.
quant_opts = [bits: 4, group_size: 128, transpose: true]
{qmodel, qparams} = DenseTransform.quantize(model, params, quant_opts)

# Same shape as the dense model_info, with the rewritten graph and quantized params.
qmodel_info = %{model: qmodel, params: qparams, spec: spec}

qserving_opts = [defn_options: [compiler: Emily.Compiler]]
qserving = Bumblebee.Text.generation(qmodel_info, tokenizer, config, qserving_opts)

quant_prompt = "The quick brown fox jumps over the lazy dog."
%{results: [%{text: quant_text}]} = Nx.Serving.run(qserving, quant_prompt)

# Last expression of the cell: the quantized completion.
quant_text
The quantized output will drift from the dense baseline — int4 noise
across every linear is expected. The test at
test/emily/conformance/qwen3_quant_full_test.exs pins a
deterministic reference string for regression-testing the
quantization stack.
Concurrent serving via Emily.Stream
For concurrent inference on a shared model, each serving worker
should own its own MLX command queue. Emily.Stream.with_stream/2
does that per-process:
# First stream is created up front in the calling process.
stream = Emily.Stream.new(:gpu)

# Runs one prompt through the shared quantized serving inside the given stream.
ask = fn on_stream, prompt ->
  Emily.Stream.with_stream(on_stream, fn ->
    Nx.Serving.run(qserving, prompt)
  end)
end

task1 = Task.async(fn -> ask.(stream, "Question 1?") end)

# The second stream is created inside the task process itself.
task2 = Task.async(fn -> ask.(Emily.Stream.new(:gpu), "Question 2?") end)

# Wait for both generations without a timeout and return them as a pair.
{Task.await(task1, :infinity), Task.await(task2, :infinity)}
Each Emily.Stream maps to its own Metal command queue. Weights are
shared across streams — no duplication — so the memory cost of adding
a stream is the Metal command buffer, not the model.
Create streams once at worker init, not per-request.