Qwen3-VL Vision-Language Inference
# Livebook setup - copy this entire cell to run
Mix.install([
{:pythonx, "~> 0.4.7"},
{:jason, "~> 1.4.4"},
{:req, "~> 0.5.0"},
{:opentelemetry_api, "~> 1.3"},
{:opentelemetry, "~> 1.3"},
{:opentelemetry_exporter, "~> 1.0"},
])
# Configure OpenTelemetry: disable all exporters so the notebook output stays quiet
Application.put_env(:opentelemetry, :span_processor, :batch)
Application.put_env(:opentelemetry, :traces_exporter, :none)
Application.put_env(:opentelemetry, :metrics_exporter, :none)
Application.put_env(:opentelemetry, :logs_exporter, :none)
Logger.configure(level: :info)
Setup Python Environment
# Initialize Python environment with Qwen3-VL dependencies
Pythonx.uv_init("""
[project]
name = "qwen3vl-inference"
version = "0.0.0"
requires-python = "==3.10.*"
dependencies = [
"transformers",
"accelerate",
"pillow",
"torch>=2.0.0,<2.5.0",
"torchvision>=0.15.0,<0.20.0",
"numpy",
"huggingface-hub",
"bitsandbytes",
]
[tool.uv.sources]
torch = { index = "pytorch-cu118" }
torchvision = { index = "pytorch-cu118" }
[[tool.uv.index]]
name = "pytorch-cu118"
url = "https://download.pytorch.org/whl/cu118"
explicit = true
""")
IO.puts("✓ Python environment initialized with Qwen3-VL dependencies")
Configuration
# Configure the inference parameters
config = %{
image_path: "path/to/your/image.jpg", # Replace with actual image path
prompt: "What is in this image?",
max_tokens: 4096,
temperature: 0.7,
top_p: 0.9,
output_path: nil, # Set to save to file
use_flash_attention: false,
use_4bit: true
}
IO.puts("Configuration:")
IO.inspect(config, pretty: true)
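Because image_path above is a placeholder, an optional early check avoids loading the model only to fail at the image step; a minimal sketch:
# Warn early if the configured image path does not point to an existing file
unless File.exists?(config.image_path) do
  IO.puts("⚠ No file found at #{config.image_path}; update config.image_path before running inference")
end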
Model Download (Optional)
# Pre-download model weights (optional) - if skipped, the model is downloaded automatically during inference
# Uncomment to pre-download:
# model_weights_dir = Path.join([File.cwd!(), "pretrained_weights", "Huihui-Qwen3-VL-4B-Instruct-abliterated"])
# repo_id = "huihui-ai/Huihui-Qwen3-VL-4B-Instruct-abliterated"
# IO.puts("Downloading model weights...")
# HuggingFaceDownloader.download_repo(repo_id, model_weights_dir, "Qwen3-VL", true)
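Note that the HuggingFaceDownloader module referenced above is not defined in this notebook. If you do want to pre-download the weights, a minimal sketch using huggingface_hub's snapshot_download (already in the dependency list) would look roughly like this:
# Hypothetical pre-download sketch via huggingface_hub.snapshot_download
repo_id = "huihui-ai/Huihui-Qwen3-VL-4B-Instruct-abliterated"
model_weights_dir = Path.join([File.cwd!(), "pretrained_weights", "Huihui-Qwen3-VL-4B-Instruct-abliterated"])

Pythonx.eval("""
from huggingface_hub import snapshot_download

snapshot_download(repo_id="#{repo_id}", local_dir="#{model_weights_dir}")
print("Weights available at: #{model_weights_dir}")
""", %{})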
Vision-Language Inference
# Add weights directory to config for Python
base_dir = File.cwd!()
config_with_paths = Map.merge(config, %{
model_weights_dir: Path.join([base_dir, "pretrained_weights", "Huihui-Qwen3-VL-4B-Instruct-abliterated"])
})
# Save config to JSON for Python
config_json = Jason.encode!(config_with_paths)
config_file = "/tmp/qwen3vl_config_#{System.system_time(:millisecond)}.json"
File.write!(config_file, config_json)
# Run inference
try do
Pythonx.eval(~S"""
import json
import sys
import os
from pathlib import Path
# Verify PyTorch installation
print("Verifying PyTorch installation...")
try:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
print(f"CUDA device: {torch.cuda.get_device_name(0)}")
print("[OK] PyTorch installation verified")
except ImportError as e:
print(f"[ERROR] Failed to import PyTorch: {e}")
raise
from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
# Limit CPU threads to half the available cores so the host stays responsive
cpu_count = os.cpu_count()
half_cpu_count = max(1, cpu_count // 2)
os.environ["MKL_NUM_THREADS"] = str(half_cpu_count)
os.environ["OMP_NUM_THREADS"] = str(half_cpu_count)
torch.set_num_threads(half_cpu_count)
# Load configuration
""" <> """
config_file_path = r"#{String.replace(config_file, "\\", "\\\\")}"
with open(config_file_path, 'r', encoding='utf-8') as f:
config = json.load(f)
image_path = config['image_path']
prompt = config['prompt']
max_tokens = config.get('max_tokens', 4096)
temperature = config.get('temperature', 0.7)
top_p = config.get('top_p', 0.9)
output_path = config.get('output_path')
use_flash_attention = config.get('use_flash_attention', False)
use_4bit = config.get('use_4bit', True)
print(f"\\n=== Qwen3-VL Inference ===")
print(f"Image: {image_path}")
print(f"Prompt: {prompt}")
# Initialize model: prefer locally pre-downloaded weights when present, otherwise pull from the Hub
MODEL_ID = "huihui-ai/Huihui-Qwen3-VL-4B-Instruct-abliterated"
model_weights_dir = config.get('model_weights_dir')
if model_weights_dir and os.path.isdir(model_weights_dir):
    MODEL_ID = model_weights_dir
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if device == "cuda" else torch.float32
# Configure quantization
quantization_config = None
if use_4bit and device == "cuda":
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4"
)
dtype = None
# Load model
load_kwargs = {
"device_map": "auto",
"trust_remote_code": True,
"low_cpu_mem_usage": True,
"attn_implementation": "flash_attention_2" if use_flash_attention else "sdpa",
}
if quantization_config:
load_kwargs["quantization_config"] = quantization_config
else:
load_kwargs["dtype"] = dtype
print("Loading model...")
model = Qwen3VLForConditionalGeneration.from_pretrained(MODEL_ID, **load_kwargs)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
# Load and process image
print(f"Loading image: {image_path}")
image = Image.open(image_path).convert("RGB")
print(f"Image loaded: {image.size[0]}x{image.size[1]}")
# Prepare messages
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image_path},
{"type": "text", "text": prompt},
],
}
]
# Process inputs
print("Preparing inputs...")
inputs = processor.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt"
).to(model.device)
# Generate response
print(f"Generating response (max_tokens={max_tokens}, temperature={temperature})...")
generated_ids = model.generate(
**inputs,
max_new_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
do_sample=temperature > 0.0,
)
# Extract generated tokens
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Decode response
output_text = processor.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)
response = output_text[0] if output_text else ""
print("\\n=== Response ===")
print(response)
print("\\n=== End Response ===")
# Save output
if output_path:
with open(output_path, 'w', encoding='utf-8') as f:
f.write(response)
print(f"\\nResponse saved to: {output_path}")
else:
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)
import time
tag = time.strftime("%Y%m%d_%H_%M_%S")
export_dir = output_dir / tag
export_dir.mkdir(exist_ok=True)
output_file = export_dir / f"qwen3vl_response_{tag}.txt"
with open(output_file, 'w', encoding='utf-8') as f:
f.write(response)
print(f"\\nResponse saved to: {output_file}")
print("\\n=== Complete ===")
""", %{})
IO.puts("✓ Inference completed successfully!")
rescue
e ->
IO.puts("❌ Error during inference: #{inspect(e)}")
after
# Cleanup
File.rm(config_file)
end
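To pull the saved response back into Elixir (for example, to post-process it in another cell), a small sketch that assumes the default output/ layout used above:
# Find and print the most recently saved response file
latest_response =
  Path.wildcard("output/**/qwen3vl_response_*.txt")
  |> Enum.sort()
  |> List.last()

case latest_response do
  nil -> IO.puts("No saved responses found yet")
  file -> file |> File.read!() |> IO.puts()
end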
Usage Instructions
- Setup: Run the first cell to install dependencies
- Configure: Update the config map with your image path and prompt
- Run: Execute the inference cell to generate responses
- Results: Check the output directory for saved responses
Notes
- Requires a CUDA-compatible GPU for best performance
- The model is downloaded automatically on first run (~8GB)
- Use 4-bit quantization to reduce VRAM requirements
- Supports various image formats (JPG, PNG, etc.)
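- Setting use_flash_attention: true requires the flash-attn package, which is not included in the dependency list above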