Qwen3-VL Vision-Language Inference

# Livebook setup - copy this entire cell to run
Mix.install([
  {:pythonx, "~> 0.4.7"},
  {:jason, "~> 1.4.4"},
  {:req, "~> 0.5.0"},
  {:opentelemetry_api, "~> 1.3"},
  {:opentelemetry, "~> 1.3"},
  {:opentelemetry_exporter, "~> 1.0"}
])

# Disable OpenTelemetry exporters (no telemetry backend is used in this notebook)
Application.put_env(:opentelemetry, :span_processor, :batch)
Application.put_env(:opentelemetry, :traces_exporter, :none)
Application.put_env(:opentelemetry, :metrics_exporter, :none)
Application.put_env(:opentelemetry, :logs_exporter, :none)

Logger.configure(level: :info)

Setup Python Environment

# Initialize Python environment with Qwen3-VL dependencies
Pythonx.uv_init("""
[project]
name = "qwen3vl-inference"
version = "0.0.0"
requires-python = "==3.10.*"
dependencies = [
  "transformers",
  "accelerate",
  "pillow",
  "torch>=2.0.0,<2.5.0",
  "torchvision>=0.15.0,<0.20.0",
  "numpy",
  "huggingface-hub",
  "bitsandbytes",
]

[tool.uv.sources]
torch = { index = "pytorch-cu118" }
torchvision = { index = "pytorch-cu118" }

[[tool.uv.index]]
name = "pytorch-cu118"
url = "https://download.pytorch.org/whl/cu118"
explicit = true
""")

IO.puts("✓ Python environment initialized with Qwen3-VL dependencies")

Configuration

# Configure the inference parameters
config = %{
  image_path: "path/to/your/image.jpg",  # Replace with actual image path
  prompt: "What is in this image?",
  max_tokens: 4096,
  temperature: 0.7,
  top_p: 0.9,
  output_path: nil,  # Set to save to file
  use_flash_attention: false,
  use_4bit: true
}

IO.puts("Configuration:")
IO.inspect(config, pretty: true)
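Because the default `image_path` is only a placeholder, a small guard cell like this (a sketch against the `config` map above) can catch a missing file before the slower inference cell runs:

# Warn early if the configured image path does not exist
unless File.exists?(config.image_path) do
  IO.puts("⚠ Image not found at #{config.image_path} - update :image_path before running inference")
end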

Model Download (Optional)

# Optional: pre-download the model weights. If skipped, the weights are fetched automatically during inference.

# Uncomment to pre-download (assumes a HuggingFaceDownloader helper module defined elsewhere):
# model_weights_dir = Path.join([File.cwd!(), "pretrained_weights", "Huihui-Qwen3-VL-4B-Instruct-abliterated"])
# repo_id = "huihui-ai/Huihui-Qwen3-VL-4B-Instruct-abliterated"
# IO.puts("Downloading model weights...")
# HuggingFaceDownloader.download_repo(repo_id, model_weights_dir, "Qwen3-VL", true)
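The commented call above assumes a `HuggingFaceDownloader` helper module that is not defined in this notebook. One alternative sketch, using the `huggingface-hub` package already declared in the uv dependencies, is to call `snapshot_download` through Pythonx:

# Hypothetical pre-download cell (not part of the original flow)
Pythonx.eval(~S"""
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="huihui-ai/Huihui-Qwen3-VL-4B-Instruct-abliterated",
    local_dir="pretrained_weights/Huihui-Qwen3-VL-4B-Instruct-abliterated",
)
print(f"Weights available at: {local_dir}")
""", %{})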

Vision-Language Inference

# Add the local weights directory to the config passed to Python (the script below loads by hub MODEL_ID; this path is informational)
base_dir = File.cwd!()
config_with_paths = Map.merge(config, %{
  model_weights_dir: Path.join([base_dir, "pretrained_weights", "Huihui-Qwen3-VL-4B-Instruct-abliterated"])
})

# Save config to JSON for Python
config_json = Jason.encode!(config_with_paths)
config_file = "/tmp/qwen3vl_config_#{System.system_time(:millisecond)}.json"
File.write!(config_file, config_json)

# Run inference
try do
  Pythonx.eval(~S"""
import json
import sys
import os
from pathlib import Path

# Verify PyTorch installation
print("Verifying PyTorch installation...")
try:
    import torch
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print("[OK] PyTorch installation verified")
except ImportError as e:
    print(f"[ERROR] Failed to import PyTorch: {e}")
    raise

from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig

# Set CPU thread optimization
cpu_count = os.cpu_count() or 1
half_cpu_count = max(1, cpu_count // 2)
os.environ["MKL_NUM_THREADS"] = str(half_cpu_count)
os.environ["OMP_NUM_THREADS"] = str(half_cpu_count)
torch.set_num_threads(half_cpu_count)

# Load configuration
""" <> """
config_file_path = r"#{String.replace(config_file, "\\", "\\\\")}"
with open(config_file_path, 'r', encoding='utf-8') as f:
    config = json.load(f)

image_path = config['image_path']
prompt = config['prompt']
max_tokens = config.get('max_tokens', 4096)
temperature = config.get('temperature', 0.7)
top_p = config.get('top_p', 0.9)
output_path = config.get('output_path')
use_flash_attention = config.get('use_flash_attention', False)
use_4bit = config.get('use_4bit', True)

print(f"\\n=== Qwen3-VL Inference ===")
print(f"Image: {image_path}")
print(f"Prompt: {prompt}")

# Initialize model
MODEL_ID = "huihui-ai/Huihui-Qwen3-VL-4B-Instruct-abliterated"
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if device == "cuda" else torch.float32

# Configure quantization
quantization_config = None
if use_4bit and device == "cuda":
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )
    dtype = None

# Load model
load_kwargs = {
    "device_map": "auto",
    "trust_remote_code": True,
    "low_cpu_mem_usage": True,
    "attn_implementation": "flash_attention_2" if use_flash_attention else "sdpa",
}

if quantization_config:
    load_kwargs["quantization_config"] = quantization_config
else:
    load_kwargs["dtype"] = dtype

print("Loading model...")
model = Qwen3VLForConditionalGeneration.from_pretrained(MODEL_ID, **load_kwargs)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load and process image
print(f"Loading image: {image_path}")
image = Image.open(image_path).convert("RGB")
print(f"Image loaded: {image.size[0]}x{image.size[1]}")

# Prepare messages
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path},
            {"type": "text", "text": prompt},
        ],
    }
]

# Process inputs
print("Preparing inputs...")
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

# Generate response
print(f"Generating response (max_tokens={max_tokens}, temperature={temperature})...")
generated_ids = model.generate(
    **inputs,
    max_new_tokens=max_tokens,
    temperature=temperature,
    top_p=top_p,
    do_sample=temperature > 0.0,
)

# Extract generated tokens
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]

# Decode response
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)

response = output_text[0] if output_text else ""

print("\\n=== Response ===")
print(response)
print("\\n=== End Response ===")

# Save output
if output_path:
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(response)
    print(f"\\nResponse saved to: {output_path}")
else:
    output_dir = Path("output")
    output_dir.mkdir(exist_ok=True)
    import time
    tag = time.strftime("%Y%m%d_%H_%M_%S")
    export_dir = output_dir / tag
    export_dir.mkdir(exist_ok=True)
    output_file = export_dir / f"qwen3vl_response_{tag}.txt"
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(response)
    print(f"\\nResponse saved to: {output_file}")

print("\\n=== Complete ===")
""", %{})

  IO.puts("✓ Inference completed successfully!")

rescue
  e ->
    IO.puts("❌ Error during inference: #{inspect(e)}")
after
  # Cleanup
  File.rm(config_file)
end
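To pull a saved response back into Elixir (assuming the default `output/<timestamp>/` layout the Python script uses when `output_path` is nil), something like the following works:

# Read the most recently saved response back into Elixir
latest =
  "output/*/qwen3vl_response_*.txt"
  |> Path.wildcard()
  |> Enum.sort()
  |> List.last()

case latest do
  nil -> IO.puts("No saved responses found yet")
  path -> IO.puts("Latest response (#{path}):\n\n" <> File.read!(path))
end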

Usage Instructions

  1. Setup: Run the first cell to install dependencies
  2. Configure: Update the config map with your image path and prompt
  3. Run: Execute the inference cell to generate responses
  4. Results: Check the output directory for saved responses

Notes

  • Requires CUDA-compatible GPU for best performance
  • Model will be downloaded automatically on first run (~8GB)
  • Use 4-bit quantization to reduce VRAM requirements
  • Supports various image formats (JPG, PNG, etc.); a sketch for fetching a test image over HTTP follows these notes
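Since :req is installed in the first cell but unused above, one way to try the notebook without a local image (a sketch; the URL below is a placeholder you must replace) is to fetch a test image over HTTP, point the config at it, and re-run the inference cell:

# Fetch a test image over HTTP and update the config (hypothetical URL - replace with a real image)
image_url = "https://example.com/test.jpg"
tmp_image = Path.join(System.tmp_dir!(), "qwen3vl_test_image.jpg")
File.write!(tmp_image, Req.get!(image_url).body)
config = %{config | image_path: tmp_image}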