Powered by AppSignal & Oban Pro

Hierarchical Weibull Reliability

notebooks/10_reliability_weibull.livemd

Hierarchical Weibull Reliability

Setup

# EMLX (Metal GPU) on macOS Apple Silicon, EXLA (CPU/CUDA) elsewhere
backend_dep =
  case :os.type() do
    {:unix, :darwin} -> {:emlx, "~> 0.2"}
    _ -> {:exla, "~> 0.10"}
  end

Mix.install([
  {:exmc, path: Path.expand("../", __DIR__)},
  backend_dep,
  {:kino_vega_lite, "~> 0.1"}
])

The Reliability Model

Hierarchical Weibull reliability models failure times across component types with partial pooling of shape and scale parameters. This is a common industrial pattern for fleet-level reliability estimation where individual component types have limited data.

This is a d=44 model: 4 hyperparameters + 20 types × 2 NCP params (shape + scale).

Model structure:

  • log_k_mean ~ Normal(0.5, 1.0) — log-shape hyperprior mean
  • log_k_sigma ~ HalfCauchy(1.0) — log-shape spread
  • log_l_mean ~ Normal(2.0, 1.0) — log-scale hyperprior mean
  • log_l_sigma ~ HalfCauchy(1.0) — log-scale spread
  • k_raw_j, l_raw_j ~ Normal(0, 1) for j = 0..19 — NCP shape/scale params
  • k_j = exp(log_k_mean + log_k_sigma * k_raw_j) — Weibull shape for type j
  • lambda_j = exp(log_l_mean + log_l_sigma * l_raw_j) — Weibull scale for type j
  • t_{ij} ~ Weibull(k_j, lambda_j) with 20% right-censored observations
Code.require_file("../benchmark/reliability_data.exs", __DIR__)
Code.require_file("../benchmark/reliability_model.exs", __DIR__)

data = Exmc.Benchmark.ReliabilityData.generate(seed: 42)

IO.puts("Component types: #{data.n_types}")
IO.puts("Observations per type: #{data.n_per_type}")

# Show censoring summary
total_obs = data.n_types * data.n_per_type
total_cens = Enum.sum(Enum.map(data.obs_by_type, & &1.n_cens))
IO.puts("Total observations: #{total_obs} (#{total_cens} right-censored, #{Float.round(100 * total_cens / total_obs, 1)}%)")

Building and Sampling

The model uses the new Exmc.Dist.Weibull distribution with right-censoring support via Exmc.Dist.Censored. The Weibull survival function S(t) = exp(-(t/lambda)^k) gives the log-likelihood contribution for censored observations.

ir = Exmc.Benchmark.ReliabilityModel.build(data)
init = Exmc.Benchmark.ReliabilityModel.init_values(data)

IO.puts("Free parameters: #{map_size(init)}")
t0 = System.monotonic_time(:millisecond)

{trace, stats} =
  Exmc.NUTS.Sampler.sample(ir, init,
    num_warmup: 1000,
    num_samples: 1000,
    seed: 42,
    ncp: false
  )

wall_s = (System.monotonic_time(:millisecond) - t0) / 1000.0
IO.puts("Wall time: #{Float.round(wall_s, 1)}s")
IO.puts("Step size: #{Float.round(stats.step_size, 4)}")
IO.puts("Divergences: #{stats.divergences}")

Hyperprior Posteriors

alias Exmc.Diagnostics

hyperparams = Exmc.Benchmark.ReliabilityModel.reconstruct_params(trace, data)

for {name, tensor} <- hyperparams do
  samples = Nx.to_flat_list(tensor)
  mean = Enum.sum(samples) / length(samples)
  ess = Diagnostics.ess(samples)
  IO.puts("#{name}: mean=#{Float.round(mean, 3)}, ESS=#{Float.round(ess, 0)}")
end

Per-Type Shape and Scale Recovery

# Reconstruct per-type Weibull parameters from NCP trace
type_params =
  Enum.map(0..(data.n_types - 1), fn j ->
    log_k_mean = Nx.to_flat_list(trace["log_k_mean"])
    log_k_sigma = Nx.to_flat_list(trace["log_k_sigma"])
    k_raw = Nx.to_flat_list(trace["k_raw_#{j}"])

    log_l_mean = Nx.to_flat_list(trace["log_l_mean"])
    log_l_sigma = Nx.to_flat_list(trace["log_l_sigma"])
    l_raw = Nx.to_flat_list(trace["l_raw_#{j}"])

    k_samples =
      Enum.zip_with([log_k_mean, log_k_sigma, k_raw], fn [m, s, z] ->
        :math.exp(m + s * z)
      end)

    l_samples =
      Enum.zip_with([log_l_mean, log_l_sigma, l_raw], fn [m, s, z] ->
        :math.exp(m + s * z)
      end)

    k_mean = Enum.sum(k_samples) / length(k_samples)
    l_mean = Enum.sum(l_samples) / length(l_samples)

    %{
      type: j,
      k_post: k_mean,
      l_post: l_mean,
      k_true: Enum.at(data.true_shapes, j),
      l_true: Enum.at(data.true_scales, j)
    }
  end)
alias VegaLite, as: Vl

# Shape recovery plot
Vl.new(width: 500, height: 400, title: "Weibull Shape (k): True vs Posterior")
|> Vl.data_from_values(type_params)
|> Vl.mark(:circle, size: 80, opacity: 0.8)
|> Vl.encode_field(:x, "k_true", type: :quantitative, title: "True k")
|> Vl.encode_field(:y, "k_post", type: :quantitative, title: "Posterior Mean k")
# Scale recovery plot
Vl.new(width: 500, height: 400, title: "Weibull Scale (λ): True vs Posterior")
|> Vl.data_from_values(type_params)
|> Vl.mark(:circle, size: 80, opacity: 0.8, color: "orange")
|> Vl.encode_field(:x, "l_true", type: :quantitative, title: "True λ")
|> Vl.encode_field(:y, "l_post", type: :quantitative, title: "Posterior Mean λ")

Hazard Curves

The hazard function h(t) = (k/lambda) * (t/lambda)^(k-1) gives the instantaneous failure rate. For k > 1, hazard increases (wear-out); for k < 1, it decreases (infant mortality).

# Plot hazard curves for 5 selected component types
selected_types = [0, 4, 9, 14, 19]
t_grid = Enum.map(0..100, &amp;(&amp;1 * 0.3))

hazard_data =
  Enum.flat_map(selected_types, fn j ->
    tp = Enum.at(type_params, j)
    k = tp.k_post
    l = tp.l_post

    Enum.map(t_grid, fn t ->
      t = max(t, 0.01)
      h = (k / l) * :math.pow(t / l, k - 1)
      %{t: t, hazard: min(h, 5.0), type: "Type #{j} (k=#{Float.round(k, 1)})"}
    end)
  end)

Vl.new(width: 600, height: 400, title: "Estimated Hazard Curves by Component Type")
|> Vl.data_from_values(hazard_data)
|> Vl.mark(:line)
|> Vl.encode_field(:x, "t", type: :quantitative, title: "Time")
|> Vl.encode_field(:y, "hazard", type: :quantitative, title: "Hazard Rate h(t)")
|> Vl.encode_field(:color, "type", type: :nominal, title: "Component Type")