Generate Elixir HumanEval Problems
Mix.install([
{:kino, "~> 0.11.0"},
{:jason, "~> 1.4"},
{:openai_ex, "~> 0.4.2"}
])
apikey = System.fetch_env!("LB_OPENAI_API_KEY")
openai = OpenaiEx.new(apikey)
Scratchpad
Prep
Goals
- We want to be able to evaluate the performance of LLMs for code generation
- Ability to semi-automatically translate existing benchmarks such as human-eval, human-eval-infilling, MBPP, MBXP, DS-1000, HumanEval-X, HumanEval-multilingual and others into Elixir
- Evaluate several state-of-the-art models' Elixir performance
- Ability to add our own Elixir evaluation tasks in future
- Evaluation needs to be fast so we can run it occasionally during training to see if a training run is going in the right direction
In this notebook we’ll do the following
- Define a new data format for Elixir evaluation tasks that is a superset of existing evaluation benchmarks (so we can translate from a wide range of benchmarks). Each task consists of:
  - `id` - A human-readable unique ID
  - `code_context_mix_setup` - A `Mix.install` block
    - Important to test if models are dependency-aware, and for later evaluating models that are e.g. fine-tuned on common livebook workflows
    - Separated out so we can install all dependencies at once if we like
  - `code_context_before` - Code before our implementation, generally consisting of:
    - Helper functions that can be used in the solution (if any - mostly empty)
    - A module definition
    - The docstring
    - The function head
    - The beginning of the implementation
  - `code_context_after` - Any code after the implementation - only used for infilling tasks
  - `canonical_solution` - The canonical solution
  - `surface_level_conditions` - Will in future be used to test how something was implemented (e.g. to check whether a pipe was used or not)
  - `tests` - Tests to confirm the solution works as intended
  - `description` - Field for a human-readable description or comments about specific tasks - the LLMs won't see this
- Convert existing benchmarks into this new format and save them as `.jsonl` files
- Produce an Evaluator module that can check generated solution samples against the tests
- Load sample solutions from a `.jsonl` file (can have many samples for each task ID)
- Evaluate samples against tests and return the pass rate for each task ID, as well as the overall pass@k metric for various values of `k` (see the estimator sketch below)
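For reference, pass@k is the probability that at least one of k samples drawn from the n generated per task passes the tests. A minimal sketch of the standard unbiased estimator from the Codex paper - the `PassAtK` module name is invented here and isn't used by the rest of the notebook:

defmodule PassAtK do
  @doc """
  Unbiased pass@k estimator: 1 - C(n - c, k) / C(n, k), computed as a
  running product for numerical stability.
  n = samples per task, c = number of correct samples, k <= n.
  """
  def estimate(_n, 0, _k), do: 0.0
  def estimate(n, c, k) when n - c < k, do: 1.0

  def estimate(n, c, k) do
    # 1 - prod_{i = n - c + 1}^{n} (1 - k / i)
    1.0 - Enum.reduce((n - c + 1)..n, 1.0, fn i, acc -> acc * (1.0 - k / i) end)
  end
end

PassAtK.estimate(10, 3, 1)
# => ~0.3 (with k = 1 this is just the pass rate, 3/10)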
defmodule ElixirEval.Task do
defstruct [
:id,
:description,
:code_context_mix_setup,
:code_context_before,
:code_context_after,
:tests,
:surface_level_conditions,
:canonical_solution
]
end
defmodule ElixirEval.Sample do
defstruct [
:task_id,
:sample_id,
:completion
]
end
defmodule ElixirEval do
def save_tasks!(tasks, path) do
File.write!(
path,
tasks |> Enum.map(&Map.from_struct/1) |> Enum.map(&Jason.encode!/1) |> Enum.join("\n")
)
end
def load_tasks!(path) do
path
|> File.stream!()
|> Stream.map(&Jason.decode!(&1, keys: :atoms))
|> Stream.map(&struct(ElixirEval.Task, &1))
|> Enum.to_list()
end
def save_samples!(tasks, path) do
File.write!(
path,
tasks |> Enum.map(&Map.from_struct/1) |> Enum.map(&Jason.encode!/1) |> Enum.join("\n")
)
end
def load_samples!(path) do
path
|> File.stream!()
|> Stream.map(&Jason.decode!(&1, keys: :atoms))
|> Stream.map(&struct(ElixirEval.Sample, &1))
|> Enum.to_list()
end
end
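A quick round-trip sanity check of the persistence helpers - the toy task and temp path below are invented for this check and aren't part of the benchmark:

toy_task = %ElixirEval.Task{
  id: "Toy/0",
  code_context_mix_setup: "",
  code_context_before: "defmodule Toy0 do\n  def add(a, b) do",
  code_context_after: "",
  tests: "assert Toy0.add(1, 2) == 3",
  surface_level_conditions: "",
  canonical_solution: "    a + b\n  end\nend"
}

tmp_path = Path.join(System.tmp_dir!(), "toy_tasks.jsonl")
ElixirEval.save_tasks!([toy_task], tmp_path)
[loaded] = ElixirEval.load_tasks!(tmp_path)
loaded.id
# => "Toy/0"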
defmodule Helper do
def show_human_eval_tasks(tasks) do
show_human_eval_tasks(tasks, 0, Kino.Frame.new())
end
def show_human_eval_tasks(tasks, i, frame) do
previous_button = Kino.Control.button("Previous")
next_button = Kino.Control.button("Next")
Kino.listen(previous_button, fn _event ->
Kino.Frame.clear(frame)
show_human_eval_tasks(tasks, max(i - 1, 0), frame)
end)
Kino.listen(next_button, fn _event ->
Kino.Frame.clear(frame)
show_human_eval_tasks(tasks, min(i + 1, length(tasks) - 1), frame)
end)
Kino.Frame.clear(frame)
task = Enum.at(tasks, i)
Kino.Frame.append(
frame,
Kino.Layout.grid([
Kino.Layout.grid(
[previous_button, Kino.Shorts.text("Task #{i + 1}/#{length(tasks)}"), next_button],
columns: 3
),
show_human_eval_task(task)
])
)
frame
end
def show_human_eval_task(task) do
Kino.Layout.grid(
[
Kino.Shorts.markdown("""
## Task: #{task["task_id"]}
### Prompt:
```python
#{task["prompt"]}
```
"""),
Kino.Shorts.markdown("""
### Canonical Solution:
```python
#{task["canonical_solution"]}
```
"""),
Kino.Shorts.markdown("""
### Test:
```python
#{task["test"]}
```
""")
],
columns: 1,
boxed: true
)
end
def show_tasks(tasks) do
show_tasks(tasks, 0, Kino.Frame.new())
end
def show_tasks(tasks, i, frame) do
previous_button = Kino.Control.button("Previous")
next_button = Kino.Control.button("Next")
Kino.listen(previous_button, fn _event ->
Kino.Frame.clear(frame)
show_tasks(tasks, max(i - 1, 0), frame)
end)
Kino.listen(next_button, fn _event ->
Kino.Frame.clear(frame)
show_tasks(tasks, min(i + 1, length(tasks) - 1), frame)
end)
Kino.Frame.clear(frame)
task = Enum.at(tasks, i)
Kino.Frame.append(
frame,
Kino.Layout.grid([
Kino.Layout.grid(
[previous_button, Kino.Shorts.text("Task #{i + 1}/#{length(tasks)}"), next_button],
columns: 3
),
show_task(task)
])
)
frame
end
def show_task(%ElixirEval.Task{} = task) do
Kino.Layout.grid(
[
Kino.Shorts.markdown("""
## Elixir Eval Task: #{task.id}
### Code Context Before:
```elixir
#{task.code_context_before}
```
"""),
Kino.Shorts.markdown("""
### Canonical Solution:
```elixir
#{task.canonical_solution}
```
"""),
Kino.Shorts.markdown("""
### Tests:
```elixir
#{task.tests}
```
""")
],
columns: 1,
boxed: true
)
end
end
Convert HumanEval to Elixir Tasks
First we load the HumanEval dataset
human_eval_tasks =
Path.join(__DIR__, "files/source_benchmarks/HumanEval.jsonl")
|> File.stream!()
|> Stream.map(&Jason.decode!/1)
|> Enum.to_list()
hd(human_eval_tasks)
Helper.show_human_eval_tasks(human_eval_tasks)
We’ll now do some sanity checks to see how best to convert the tasks to Elixir.
human_eval_tasks
|> Enum.map(&String.split(&1["prompt"], "\"\"\""))
|> Enum.map(&length/1)
|> Enum.frequencies()
Okay, so it looks like there are 142 tasks with just one docstring, which are easy to parse - splitting on the delimiter yields three parts exactly when there is a single docstring (see the throwaway illustration below). Let’s look at the others.
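An invented one-liner (not from the dataset) showing why a split length of 3 means exactly one docstring:

String.split(~s(def f():\n    """doc"""\n), ~s("""))
# => ["def f():\n    ", "doc", "\n"] - three parts, one docstring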
strange_tasks =
human_eval_tasks
|> Enum.filter(
&(length(String.split(&1["prompt"], "\"\"\"")) != 3 &&
length(String.split(&1["prompt"], "'''")) != 3)
)
# Helper.show_human_eval_tasks(strange_tasks)
Okay, looks like the 17 tasks without `"""` simply use `'''` - no problem there.
Four of the five tasks with four sets of triple quotes have a helper method that is defined before the main function’s docstring. This means we can simply use the contents of the last docstring in the task prompt. We will, however, have to translate those helper methods to Elixir.
The final one, HumanEval/64, has a docstring at the top called FIX that indicates that the authors of HumanEval needed to add more test cases 🤣
We can manually fix the five weird tasks, and now build a module that translates HumanEval tasks to Elixir.
Then we’ll:
- Build an evaluation harness to run tests against a sample solution
- Prompt GPT4 to generate `n` sample solutions for each task
- Evaluate GPT4’s pass@k metric for k << n
- If GPT4 does really well, we can probably use a passing solution as the canonical solution. If not, maybe we try asking GPT4 to translate from python for the failing ones. We can also manually fix badly done canonical solutions later on.
defmodule HumanEval do
@doc """
human-eval tasks have the fields
- "task_id" => "HumanEval/0",
- "canonical_solution" => " for idx, elem in enumerate(numbers):\n..."
- "entry_point" => "has_close_elements"
- "prompt" => "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n",
- "test" => "..."
"""
def to_elixir_task(task) do
%ElixirEval.Task{
id: task["task_id"],
code_context_mix_setup: "",
code_context_before: code_context_before(task),
code_context_after: "",
tests: tests(task),
surface_level_conditions: ""
}
end
def code_context_before(task) do
"""
defmodule #{module_name(task)} do
#{helper_methods(task)}
@doc ~s\"\"\"
#{docstring(task)}
\"\"\"
#{function_head(task)}
"""
end
@doc ~s"""
Four human-eval tasks have helper methods that we've manually translated
"""
def helper_methods(task) do
case task["task_id"] do
# def is_palindrome(string: str) -> bool:
# """ Test if given string is a palindrome """
# return string == string[::-1]
"HumanEval/10" ->
~c"""
@doc \"""
Test if given string is a palindrome
\"""
def is_palindrome(string), do: String.reverse(string) == string
"""
# def poly(xs: list, x: float):
# """
# Evaluates polynomial with coefficients xs at point x.
# return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n
# """
# return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)])
"HumanEval/32" ->
~c"""
@doc \"""
Evaluates polynomial with coefficients xs at point x.
return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n
\"""
def poly(xs, x) do
xs
|> Enum.with_index()
|> Enum.reduce(0, fn {coeff, i}, acc -> acc + coeff * :math.pow(x, i) end)
end
"""
# def encode_cyclic(s: str):
# """
# returns encoded string by cycling groups of three characters.
# """
# # split string to groups. Each of length 3.
# groups = [s[(3 * i):min((3 * i + 3), len(s))] for i in range((len(s) + 2) // 3)]
# # cycle elements in each group. Unless group has fewer elements than 3.
# groups = [(group[1:] + group[0]) if len(group) == 3 else group for group in groups]
# return "".join(groups)
"HumanEval/38" ->
~c"""
@doc \"""
returns encoded string by cycling groups of three characters.
\"""
def encode_cyclic(s) do
s
|> String.graphemes()
|> Enum.chunk_every(3)
|> Enum.map(fn
[a, b, c] -> [b, c, a]
[a, b] -> [a, b]
[a] -> [a]
end)
|> List.flatten()
|> Enum.join()
end
"""
# def encode_shift(s: str):
# """
# returns encoded string by shifting every character by 5 in the alphabet.
# """
# return "".join([chr(((ord(ch) + 5 - ord("a")) % 26) + ord("a")) for ch in s])
"HumanEval/50" ->
~c"""
@doc \"""
returns encoded string by shifting every character by 5 in the alphabet.
\"""
def encode_shift(s) do
s
|> String.to_charlist()
|> Enum.map(&((rem((&1 + 5 - ?a), 26)) + ?a))
|> to_string()
end
"""
_ ->
""
end
end
def module_name(task), do: task["task_id"] |> String.replace("/", "")
@doc ~s"""
Generates the elixir function head for this task. Here's what they look like
def rolling_max(numbers: List[int]) -> List[int]:
def make_palindrome(string: str) -> str:
def concatenate(strings: List[str]) -> str:
def change_base(x: int, base: int):
We need to remove the type information and replace everything after the parenthesis with ` do`
TODO: Generate elixir typespecs
"""
def function_head(task) do
params_pattern = ~r/def .+\((.*?)\)( -> .+)?:/
params =
task["prompt"]
|> String.split("\n")
|> Enum.find(&String.contains?(&1, "def #{task["entry_point"]}"))
# this is the row with the python function head
|> String.replace(params_pattern, "\\1")
# now just the comma-separated params with optional type info
|> String.split(", ")
# [["a", "int"], ["b", "int"]]
|> Enum.map(&String.split(&1, ": "))
|> Enum.map(&hd/1)
|> Enum.join(", ")
"def #{function_name(task)}(#{params}) do"
end
@doc ~s"""
The python docstrings are indented two spaces so we need to fix that
TODO: turn examples into doctests (problem is they come in a number of different formats)
"""
def docstring(task) do
case String.split(task["prompt"], "\"\"\"") do
[_, docstring, _] -> docstring
# 5/164 cases have a second docstring
[_, _helper_docstring, _helper_impl, docstring, _] -> docstring
# some tasks use triple single quotes (but none of those have a helper function)
[prompt] -> String.split(prompt, "'''") |> Enum.at(1)
end
|> String.replace("\n ", "\n")
end
@doc """
The human eval dataset isn't super clean. There are some camel case and capitalised names in there
"""
def function_name(task) do
task["entry_point"]
|> String.replace(~r/[A-Z]/, fn match -> "_" <> String.downcase(match) end)
|> String.trim_leading("_")
end
def tests(task) do
"""
defmodule #{module_name(task)}Test do
import ExUnit.Assertions
def run_tests() do
#{test_assertions(task)}
end
end
"""
end
@doc """
Returns the actual elixir code. All python tests have some metadata on top (which we discard)
and then start with
def check(candidate):
Followed in most cases by either assertions or comments.
A very small number of human eval tasks have additional bespoke code.
We need to
- fix a small number of tests that aren't just a list of assertions and comments
- support two different styles of comments
- replace single quotes with double quotes
- fix exponential notation (1e-6 in python and 1.0e-6 in elixir)
- turn python tuples into elixir tuples (e.g. in sum_product([1, 2, 3, 4]) == (10, 24))
"""
def test_assertions(%{"task_id" => "HumanEval/32"}) do
# For some reason this task has non-deterministic, randomly generated tests:
#
# import math
# import random
# rng = random.Random(42)
# import copy
# for _ in range(100):
# ncoeff = 2 * rng.randint(1, 4)
# coeffs = []
# for _ in range(ncoeff):
# coeff = rng.randint(-10, 10)
# if coeff == 0:
# coeff = 1
# coeffs.append(coeff)
# solution = HumanEval32.find_zero(copy.deepcopy(coeffs))
# assert math.fabs(poly(coeffs, solution)) < 1.0e-4
"""
:rand.seed(:exsplus, {42, 42, 42})
for _ <- 1..100 do
coeffs =
  for _ <- 1..(2 * Enum.random(1..4)) do
    # as in the python original, replace zero coefficients with 1
    case Enum.random(-10..10) do
      0 -> 1
      coeff -> coeff
    end
  end
solution = HumanEval32.find_zero(coeffs)
assert abs(HumanEval32.poly(coeffs, solution)) < 1.0e-4
end
"""
end
def test_assertions(%{"task_id" => "HumanEval/33"}) do
# This one looks a bit funny - it's just faster to hand-roll it
# def check(candidate):
# assert tuple(candidate([1, 2, 3])) == tuple(sort_third([1, 2, 3]))
# assert tuple(candidate([5, 3, -5, 2, -3, 3, 9, 0, 123, 1, -10])) == tuple(sort_third([5, 3, -5, 2, -3, 3, 9, 0, 123, 1, -10]))
# assert tuple(candidate([5, 8, -12, 4, 23, 2, 3, 11, 12, -10])) == tuple(sort_third([5, 8, -12, 4, 23, 2, 3, 11, 12, -10]))
# assert tuple(candidate([5, 6, 3, 4, 8, 9, 2])) == tuple([2, 6, 3, 4, 8, 9, 5])
# assert tuple(candidate([5, 8, 3, 4, 6, 9, 2])) == tuple([2, 8, 3, 4, 6, 9, 5])
# assert tuple(candidate([5, 6, 9, 4, 8, 3, 2])) == tuple([2, 6, 9, 4, 8, 3, 5])
# assert tuple(candidate([5, 6, 3, 4, 8, 9, 2, 1])) == tuple([2, 6, 3, 4, 8, 9, 5, 1])
"""
assert HumanEval33.sort_third([1, 2, 3]) == HumanEval33.sort_third([1, 2, 3])
assert HumanEval33.sort_third([5, 3, -5, 2, -3, 3, 9, 0, 123, 1, -10]) == HumanEval33.sort_third([5, 3, -5, 2, -3, 3, 9, 0, 123, 1, -10])
assert HumanEval33.sort_third([5, 8, -12, 4, 23, 2, 3, 11, 12, -10]) == HumanEval33.sort_third([5, 8, -12, 4, 23, 2, 3, 11, 12, -10])
assert HumanEval33.sort_third([5, 6, 3, 4, 8, 9, 2]) == [2, 6, 3, 4, 8, 9, 5]
assert HumanEval33.sort_third([5, 8, 3, 4, 6, 9, 2]) == [2, 8, 3, 4, 6, 9, 5]
assert HumanEval33.sort_third([5, 6, 9, 4, 8, 3, 2]) == [2, 6, 9, 4, 8, 3, 5]
assert HumanEval33.sort_third([5, 6, 3, 4, 8, 9, 2, 1]) == [2, 6, 3, 4, 8, 9, 5, 1]
"""
end
def test_assertions(%{"task_id" => "HumanEval/38"}) do
# from random import randint, choice
# import string
# letters = string.ascii_lowercase
# for _ in range(100):
# str = "".join(choice(letters) for i in range(randint(10, 20)))
# encoded_str = encode_cyclic(str)
# assert candidate(encoded_str) == str
"""
:rand.seed(:exsplus, {42, 42, 42})
for _ <- 1..100 do
str = List.duplicate(0, Enum.random(10..20))
|> Enum.map(fn _ -> Enum.random(?a..?z) end)
|> to_string()
assert HumanEval38.decode_cyclic(HumanEval38.encode_cyclic(str)) == str
end
"""
end
def test_assertions(%{"task_id" => "HumanEval/44"}) do
"""
assert HumanEval44.change_base(8, 3) == "22"
assert HumanEval44.change_base(9, 3) == "100"
assert HumanEval44.change_base(234, 2) == "11101010"
assert HumanEval44.change_base(16, 2) == "10000"
assert HumanEval44.change_base(8, 2) == "1000"
assert HumanEval44.change_base(7, 2) == "111"
for x <- 2..8 do
assert HumanEval44.change_base(x, x + 1) == Integer.to_string(x)
end
"""
end
def test_assertions(%{"task_id" => "HumanEval/50"}) do
# from random import randint, choice
# import copy
# import string
# letters = string.ascii_lowercase
# for _ in range(100):
# str = "".join(choice(letters) for i in range(randint(10, 20)))
# encoded_str = encode_shift(str)
# assert candidate(copy.deepcopy(encoded_str)) == str
"""
:rand.seed(:exsplus, {42, 42, 42})
for _ <- 1..100 do
random_string = for _ <- 1..Enum.random(10..20), into: "", do: <<Enum.random(?a..?z)>>
assert HumanEval50.decode_shift(HumanEval50.encode_shift(random_string)) == random_string
end
"""
end
def test_assertions(%{"task_id" => "HumanEval/53"}) do
# assert HumanEval53.add(0, 1) == 1
# assert HumanEval53.add(1, 0) == 1
# assert HumanEval53.add(2, 3) == 5
# assert HumanEval53.add(5, 7) == 12
# assert HumanEval53.add(7, 5) == 12
# for i in range(100):
# x, y = random.randint(0, 1000), random.randint(0, 1000)
# assert HumanEval53.add(x, y) == x + y
"""
assert HumanEval53.add(0, 1) == 1
assert HumanEval53.add(1, 0) == 1
assert HumanEval53.add(2, 3) == 5
assert HumanEval53.add(5, 7) == 12
assert HumanEval53.add(7, 5) == 12
:rand.seed(:exsplus, {42, 42, 42})
for _ <- 1..100 do
{x, y} = {Enum.random(0..1000), Enum.random(0..1000)}
assert HumanEval53.add(x, y) == x + y
end
"""
end
def test_assertions(%{"task_id" => "HumanEval/87"}) do
"""
# Check some simple cases
assert HumanEval87.get_row([
[1,2,3,4,5,6],
[1,2,3,4,1,6],
[1,2,3,4,5,1]
], 1) == [{0, 0}, {1, 4}, {1, 0}, {2, 5}, {2, 0}]
assert HumanEval87.get_row([
[1,2,3,4,5,6],
[1,2,3,4,5,6],
[1,2,3,4,5,6],
[1,2,3,4,5,6],
[1,2,3,4,5,6],
[1,2,3,4,5,6]
], 2) == [{0, 1}, {1, 1}, {2, 1}, {3, 1}, {4, 1}, {5, 1}]
assert HumanEval87.get_row([
[1,2,3,4,5,6],
[1,2,3,4,5,6],
[1,1,3,4,5,6],
[1,2,1,4,5,6],
[1,2,3,1,5,6],
[1,2,3,4,1,6],
[1,2,3,4,5,1]
], 1) == [{0, 0}, {1, 0}, {2, 1}, {2, 0}, {3, 2}, {3, 0}, {4, 3}, {4, 0}, {5, 4}, {5, 0}, {6, 5}, {6, 0}]
assert HumanEval87.get_row([], 1) == []
assert HumanEval87.get_row([[1]], 2) == []
assert HumanEval87.get_row([[], [1], [1, 2, 3]], 3) == [{2, 2}]
"""
end
def test_assertions(%{"task_id" => "HumanEval/95"}) do
"""
assert HumanEval95.check_dict_case(%{"p" => "pineapple", "b" => "banana"}) == true
assert HumanEval95.check_dict_case(%{"p" => "pineapple", "A" => "banana", "B" => "banana"}) == false
assert HumanEval95.check_dict_case(%{"p" => "pineapple", 5 => "banana", "a" => "apple"}) == false
assert HumanEval95.check_dict_case(%{"Name" => "John", "Age" => "36", "City" => "Houston"}) == false
assert HumanEval95.check_dict_case(%{"STATE" => "NC", "ZIP" => "12345"}) == true
assert HumanEval95.check_dict_case(%{"fruit" => "Orange", "taste" => "Sweet"}) == true
"""
end
def test_assertions(%{"task_id" => "HumanEval/111"}) do
"""
# Check some simple cases
assert HumanEval111.histogram("a b b a") == %{"a" => 2,"b" => 2}
assert HumanEval111.histogram("a b c a b") == %{"a" => 2, "b" => 2}
assert HumanEval111.histogram("a b c d g") == %{"a" => 1, "b" => 1, "c" => 1, "d" => 1, "g" => 1}
assert HumanEval111.histogram("r t g") == %{"r" => 1,"t" => 1,"g" => 1}
assert HumanEval111.histogram("b b b b a") == %{"b" => 4}
assert HumanEval111.histogram("r t g") == %{"r" => 1,"t" => 1,"g" => 1}
# Check some edge cases that are easy to work out by hand.
assert HumanEval111.histogram("") == %{}
assert HumanEval111.histogram("a") == %{"a" => 1}
"""
end
def test_assertions(%{"task_id" => "HumanEval/127"}) do
"""
assert HumanEval127.intersection({1, 2}, {2, 3}) == "NO"
assert HumanEval127.intersection({-1, 1}, {0, 4}) == "NO"
assert HumanEval127.intersection({-3, -1}, {-5, 5}) == "YES"
assert HumanEval127.intersection({-2, 2}, {-4, 0}) == "YES"
# Check some edge cases that are easy to work out by hand.
assert HumanEval127.intersection({-11, 2}, {-1, -1}) == "NO"
assert HumanEval127.intersection({1, 2}, {3, 5}) == "NO"
assert HumanEval127.intersection({1, 2}, {1, 2}) == "NO"
assert HumanEval127.intersection({-2, -2}, {-3, -2}) == "NO"
"""
end
def test_assertions(%{"task_id" => "HumanEval/148"}) do
"""
# Check some simple cases
assert HumanEval148.bf("Jupiter", "Neptune") == {"Saturn", "Uranus"}
assert HumanEval148.bf("Earth", "Mercury") == {"Venus"}
assert HumanEval148.bf("Mercury", "Uranus") == {"Venus", "Earth", "Mars", "Jupiter", "Saturn"}
assert HumanEval148.bf("Neptune", "Venus") == {"Earth", "Mars", "Jupiter", "Saturn", "Uranus"}
# Check some edge cases that are easy to work out by hand.
assert HumanEval148.bf("Earth", "Earth") == {}
assert HumanEval148.bf("Mars", "Earth") == {}
assert HumanEval148.bf("Jupiter", "Makemake") == {}
"""
end
def test_assertions(%{"task_id" => "HumanEval/151"}) do
"""
# Check some simple cases
assert HumanEval151.double_the_difference([]) == 0
assert HumanEval151.double_the_difference([5, 4]) == 25
assert HumanEval151.double_the_difference([0.1, 0.2, 0.3]) == 0
assert HumanEval151.double_the_difference([-10, -20, -30]) == 0
# Check some edge cases that are easy to work out by hand.
assert HumanEval151.double_the_difference([-1, -2, 8]) == 0
assert HumanEval151.double_the_difference([0.2, 3, 5]) == 34
lst = -99..100 |> Enum.filter(&(rem(&1, 2)!=0))
odd_sum = lst |> Enum.filter(&(&1 > 0)) |> Enum.map(&(&1**2)) |> Enum.sum()
assert HumanEval151.double_the_difference(lst) == odd_sum
"""
end
def test_assertions(task) do
task["test"]
|> String.split("def check(candidate):")
|> Enum.at(1)
|> String.replace("candidate(", "#{module_name(task)}.#{function_name(task)}(")
|> String.replace("'", "\"")
|> String.replace(" print\n", "")
|> replace_e_notation()
|> replace_tuples()
|> replace_is()
|> fix_asserts()
# |> Enum.filter(&(!String.starts_with?(&1, "\n assert")))
# |> Enum.filter(&(!String.starts_with?(&1, "\n\n # C")))
end
@doc ~s"""
## Examples:
iex> HumanEval.replace_e_notation("bla 1e6 1.2e-3 2.2e2")
"bla 1.0e6 1.2e-3 2.2e2"
"""
def replace_e_notation(str) do
regexp = ~r/( [0-9]+)(e[-]?\d+)/
str |> String.replace(regexp, "\\1.0\\2")
end
@doc ~s"""
## Examples:
iex> HumanEval.replace_tuples("== (1, 2)")
"== {1, 2}"
iex> HumanEval.replace_tuples("== (1.0, 2.2)")
"== {1.0, 2.2}"
iex> HumanEval.replace_tuples("bla tuple([a, b]) bla tuple([c, d])")
"bla ([a, b]) bla ([c, d])"
iex> HumanEval.replace_tuples(~s{assert Bla.blo(9) == (4, 5), "(good)"})
"assert Bla.blo(9) == {4, 5}, \\"(good)\\""
iex> HumanEval.replace_tuples("assert Bla.blo(9) == (4,5)")
"assert Bla.blo(9) == {4,5}"
"""
def replace_tuples(str) do
str
|> String.replace(~r/== \(([^\)]+),([^\)]+)\)/, "== {\\1,\\2}")
|> String.replace(~r/tuple\(/, "(")
end
@doc """
## Examples:
iex> HumanEval.replace_is("assert bla is True")
"assert bla == True"
iex> HumanEval.replace_is("assert bla is False")
"assert bla == False"
"""
def replace_is(str) do
str
|> String.replace("is True", "== True")
|> String.replace("is False", "== False")
end
@doc """
## Examples:
iex> HumanEval.fix_asserts(~s{ assert a == b, "Bananas"})
" assert a == b"
iex> HumanEval.fix_asserts(~s{ assert a == b})
" assert a == b"
iex> HumanEval.fix_asserts(~s{ assert HumanEval77.iscube(1) == true, "First test error: " + str(HumanEval77.iscube(1))})
" assert HumanEval77.iscube(1) == true"
iex> HumanEval.fix_asserts(~s{ assert HumanEval78.hex_key('a') == 1, "First test error: " + str(HumanEval78.hex_key("AB")))})
" assert HumanEval78.hex_key('a') == 1"
iex> HumanEval.fix_asserts(~s{assert bla(blo) == [a, b, c]})
"assert bla(blo) == [a, b, c]"
iex> HumanEval.fix_asserts("assert bla([10]) == {10, 2}")
"assert bla([10]) == {10, 2}"
iex> HumanEval.fix_asserts("assert (bla([1, 2, 3])) == ([1, 2])")
"assert (bla([1, 2, 3])) == ([1, 2])"
iex> HumanEval.fix_asserts("assert (bla([1, 2, 3])) == ([1, 2])")
"assert (bla([1, 2, 3])) == ([1, 2])"
iex> HumanEval.fix_asserts(~s{assert bla(148, 412) == 16, "First test error: " + str(multiply(148, 12))})
"assert bla(148, 412) == 16"
iex> HumanEval.fix_asserts(~s{assert bla("Mary lamb", 4) == ["little"], "bla" <> str(bla("Mary", 4))})
"assert bla(\\"Mary lamb\\", 4) == [\\"little\\"]"
iex> HumanEval.fix_asserts(~s{assert True, "Hello"})
"assert True"
"""
def fix_asserts(str) do
str
|> String.replace(~r/^(\s*assert .+ == [^{(\\[].*), .+$/m, "\\1")
|> String.replace(~r/^(\s*assert .+ == [a-zA-Z0-9]+), .+$/m, "\\1")
|> String.replace(~r/^(\s*assert True), .+$/m, "\\1")
|> String.replace(~r/^(\s*assert .+ == \[.*\]), .+$/m, "\\1")
end
end
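A quick spot check of the conversion helpers on minimal hand-written task maps (invented, not from the dataset):

HumanEval.function_head(%{
  "entry_point" => "change_base",
  "prompt" => "def change_base(x: int, base: int):"
})
# => "def change_base(x, base) do"

HumanEval.function_name(%{"entry_point" => "sortThird"})
# => "sort_third"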
lst = -99..100 |> Enum.filter(&(rem(&1, 2) != 0))
odd_sum = lst |> Enum.filter(&(&1 > 0)) |> Enum.map(&(&1 ** 2)) |> Enum.sum()
elixir_eval_tasks = human_eval_tasks |> Enum.map(&HumanEval.to_elixir_task/1)
Let’s do some sanity checking - all of the generated code should compile once we append two `end`s (one closing the function, one closing the module)
elixir_eval_tasks
|> Enum.each(fn task ->
Code.eval_string(
"""
#{task.code_context_before}
end
end
#{task.tests}
""",
[],
file: "Code.eval_string #{task.id}"
)
end)
ElixirEval.save_tasks!(
elixir_eval_tasks,
Path.join(__DIR__, "files/elixir_benchmarks/human_eval.jsonl")
)
Testing
Lowest level
- Input:
  - EvalTask
  - Sample
    - task_id
    - completion
defmodule ElixirEval.Evaluator do
# TODO store the module name in the task definition
def module_name(task), do: task.id |> String.replace("/", "")
@doc """
Returns {:ok} or {:error, err}
TODO return the following things
Success
- how many assertions passed
- the time it took
Failure
- syntax error and message
- assertion error and message
- any other unexpected error
Opts
- timeout in ms (default 500)
"""
def eval_sample(task, sample_solution, opts \\ []) do
timeout = Keyword.get(opts, :timeout, 500)
eval_task =
Task.async(fn ->
try do
code = """
#{task.code_context_before}#{sample_solution}#{task.code_context_after}
#{task.tests}
# try do
#{ElixirEval.Evaluator.module_name(task)}Test.run_tests()
# rescue
# e ->
# IO.puts("Runtime error " <> inspect(e))
# end
"""
IO.puts(code)
Code.eval_string(code)
{:ok}
rescue
exception ->
{:error, exception}
end
end)
case Task.yield(eval_task, timeout) do
nil ->
IO.puts("TIMEOUT")
Task.shutdown(eval_task, :brutal_kill)
{:error, :timeout}
{:ok, {:error, exception}} ->
{:error, exception}
{:ok, return} ->
return
{:exit, reason} ->
{:error, reason}
end
# rescue
# exception -> {:error, exception}
end
end
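A quick smoke test with a deliberately wrong completion (the body below is made up), so we expect an error tuple rather than `{:ok}` - note the evaluator also prints the assembled program:

task = hd(elixir_eval_tasks)
# The completion is the function body plus the two closing `end`s.
ElixirEval.Evaluator.eval_sample(task, "  nil\nend\nend", timeout: 1_000)
# => {:error, %ExUnit.AssertionError{...}} for most tasks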
elixir_human_eval_tasks =
ElixirEval.load_tasks!(Path.join(__DIR__, "files/elixir_benchmarks/human_eval.jsonl"))
Generate samples with GPT4
defmodule ElixirEval.GPT4 do
@default_opts [
samples_per_task: 1,
num_processes: 1,
prompt: """
Complete the following Elixir program correctly.
Only respond with source code and no other explanation.
Do not repeat the code I gave you, just return the new code needed to complete the function and module.
```elixir
<CODE_CONTEXT>
```
"""
,
model: "gpt-4-1106-preview",
temperature: 0.3,
max_tokens: 256,
top_p: 0.95,
frequency_penalty: 0,
presence_penalty: 0,
stop: []
]
@doc """
Opts
samples_per_task: number of sample solutions to generate per task (default 1)
num_processes: number of concurrent processes to use (default 1)
model: model to use (default gpt-4-1106-preview)
temperature: temperature (default 0.3)
max_tokens: the maximum number of tokens to return (default 256)
top_p: controls nucleus sampling (default 0.95)
frequency_penalty: repeat penalty (default 0)
presence_penalty: penalty for tokens that already exist in the output (default 0)
stop: list of stop words that end token generation (default [])
"""
def generate_samples(tasks, opts \\ []) do
opts = opts |> Keyword.validate!(@default_opts)
{samples_per_task, opts} = Keyword.pop!(opts, :samples_per_task)
{num_processes, opts} = Keyword.pop!(opts, :num_processes)
{prompt_template, opts} = Keyword.pop!(opts, :prompt)
Stream.flat_map(tasks, fn task ->
for i <- 0..(samples_per_task - 1), do: {task, i}
end)
|> Task.async_stream(
fn {task, i} ->
IO.puts("Task #{task.id}, sample #{i}")
{task.id, i, completion(build_prompt(prompt_template, task), opts)}
end,
max_concurrency: num_processes,
timeout: :infinity
)
|> Stream.map(fn {:ok, {task_id, i, completion}} ->
%ElixirEval.Sample{
task_id: task_id,
sample_id: i,
completion: completion
}
end)
|> Enum.to_list()
# requests = for i <- 1..samples_per_task,
# task <- tasks do
# {task, i, build_prompt(prompt_template, task)}
# end
# end
# IO.inspect(requests)
end
def openai do
System.fetch_env!("LB_OPENAI_API_KEY") |> OpenaiEx.new()
end
def completion(prompt, opts) do
chat_req =
OpenaiEx.ChatCompletion.new(
Keyword.put(opts, :messages, [OpenaiEx.ChatMessage.user(prompt)])
)
# IO.inspect(chat_req)
chat_response = openai() |> OpenaiEx.ChatCompletion.create(chat_req)
chat_response["choices"] |> hd |> Map.get("message") |> Map.get("content") |> process_output()
end
def build_prompt(prompt_template, task) do
String.replace(prompt_template, ""
, task.code_context_before)
end
@doc """
Trims triple backticks
Examples:
iex> ElixirEval.GPT4.process_output("abc")
"abc"
iex> ElixirEval.GPT4.process_output("```elixir\\n```")
""
iex> ElixirEval.GPT4.process_output("```elixir\\nbla\\n```")
"bla"
"""
def process_output(str) do
str |> String.split("\n") |> Enum.reject(&String.starts_with?(&1, "```")) |> Enum.join("\n")
end
end
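A quick check of the prompt assembly - the toy template and context below are invented:

ElixirEval.GPT4.build_prompt(
  "Complete this:\n```elixir\n<CODE_CONTEXT>\n```",
  %ElixirEval.Task{code_context_before: "defmodule Toy0 do\n  def add(a, b) do"}
)
# => "Complete this:\n```elixir\ndefmodule Toy0 do\n  def add(a, b) do\n```"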
# prompt = """
# Translate this python code into Elixir.
# Place the Elixir function #{task["entry_point"]} in a module called Test01.
# Don't change the function signature.
# No need to use typespecs.
# Only respond with the Elixir code.
# Only use functions that actually exist in Elixir.
# Don't just assume something that exists in python will exist in Elixir.
# Important: Convert the python docstrings (the ones with the triple-quotes) into Elixir
# Ignore the python imports.
# Here is an example of a successful conversion:
# Python prompt
# chat_req =
# ChatCompletion.new(
# model: "gpt-4-1106-preview",
# messages: [
# ChatMessage.user(prompt)
# ]
# )
# chat_response = openai |> ChatCompletion.create(chat_req)
# Kino.Shorts.markdown("""
# #{chat_response["choices"] |> hd |> Map.get("message") |> Map.get("content")}
# """)
tasks = elixir_eval_tasks
solution_samples =
ElixirEval.GPT4.generate_samples(tasks,
samples_per_task: 1,
num_processes: 30,
frequency_penalty: 1.1
)
ElixirEval.save_samples!(
solution_samples,
Path.join(__DIR__, "files/solution_samples/gpt4_default.jsonl")
)
samples =
ElixirEval.load_samples!(Path.join(__DIR__, "files/solution_samples/gpt4_default.jsonl"))
samples
|> Enum.map(fn sample ->
task = tasks |> Enum.find(&(&1.id == sample.task_id))
case ElixirEval.Evaluator.eval_sample(task, sample.completion) do
{:ok} -> :ok
{status, _} -> status
end
end)
|> Enum.frequencies()
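With one sample per task, pass@1 reduces to the overall pass fraction. A sketch - note it re-runs every evaluation, so binding the results of the cell above would be cheaper:

passed =
  Enum.count(samples, fn sample ->
    task = Enum.find(tasks, &(&1.id == sample.task_id))
    match?({:ok}, ElixirEval.Evaluator.eval_sample(task, sample.completion))
  end)

passed / length(samples)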
solution_samples
|> Enum.map(fn sample ->
IO.puts(sample.completion)
"""
### Task #{sample.task_id}, sample #{sample.sample_id}
```elixir
#{sample.completion}
```
"""
end)
|> Enum.join()
|> Kino.Shorts.markdown()