TogetherAI training
Mix.install([
  :req,
  :jason
])
Create dataset for Ruby
Fetch the Advent of Code dataset from Hugging Face
dataset_url = "https://huggingface.co/datasets/isavita/advent-of-code/resolve/main/train.json"
dataset_orig = Req.get!(dataset_url, receive_timeout: 600_000).body
# keep only the Ruby solutions; Go (the only language with every challenge solved) is used later as the reference
dataset = Enum.filter(dataset_orig, fn challenge -> challenge["solution_lang"] == "ruby" end)
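As an optional sanity check, counting solutions per language confirms the filter and the claim about Go's coverage; this small sketch only relies on the "solution_lang" field already used above.
# Optional: how many solutions does each language have?
dataset_orig
|> Enum.frequencies_by(& &1["solution_lang"])
|> Enum.sort_by(&elem(&1, 1), :desc)

# How many Ruby solutions did we keep?
Enum.count(dataset)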
Format the dataset according to the prompt formats from TogetherAI's fine-tuning models list, for teknium/OpenHermes-2p5-Mistral-7B and togethercomputer/CodeLlama-7b-Python.
Install the together CLI
pip install together
Set the TOGETHER_API_KEY environment variable
export TOGETHER_API_KEY=...
Find the prompt format for the model
together models info "teknium/OpenHermes-2p5-Mistral-7B"
For teknium/OpenHermes-2p5-Mistral-7B
the format is
{
  'stop': ['<|im_end|>', '<|im_start|>'],
  'prompt_format': '<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n',
  'add_generation_prompt': True,
  'chat_template_name': 'default'
}
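To see how this template maps to a concrete string, here is a tiny illustrative substitution (the prompt text is made up):
prompt_format = "<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
IO.puts(String.replace(prompt_format, "{prompt}", "Write a Ruby program..."))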
For togethercomputer/CodeLlama-7b-Python
the format is
TEMPLATE """[INST] <<SYS>>{{ .System }}<</SYS>>
{{ .Prompt }} [/INST]
"""
PARAMETER rope_frequency_base 1e+06
PARAMETER stop "[INST]"
PARAMETER stop "[/INST]"
PARAMETER stop "<<SYS>>"
PARAMETER stop "<</SYS>>"
Define the formatter functions
formatter_openhermes_fn = fn challenge ->
~s"""
<|im_start|>user
You are an expert programmer who writes simple, efficient and concise code.
Write a #{challenge["solution_lang"]} program that reads its input from a file named "input.txt" and solves the following task.
The program should print the answer when executed.
Coding challenge:
#{challenge["task"]}
<|im_end|>
<|im_start|>assistant
```#{challenge["solution_lang"]}
#{challenge["solution"]}
```
<|im_end|>
"""
|> String.trim()
end
formatter_with_go_solution_openhermes_fn = fn challenge, go_solution ->
~s"""
<|im_start|>user
You are an expert programmer that writes simple, concise code without comments or explanations.
Given this Golang program, write a #{challenge["solution_lang"]} version that reads its input from a file named "input.txt" and solves the same task.
The program should print the answer when executed.
The program MUST READ its input from a file named "input.txt".
Golang Solution:
```go
#{go_solution}
```
<|im_end|>
<|im_start|>assistant
```#{challenge["solution_lang"]}
#{challenge["solution"]}
```
<|im_end|>
"""
|> String.trim()
end
formatter_codellama_fn = fn challenge ->
~s"""
[INST]
<<SYS>>
You are an expert programmer who writes simple, efficient and concise code.
Write a #{challenge["solution_lang"]} program that reads its input from a file named "input.txt" and solves the following task.
The program should print the answer when executed.
<</SYS>>
Coding challenge:
#{challenge["task"]}
[/INST]
```#{challenge["solution_lang"]}
#{challenge["solution"]}
```
"""
|> String.trim()
end
formatter_with_go_solution_codellama_fn = fn challenge, go_solution ->
~s"""
[INST]
<<SYS>>
You are an expert programmer that writes simple, concise code without comments or explanations.
Given this Golang program, write a #{challenge["solution_lang"]} version that reads its input from a file named "input.txt" and solves the same task.
The program should print the answer when executed.
The program MUST READ its input from a file named "input.txt".
<</SYS>>
Golang Solution:
```go
#{go_solution}
```
[/INST]
```#{challenge["solution_lang"]}
#{challenge["solution"]}
```
"""
|> String.trim()
end
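Before formatting the whole dataset, it can help to render one prompt and eyeball it; the challenge map below is made-up illustrative data, not a real dataset entry.
# Illustrative smoke test; this challenge map is invented, not from the dataset
sample_challenge = %{
  "name" => "day1_part1_2015",
  "task" => "Santa needs to find the right floor...",
  "solution_lang" => "ruby",
  "solution" => ~s|input = File.read("input.txt"); puts input.count("(") - input.count(")")|
}

IO.puts(formatter_openhermes_fn.(sample_challenge))
IO.puts(formatter_codellama_fn.(sample_challenge))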
Format the dataset
data1 =
  Enum.map(dataset, fn challenge ->
    %{"text" => formatter_codellama_fn.(challenge)}
  end)
go_solutions_lookup =
  for day <- dataset_orig, day["solution_lang"] == "go", into: %{} do
    {day["name"], day["solution"]}
  end
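A missing lookup would silently interpolate as an empty string in the prompt below, so a quick guard that every Ruby challenge has a Go counterpart may be worthwhile; this sketch only assumes the "name" field used above.
# Guard: list Ruby challenges with no matching Go solution
dataset
|> Enum.reject(&Map.has_key?(go_solutions_lookup, &1["name"]))
|> Enum.map(& &1["name"])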
data2 =
  dataset
  # defensive no-op here: the dataset already contains only Ruby entries
  |> Enum.filter(&(&1["solution_lang"] != "go"))
  |> Enum.map(fn challenge ->
    %{
      "text" =>
        formatter_with_go_solution_codellama_fn.(
          challenge,
          go_solutions_lookup[challenge["name"]]
        )
    }
  end)
data = data1 ++ data2
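A rough size check before writing the file; this counts characters only (token counts would need a tokenizer).
# Rough corpus size: number of examples and total characters
{Enum.count(data), data |> Enum.map(&String.length(&1["text"])) |> Enum.sum()}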
Save the data in a .jsonl file
output_path =
"#{File.cwd!()}/code/advent_generated/training_data/togetherai_codellama_ruby_#{Enum.count(data)}_#{Date.utc_today()}.jsonl"
# :write creates the file if missing and truncates any existing content
file = File.open!(output_path, [:write])

:ok =
  Enum.each(data, fn raw ->
    json = Jason.encode!(raw)
    IO.binwrite(file, json <> "\n")
  end)

File.close(file)
output_path
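An optional check that the file is valid JSONL: decode the first line back with Jason.
# Optional: the first line should round-trip through Jason
output_path
|> File.stream!()
|> Enum.take(1)
|> hd()
|> Jason.decode!()
|> Map.keys()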
Check the data
together files check
Upload the data
together files upload
Get the file IDs from the output of the previous command
openhermes_file_id = "file-57ad61ef-ea44-41cc-b833-256a2d78b265"
codellama_file_id = "file-c9f34dd5-7706-4e70-bdf5-0d9cf332ca79"
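The upload can also be verified without leaving Elixir; note that the https://api.together.xyz/v1/files endpoint and the bearer-token auth used here are assumptions based on Together's API docs, so verify them before relying on this.
# Assumed endpoint (check Together's API docs): GET https://api.together.xyz/v1/files
api_key = System.fetch_env!("TOGETHER_API_KEY")

Req.get!("https://api.together.xyz/v1/files", auth: {:bearer, api_key}).body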
together finetune create options
options:
-h, --help show this help message and exit
--training-file FILE-ID, -t FILE-ID
File-ID of an uploaded file that contains training data.
--estimate-price, -e Estimate the price of the fine tune job
--model MODEL, -m MODEL
The name of the base model to fine-tune. Default='togethercomputer/RedPajama-INCITE-7B-Chat'.
--n-epochs EPOCHS, -ne EPOCHS
The number of epochs to train the model for. Default=4
--n-checkpoints CHECKPOINTS, -c CHECKPOINTS
The number of checkpoints to save during training. Default=1 (a checkpoint is always saved on the last
epoch for the trained model).
--batch-size BATCH_SIZE, -b BATCH_SIZE
The batch size to use for training. Default=32
--learning-rate LEARNING_RATE, -lr LEARNING_RATE
The learning rate multiplier to use for training. Default=0.00001
--suffix SUFFIX, -s SUFFIX
Up to 40 characters that will be added to your fine-tuned model name.
--wandb-api-key WANDB_API_KEY, -wb WANDB_API_KEY
Wandb API key to report metrics to wandb.ai. If not set WANDB_API_KEY environment variable is used.
--no-wandb-api-key, -nwb
Do not report metrics to wandb.ai.
--quiet Indicates whether to disable checking
Create a fine-tune job with all parameters for teknium/OpenHermes-2p5-Mistral-7B
together finetune create --model "teknium/OpenHermes-2p5-Mistral-7B" \
--training-file \
--batch-size 4 \
--n-epochs 3 \
--learning-rate 0.00001 \
--suffix Ruby \
--wandb-api-key
Create a fine-tune job with all parameters for togethercomputer/CodeLlama-7b-Python
together finetune create --model "togethercomputer/CodeLlama-7b-Python" \
--training-file \
--batch-size 4 \
--n-epochs 3 \
--learning-rate 0.00001 \
--suffix Ruby \
--wandb-api-key
openhermes_job_id = "ft-b23765c7-7875-4792-a482-8c37d135eb50"
llamacode_job_id = "ft-98e2d8c4-f85a-47e9-98b8-16acc9ed0fdb"
Check the job status
together finetune status
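The same check can be done from Elixir; as above, the /v1/fine-tunes/{id} endpoint is an assumption to verify against Together's current API docs.
# Assumed endpoint (check the docs): GET https://api.together.xyz/v1/fine-tunes/{id}
api_key = System.fetch_env!("TOGETHER_API_KEY")

Req.get!("https://api.together.xyz/v1/fine-tunes/#{openhermes_job_id}",
  auth: {:bearer, api_key}
).body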