
TogetherAI training

generate_training_run_on_thogether_ai.livemd

Mix.install([
  :req,
  :jason
])

Create dataset for Ruby

Fetch the Advent of Code dataset from Hugging Face

dataset_url = "https://huggingface.co/datasets/isavita/advent-of-code/resolve/main/train.json"
dataset_orig = Req.get!(dataset_url, receive_timeout: 600_000).body

# keep only the Ruby solutions; Go is used later as the reference language
# because it is the only language in which every challenge is solved
dataset = Enum.filter(dataset_orig, fn challenge -> challenge["solution_lang"] == "ruby" end)
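
A quick sanity check (not in the original notebook) of the language coverage claim — it tallies how many challenges each solution_lang has, assuming every entry carries the "solution_lang" key:

dataset_orig
|> Enum.frequencies_by(& &1["solution_lang"])
|> Enum.sort_by(fn {_lang, count} -> -count end)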

Format the dataset according to the prompt formats from the TogetherAI fine-tuning models list, for teknium/OpenHermes-2p5-Mistral-7B and togethercomputer/CodeLlama-7b-Python

Install the together CLI
pip install together
Set the TOGETHER_API_KEY environment variable
export TOGETHER_API_KEY=...
Find the prompt formatting for the model
together models info "teknium/OpenHermes-2p5-Mistral-7B"
For teknium/OpenHermes-2p5-Mistral-7B the format is
{
  'stop': ['<|im_end|>', '<|im_start|>'],
  'prompt_format': '<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n',
  'add_generation_prompt': True,
  'chat_template_name': 'default'
}
For togethercomputer/CodeLlama-7b-Python the format is
TEMPLATE """[INST] <<SYS>>{{ .System }}<</SYS>>

{{ .Prompt }} [/INST]
"""
PARAMETER rope_frequency_base 1e+06
PARAMETER stop "[INST]"
PARAMETER stop "[/INST]"
PARAMETER stop "<<SYS>>"
PARAMETER stop "<</SYS>>"

Define formatter function

formatter_openhermes_fn = fn challenge ->
  ~s"""
  <|im_start|>user
  You are an expert programmer who writes simple, efficient and concise code.
  Write a #{challenge["solution_lang"]} program that reads its input from a file named "input.txt" and solves the following task.
  The program should print the answer when executed.

  Coding challenge:
  #{challenge["task"]}
  <|im_end|>
  <|im_start|>assistant
  ```#{challenge["solution_lang"]}
  #{challenge["solution"]}
  ```
  <|im_end|>
  """
  |> String.trim()
end

formatter_with_go_solution_openhermes_fn = fn challenge, go_solution ->
  ~s"""
  <|im_start|>user
  You are an expert programmer that writes simple, concise code without comments or explanations.
  Given this Golang program, write a #{challenge["solution_lang"]} version that reads its input from a file named "input.txt" and solves the same task.
  The program should print the answer when executed.
  The program MUST READ its input from a file named "input.txt".

  Golang Solution:
  ```go
  #{go_solution}
  ```
  <|im_end|>
  <|im_start|>assistant
  ```#{challenge["solution_lang"]}
  #{challenge["solution"]}
  ```
  <|im_end|>
  """
  |> String.trim()
end
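
To eyeball a formatted sample before uploading anything, you can print the first challenge through the formatter (a quick preview, not in the original notebook):

dataset |> hd() |> formatter_openhermes_fn.() |> IO.puts()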

formatter_codellama_fn = fn challenge ->
  ~s"""
  [INST]
  <<SYS>>
  You are an expert programmer who writes simple, efficient and concise code.
  Write a #{challenge["solution_lang"]} program that reads its input from a file named "input.txt" and solves the following task.
  The program should print the answer when executed.
  <</SYS>>
  Coding challenge: 
  #{challenge["task"]}
  [/INST]
  ```#{challenge["solution_lang"]}
  #{challenge["solution"]}
  ```
  """
  |> String.trim()
end

formatter_with_go_solution_codellama_fn = fn challenge, go_solution ->
  ~s"""
  [INST]
  <<SYS>>
  You are an expert programmer that writes simple, concise code without comments or explanations.
  Given this Golang program, write a #{challenge["solution_lang"]} version that reads its input from a file named "input.txt" and solves the same task.
  The program should print the answer when executed.
  The program MUST READ its input from a file named "input.txt".
  <</SYS>>
  Golang Solution:
  ```go
  #{go_solution}
  ```
  [/INST]
  ```#{challenge["solution_lang"]}
  #{challenge["solution"]}
  ```
  """
  |> String.trim()
end

Format the dataset

data1 =
  Enum.map(dataset, fn challenge ->
    %{"text" => formatter_codellama_fn.(challenge)}
  end)

go_solutions_lookup =
  for day <- dataset_orig, day["solution_lang"] == "go", into: %{} do
    {day["name"], day["solution"]}
  end
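
An optional check (not in the original notebook) that every Ruby challenge has a Go counterpart, so formatter_with_go_solution_codellama_fn never receives nil — the result should be an empty list:

dataset
|> Enum.reject(&Map.has_key?(go_solutions_lookup, &1["name"]))
|> Enum.map(& &1["name"])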

data2 =
  dataset
  |> Enum.filter(&(&1["solution_lang"] != "go"))
  |> Enum.map(fn challenge ->
    %{
      "text" =>
        formatter_with_go_solution_codellama_fn.(
          challenge,
          go_solutions_lookup[challenge["name"]]
        )
    }
  end)

data = data1 ++ data2
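
Optionally (not in the original notebook), inspect sample sizes before uploading; overly long samples may be truncated to the model's context window during fine-tuning, and character counts are only a rough proxy for token counts:

lengths = Enum.map(data, &String.length(&1["text"]))

%{
  samples: length(lengths),
  max_chars: Enum.max(lengths),
  avg_chars: div(Enum.sum(lengths), length(lengths))
}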

Save the data in a .jsonl file

output_path =
  "#{File.cwd!()}/code/advent_generated/training_data/togetherai_codellama_ruby_#{Enum.count(data)}_#{Date.utc_today()}.jsonl"

file = File.open!(output_path, [:write, :create])

:ok =
  Enum.each(data, fn raw ->
    json = Jason.encode!(raw)
    IO.binwrite(file, json <> "\n")
  end)

# close the file handle once all samples are written
File.close(file)

output_path
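
A read-back validation (not in the original notebook) that every line of the file is decodable JSON before handing it to the CLI — Jason.decode!/1 tolerates the trailing newline:

output_path
|> File.stream!()
|> Stream.map(&Jason.decode!/1)
|> Enum.count()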

Check the data

together files check 

Upload the data

together files upload 

Get the file_id from the previous command

openhermes_file_id = "file-57ad61ef-ea44-41cc-b833-256a2d78b265"
codellama_file_id = "file-c9f34dd5-7706-4e70-bdf5-0d9cf332ca79"

together finetune create options

options:
  -h, --help            show this help message and exit
  --training-file FILE-ID, -t FILE-ID
                        File-ID of an uploaded file that contains training data.
  --estimate-price, -e  Estimate the price of the fine tune job
  --model MODEL, -m MODEL
                        The name of the base model to fine-tune. Default='togethercomputer/RedPajama-INCITE-7B-Chat'.
  --n-epochs EPOCHS, -ne EPOCHS
                        The number of epochs to train the model for. Default=4
  --n-checkpoints CHECKPOINTS, -c CHECKPOINTS
                        The number of checkpoints to save during training. Default=1 (a checkpoint is always saved on the last
                        epoch for the trained model).
  --batch-size BATCH_SIZE, -b BATCH_SIZE
                        The batch size to use for training. Default=32
  --learning-rate LEARNING_RATE, -lr LEARNING_RATE
                        The learning rate multiplier to use for training. Default=0.00001
  --suffix SUFFIX, -s SUFFIX
                        Up to 40 characters that will be added to your fine-tuned model name.
  --wandb-api-key WANDB_API_KEY, -wb WANDB_API_KEY
                        Wandb API key to report metrics to wandb.ai. If not set WANDB_API_KEY environment variable is used.
  --no-wandb-api-key, -nwb
                        Do not report metrics to wandb.ai.
  --quiet               Indicates whether to disable checking
Create a fine-tune job with all parameters for teknium/OpenHermes-2p5-Mistral-7B
together finetune create --model "teknium/OpenHermes-2p5-Mistral-7B" \
--training-file  \
--batch-size 4 \
--n-epochs 3 \
--learning-rate 0.00001 \
--suffix Ruby \
--wandb-api-key 
Create a fine-tune job with all parameters for togethercomputer/CodeLlama-7b-Python
together finetune create --model "togethercomputer/CodeLlama-7b-Python" \
--training-file  \
--batch-size 4 \
--n-epochs 3 \
--learning-rate 0.00001 \
--suffix Ruby \
--wandb-api-key 
openhermes_job_id = "ft-b23765c7-7875-4792-a482-8c37d135eb50"
llamacode_job_id = "ft-98e2d8c4-f85a-47e9-98b8-16acc9ed0fdb"
together finetune status
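
As an alternative to the CLI, the job status can also be polled from the notebook over Together's REST API. This is a sketch, assuming a GET /v1/fine-tunes/{job_id} endpoint and the TOGETHER_API_KEY environment variable; check the current API docs before relying on it:

# assumed endpoint: GET https://api.together.xyz/v1/fine-tunes/{job_id}
check_status = fn job_id ->
  Req.get!(
    "https://api.together.xyz/v1/fine-tunes/#{job_id}",
    auth: {:bearer, System.fetch_env!("TOGETHER_API_KEY")}
  ).body["status"]
end

check_status.(llamacode_job_id)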