Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us

Scraper Lite

docs/scraper.livemd

Scraper Lite

# Mix.install([
#   {:poison, "~> 5.0"},
#   {:floki, "~> 0.35.2"},
#   {:req, "~> 0.4.8"},
#   {:youtube_captions, "~> 0.1.0"}
# ])

Root

# defmodule R do
#   def recompile() do
#     Mix.Task.reenable("app.start")
#     Mix.Task.reenable("compile")
#     Mix.Task.reenable("compile.all")
#     compilers = Mix.compilers()
#     Enum.each(compilers, &Mix.Task.reenable("compile.#{&1}"))
#     Mix.Task.run("compile.all")
#   end
# end

# NOTE(review): the only definition of R.recompile/0 in this notebook is the
# commented-out module above; this call presumably relies on that module
# having been defined in the session — confirm before running.
R.recompile()
# Create a source record titled "rama"; the match asserts an {:ok, _} result.
{:ok, src} = Vyasa.Written.create_source(%{title: "rama"})
# Vyasa.Medium.Writer.init(%)

Valmiki Ramayana

# Address of the page to scrape, split into site root, chapter directory and
# document name.
url = "https://www.valmikiramayan.net/utf8/"
path = "baala/sarga1/"
doc = "balasans1.htm"

# Fetch the document from the root frame, parse it, and keep the third element
# of the first "body" node found (its children, given the
# {tag, attrs, children} tuples matched elsewhere in this notebook).
col =
  :get
  |> Finch.build(url <> path <> doc)
  |> Finch.request!(Vyasa.Finch)
  |> Map.get(:body)
  |> Floki.parse_document!()
  |> Floki.find("body")
  |> Enum.at(0)
  |> elem(2)
defmodule Rama do
  @moduledoc """
  Reducer that folds the nodes of a scraped chapter page into a list of verse maps.

  The accumulator is a list of string-keyed maps with the most recent verse at
  the head. Each map carries a `"count"`, an `"audio"` source and, once the
  corresponding paragraphs have been seen, a `"verse"` plus one entry per
  paragraph class encountered.
  """

  @doc """
  Folds one parsed HTML node (`data`) into the verse accumulator (`acc`).

  Clauses are tried in order:

    * a `SanSloka` paragraph whose first child is an `audio` element starts a
      new verse (count is the previous count + 1, or 1 for an empty accumulator);
    * any other `SanSloka` paragraph stores its trimmed text as `"verse"` on
      the current verse;
    * a `verloc` paragraph whose first child is a `SanSloka` paragraph has its
      children folded recursively; any other `verloc` paragraph is ignored;
    * any other single-class paragraph stores its trimmed text under the class
      name on the current verse;
    * everything else leaves the accumulator untouched.

  Raises `MatchError` when an audio paragraph does not contain exactly one
  `source` `src` attribute.
  """
  @spec parse(term(), [map()]) :: [map()]
  def parse(data, acc)

  # audio n+1: open a fresh entry so fields of the previous verse (its text,
  # per-class paragraphs) cannot leak into the new one.
  def parse(
        {"p", [{"class", "SanSloka"}], [{"audio", _, _} | _] = audio},
        [%{"count" => c} | _] = acc
      ) do
    [%{"count" => c + 1, "audio" => audio_src(audio)} | acc]
  end

  # first audio on the page
  def parse({"p", [{"class", "SanSloka"}], [{"audio", _, _} | _] = audio}, []) do
    [%{"count" => 1, "audio" => audio_src(audio)}]
  end

  # verse n + 1
  def parse({"p", [{"class", "SanSloka"}], verses}, [curr | rest]) do
    [Map.put(curr, "verse", trimmed_text(verses)) | rest]
  end

  # nesting in verloc: the sloka paragraphs live inside the verloc paragraph,
  # so fold the children through the same reducer.
  def parse(
        {"p", [{"class", "verloc"}], [{"p", [{"class", "SanSloka"}], _} | _] = children},
        [curr | _] = acc
      )
      when is_map(curr) do
    Enum.reduce(children, acc, &parse/2)
  end

  # a verloc paragraph without nested slokas contributes nothing
  def parse({"p", [{"class", "verloc"}], _}, [curr | _] = acc) when is_map(curr), do: acc

  # n case before verse break: keep the paragraph text under its class name
  def parse({"p", [{"class", class}], _} = node, [curr | rest]) when is_map(curr) do
    [Map.put(curr, class, trimmed_text(node)) | rest]
  end

  def parse(_other, acc), do: acc

  # Extracts the single `src` attribute of the `source` element under `audio`.
  defp audio_src(audio) do
    [src] =
      audio
      |> Floki.find("source")
      |> Floki.attribute("src")

    src
  end

  # Text content of an HTML (sub)tree with surrounding whitespace removed.
  defp trimmed_text(tree), do: tree |> Floki.text() |> String.trim()
end

# Fold every node in `col` through Rama.parse/2, starting from an empty verse
# list. The capture operator had been HTML-escaped (`&amp;`), which is not
# valid Elixir.
output = Enum.reduce(col, [], &Rama.parse/2)

# # formatting & tying loose ends
# clean_verses =
#   [Map.put(curr, "count", count + 1) | verses]
#   |> Enum.reverse()
# Make sure the local directory for this chapter (media/<path>) exists before
# any audio file is written into it.
File.mkdir_p!(Path.expand(path, "media"))

# For each matching verse: download its audio, store it as media/<path>/<count>.mp3,
# read the MP3 duration and build a Vyasa.Medium.Event quoting the verse text.
output
|> Enum.reduce([], fn
  # NOTE(review): the literal 12 restricts this to the verse whose count is 12;
  # looks like a leftover debugging filter — confirm before widening.
  %{"audio" => aud, "count" => 12 = count, "verse" => verse}, acc ->
    # Resolve the (possibly "./"-prefixed) audio reference against the page
    # URL with URI semantics rather than filesystem path joining plus a regex,
    # which would also mangle "../" references.
    aud_url = (url <> path) |> URI.merge(aud) |> URI.to_string()

    aud_bin =
      :get
      |> Finch.build(aud_url)
      |> Finch.request!(Vyasa.Finch)
      |> Map.get(:body)

    m_path = Path.expand(path <> "/#{count}.mp3", "media")
    File.write!(m_path, aud_bin)

    # Only the duration is needed from the parsed MP3 metadata.
    {:ok, %Vyasa.Parser.MP3{duration: d}} = Vyasa.Parser.MP3.parse(m_path)

    [
      %Vyasa.Medium.Event{
        origin: 0,
        duration: d,
        fragments: [%{status: "firstpass", quote: verse}]
      }
      | acc
    ]

  # Entries missing any of the keys above (or with another count) are skipped.
  _, acc ->
    acc
end)
# Audio reference of the verse whose count is 1.
aud =
  output
  |> Enum.find(&(Map.get(&1, "count") == 1))
  |> Map.get("audio")

# Resolve the reference against the page URL with URI semantics (handles "./"
# and "../" prefixes) and download the file.
aud_bin =
  :get
  |> Finch.build((url <> path) |> URI.merge(aud) |> URI.to_string())
  |> Finch.request!(Vyasa.Finch)
  |> Map.get(:body)

# Store the download under media/<path>/<aud>. The previous code wrote to the
# chapter directory itself and matched the result of File.write!/2 against
# {:ok, file}; File.write!/2 returns :ok, so that match could never succeed.
m_path = Path.expand(Path.join(path, aud), "media")
File.mkdir_p!(Path.dirname(m_path))
File.write!(m_path, aud_bin)

Shlokam

# Site root and page slug of the hymn to scrape.
url = "https://shlokam.org/"
path = "hanumanchalisa"

# Download the page, parse it and collect every node matching the
# ".uncode_text_column" selector.
col =
  :get
  |> Finch.build(url <> path)
  |> Finch.request!(Vyasa.Finch)
  |> Map.get(:body)
  |> Floki.parse_document!()
  |> Floki.find(".uncode_text_column")
# class_key = %{
#   "verse_meaning" => "en_translation",
# "verse_sanskrit" => "text",
# "verse_trans" => "transliteration",
# }
# Fold the selected column nodes into a map with :title, :description and
# :verses. Clause order matters: the "Description" block is matched before the
# generic single-h3 title block.
output =
  col
  |> Enum.reduce(%{title: nil, description: nil, verses: []}, fn
    # A div whose first child is an attribute-less h3 reading "Description":
    # the text of the remaining children becomes the description.
    {"div", _, [{"h3", [], ["Description"]} | para]}, acc ->
      # IO.inspect(rem, label: "div")
      desc =
        para
        |> Floki.text()

      %{acc | description: desc}

    # A div containing exactly one h3 child: its text becomes the title.
    {"div", _, [{"h3", _, _} = h3_tree]}, acc ->
      title =
        h3_tree
        |> Floki.text()

      %{acc | title: title}

    # A div whose first child is a "verse_sanskrit" div: its children are a flat
    # run of classed divs separated by "verse_separator" hr elements.
    {"div", _, [{"div", [{"class", "verse_sanskrit"}], _verse} | _] = verse_tree}, acc ->
      # The inner fold builds verse maps newest-first. The match below requires
      # at least two entries, the second already carrying a "count"; otherwise
      # it raises MatchError.
      [curr | [%{"count" => count} | _] = verses] =
        Enum.reduce(verse_tree, [], fn
          # n case verse break: number the verse being built after the previous one
          {"hr", [{"class", "verse_separator"}], []}, [curr | [%{"count" => c} | _] = acc] ->
            [Map.put(curr, "count", c + 1) | acc]

          # init verse break: the first separator numbers the first verse as 1
          {"hr", [{"class", "verse_separator"}], []}, [curr | acc] ->
            [Map.put(curr, "count", 1) | acc]

          # n case after verse break: the head is already numbered, so this div
          # starts a new verse map keyed by its class
          {"div", [{"class", class}], _} = c_tree, [%{"count" => _} | _] = acc ->
            [%{class => c_tree |> Floki.text()} | acc]

          # n case before verse break: add this div's text to the verse in progress
          {"div", [{"class", class}], _} = c_tree, [curr | acc] when is_map(curr) ->
            [Map.put(curr, class, c_tree |> Floki.text()) | acc]

          # init: very first div of the run
          {"div", [{"class", class}], _} = c_tree, [] ->
            [%{class => c_tree |> Floki.text()}]

          # Anything else is printed for inspection and otherwise ignored.
          others, acc ->
            IO.inspect(others)
            acc
        end)

      # formatting & tying loose ends: the head entry is given the next count
      # here (no separator clause numbered it), then the list is put back into
      # document order.
      clean_verses =
        [Map.put(curr, "count", count + 1) | verses]
        |> Enum.reverse()

      # Replaces (does not append to) any verses collected from an earlier div.
      %{acc | verses: clean_verses}

    # Every other node leaves the result untouched.
    _, acc ->
      acc
  end)
# Serialise the scraped result to JSON.
filename = "chalisa_scraped.json"
contents = output |> Poison.encode!()

# Destination on disk (despite its name, `url` holds a file path here).
url = "./Projects/vyasa/scripts/#{filename}"

# Write the file first, then print where it went.
File.write!(url, contents)
url |> IO.puts()
# NOTE(review): `html` is not bound anywhere in the visible notebook; this
# looks like a scratch snippet — confirm where the tree is meant to come from.
# The callback swaps every "div" tuple for an attribute-less div containing the
# text "Modified Div" and returns all other nodes unchanged.
Floki.traverse_and_update(html, fn
  {"div", _, _} = node -> {"div", [], ["Modified Div"]}
  node -> node
end)

# NOTE(review): find_element/2 and inner_text/1 are neither defined nor
# imported in the visible notebook — presumably browser-automation helpers;
# confirm the dependency before running.
text = find_element(:class, "verse_sanskrit") |> inner_text()
transliteration = find_element(:class, "verse_trans") |> inner_text()
en_translation = find_element(:class, "verse_meaning") |> inner_text()

# NOTE(review): `count` is not bound at the top level of the visible notebook.
map = %{
  verse_number: count,
  text: text,
  transliteration: transliteration,
  en_translation: en_translation
}

Gita Events

# Load the "Gita" source and the verses returned for chapter argument 1.
gita = Vyasa.Written.get_source_by_title("Gita")
verses = Vyasa.Written.get_verses_in_chapter(1, gita.id)

# Verse number => verse id; entries without both :id and :no are skipped.
verse_lookup = for %{id: id, no: verse_no} <- verses, into: %{}, do: {verse_no, id}

# Recording to attach the events to.
c1_path = Path.expand("./1.mp3", "media/gita")

# Read the recording's metadata; the total duration feeds the voice record.
{:ok, %Vyasa.Parser.MP3{duration: tot_d, path: p}} = Vyasa.Parser.MP3.parse(c1_path)

# Register a voice for the recording against the Gita source.
{:ok, voice} =
  %{lang: "sa", duration: tot_d, file_path: c1_path, source_id: gita.id}
  |> Vyasa.Medium.create_voice()

# Vyasa.Medium.get_voice!("4c73fb6d-4163-4b64-90d0-5d49680c1ee4")
# |> Vyasa.Medium.delete_voice()

# Converts an "MM:SS" timestamp into milliseconds. Raises if the string does
# not consist of exactly two integer fields separated by ":".
to_ms = fn time ->
  [mins, secs] = time |> String.split(":") |> Enum.map(&String.to_integer/1)
  (mins * 60 + secs) * 1000
end

# Timestamp table for the recording: one "<label> :- <MM:SS>" row per line.
# Each row is turned into a Vyasa.Medium.Event whose duration runs until the
# next row's timestamp, and every event is passed to Vyasa.Medium.create_event/1.
"""
start :- 00:00
Shloka 1:-    00:33
Shloka 2 :-   00:49
Shloka 3:-    01:06
Shloka 4:-   01:19
Shloka 5:-   01:32 
Shloka 6:-   01:46
Shloka 7:-   02:00
Shloka 8:-   02:15
Shloka 9:-   02:28
Shloka 10:-  02:42
Shloka 11:-  02:56
Shloka 12:- 03:09
Shloka 13:- 03:22
Shloka 14:- 03:36
Shloka 15:- 03:49
Shloka 16:- 04:02
Shloka 17:- 04:14
Shloka 18:- 04:27
Shloka 19:- 04:40
Shloka 20:- 04:54
Shloka 21:-  05:07
Shloka 22:-  05:23
Shloka 23:-  05:36
Shloka 24:-  05:50
Shloka 25:- 06:05
Shloka 26:- 06:18
Shloka 27:- 06:32
Shloka 28:- 06:46
Shloka 29:-  07:01
Shloka 30:-  07:13
Shloka 31:-  07:26
Shloka 32 :-  07:38
Shloka 33:-   07:52
Shloka 34 :-  08:05
Shloka 35 :-  08:18
Shloka 36 :- 08:31
Shloka 37:-  08:44
Shloka 38 :- 08:57
Shloka 39:-  09:09
Shloka 40:-  09:22
Shloka 41:- 09:35
Shloka 42:- 09:48
Shloka 43:-  10:02
Shloka 44:-  10:16
Shloka 45:-  10:29
Shloka 46:-  10:40
Shloka 47:- 10:53
end:-  11:08
"""
|> String.split("\n")
|> Enum.map(fn x ->
  # Split "<label> :- <time>" and prepend the parts, so each row comes out as
  # [time, label_or_verse_id]. "Shloka <n>" labels are replaced by the verse id
  # looked up for <n> (nil when the number is not in verse_lookup).
  x
  |> String.split(":-")
  |> Enum.map(&String.trim/1)
  |> Enum.reduce([], fn
    <<"Shloka"::utf8, _sep::utf8, verse_no::binary>>, acc ->
      [verse_lookup[String.to_integer(verse_no)] | acc]

    bin, acc ->
      [bin | acc]
  end)
end)
|> IO.inspect(limit: :infinity)
|> Enum.reduce(
  [],
  fn
    # The opening row always starts at 0; its timestamp is not parsed.
    [_time, "start"], acc ->
      [%Vyasa.Medium.Event{origin: 0, phase: "start", voice_id: voice.id} | acc]

    # The closing row ends the previous event and covers the rest of the track.
    [time, "end"], [%{origin: o} = prev | acc] ->
      d = to_ms.(time)

      [
        %Vyasa.Medium.Event{origin: d, duration: tot_d - d, phase: "end", voice_id: voice.id}
        | [%{prev | duration: d - o} | acc]
      ]

    # A verse row ends the previous event and opens one for this verse.
    [time, id], [%{origin: o} = prev | acc] ->
      d = to_ms.(time)

      [
        %Vyasa.Medium.Event{origin: d, verse_id: id, voice_id: voice.id}
        | [%{prev | duration: d - o} | acc]
      ]

    # Blank lines (and any row seen before "start") are skipped.
    _, acc ->
      acc
  end
)
# Events were accumulated newest-first, so they are created in that order.
|> Enum.map(&Vyasa.Medium.create_event/1)