Powered by AppSignal & Oban Pro
Would you like to see your link here? Contact us
Notesclub

Lexin Parser

experiments/parser.livemd

Lexin Parser

Intro

We are trying to implement a parser for Lexin service output.

Mix.install([
  {:tesla, "~> 1.4"},
  {:hackney, "~> 1.17"},
  {:jason, "~> 1.2"},
  {:kino, "~> 0.3"}
])

Client

defmodule LexinClient do
  @moduledoc """
  Minimal Tesla-based HTTP client for the Lexin dictionary service.

  Requests go to `https://lexin.nada.kth.se/lexin`, and responses pass through
  `Tesla.Middleware.JSON`.
  """

  use Tesla

  plug Tesla.Middleware.BaseUrl, "https://lexin.nada.kth.se/lexin"
  plug Tesla.Middleware.JSON

  @doc """
  Requests `/service` for `word` and returns the response body.

  The lookup is sent with `searchinfo=to,swe_rus,<word>` and `output=JSON`
  (`swe_rus` is presumably the Swedish-Russian dictionary; confirm against the
  Lexin service).

  Any result other than `{:ok, response}` fails the pattern match and raises
  a `MatchError`.
  """
  def definition(word) do
    {:ok, %{body: body}} =
      get("/service", query: [searchinfo: "to,swe_rus,#{word}", output: "JSON"])

    body
  end
end

Simplest Request

# Fetch the response body for the word "vatten" as returned by the client (no parsing applied).
LexinClient.definition("vatten")

Type System

This is an attempt to understand the structure of the Lexin backend response. We might be missing some field types.

With @enforce_keys we are trying to highlight which fields are mandatory for the given record type, i.e. which fields we expect to always see in the record.

For example, the fields of Lexin.Definition.Lang can appear under both the "BaseLang" and the "TargetLang" keys. Sometimes "TargetLang" contains only a single "Translation" field, while "BaseLang" doesn’t have this field at all.

In the comments next to defstruct definitions we put the key name (and a value form) from the original JSON-response.

defmodule Lexin.Definition.Illustration do
  @moduledoc """
  An illustration attached to a dictionary entry.

  Both `:type` and `:url` are listed in `@enforce_keys`, so they must be
  supplied whenever the struct is built.
  """

  @type t :: %__MODULE__{type: String.t(), url: String.t()}

  @enforce_keys ~w(type url)a

  # Source JSON keys for the fields below:
  #   :type <- Type "picture" (not sure if other types exist)
  #   :url  <- Value "https://..."
  defstruct [:type, :url]
end

defmodule Lexin.Definition.Phonetic do
  @moduledoc """
  Pronunciation data for a dictionary entry: a transcription plus the URL of
  an audio file.

  Both fields are listed in `@enforce_keys`, so they must be supplied whenever
  the struct is built.
  """

  @type t :: %__MODULE__{transcription: String.t(), audio_url: String.t()}

  @enforce_keys ~w(transcription audio_url)a

  # Source JSON keys for the fields below:
  #   :transcription <- Content "..."
  #   :audio_url     <- File "http://..."
  defstruct [:transcription, :audio_url]
end

defmodule Lexin.Definition.Content do
  @moduledoc """
  A single content item of a dictionary entry.

  Only `:value` is listed in `@enforce_keys`; `:id` and `:inflections` default
  to `nil` when they are not supplied.
  """

  @enforce_keys [:value]
  defstruct [
    # ID "..." (internal for the record id; for corresponding translation)
    :id,
    # Content "..."
    :value,
    # Inflection ["...", ...] (not the same format as in upper level inflections)
    :inflections
  ]

  # `inflections` is a list of plain strings; the element type previously
  # referenced the non-existent `Strint.t()`.
  @type t :: %__MODULE__{
          id: non_neg_integer() | nil,
          value: String.t(),
          inflections: [String.t()]
        }
end

defmodule Lexin.Definition.Lang do
  @moduledoc """
  Language-specific part of a dictionary entry.

  No key is enforced: every field defaults to `nil` when the struct is built
  without it.
  """

  alias Lexin.Definition.{Content, Illustration, Phonetic}

  @type t :: %__MODULE__{
          meaning: String.t() | nil,
          comment: String.t() | nil,
          translation: String.t() | nil,
          alternate: String.t() | nil,
          phonetic: Phonetic.t() | nil,
          inflections: [Content.t()],
          examples: [Content.t()],
          idioms: [Content.t()],
          compounds: [Content.t()],
          illustrations: [Illustration.t()],
          antonyms: [String.t()],
          synonyms: [String.t()]
        }

  # Each field is annotated with the JSON key (and value shape) it comes from.
  defstruct [
    :meaning,        # Meaning "..."
    :comment,        # Comment "..."
    :translation,    # Translation "..."
    :alternate,      # Alternate "..."
    :phonetic,       # Phonetic %{}
    :inflections,    # Inflection [%{}, ...]
    :examples,       # Example [%{}, ...]
    :idioms,         # Idiom [%{}, ...]
    :compounds,      # Compound [%{}, ...]
    :illustrations,  # Illustration [%{}, ...]
    :antonyms,       # Antonym ["...", ...]
    :synonyms        # Synonym ["...", ...]
  ]
end

defmodule Lexin.Definition do
  @moduledoc """
  Top-level record for one dictionary entry.

  All five fields are listed in `@enforce_keys`, so each must be supplied
  whenever the struct is built.
  """

  alias __MODULE__.Lang

  @type t :: %__MODULE__{
          id: non_neg_integer(),
          pos: String.t(),
          value: String.t(),
          base: Lang.t(),
          target: Lang.t()
        }

  @enforce_keys ~w(id pos value base target)a

  # Each field is annotated with the JSON key (and value shape) it comes from.
  defstruct [
    :id,     # VariantID "..."
    :pos,    # Type "..." (part of speech)
    :value,  # Value "..."
    :base,   # BaseLang %{}
    :target  # TargetLang %{}
  ]
end

Parsers

defmodule Lexin.Parser do
  @moduledoc """
  Turns the response returned by `LexinClient.definition/1` into
  `Lexin.Definition` structs.
  """

  alias Lexin.Definition
  alias Lexin.Definition.{Content, Illustration, Lang, Phonetic}

  @doc """
  Looks up `word` via `LexinClient.definition/1` and parses every entry found
  under the response's `"Result"` key.

  Returns a list of `Lexin.Definition` structs. A response that is not a map
  with a `"Result"` key fails the pattern match and raises a `MatchError`.
  """
  def run(word) do
    %{"Result" => entries} = LexinClient.definition(word)

    Enum.map(entries, &parse_definition/1)
  end

  # Builds the top-level record for one element of the "Result" list.
  defp parse_definition(entry) do
    %Definition{
      id: parse_integer(entry["VariantID"]),
      pos: entry["Type"],
      value: entry["Value"],
      base: parse_lang(entry["BaseLang"]),
      target: parse_lang(entry["TargetLang"])
    }
  end

  # Builds the language section. Bracket access is used on purpose: it yields
  # nil for absent keys and also when `section` itself is nil.
  defp parse_lang(section) do
    %Lang{
      meaning: section["Meaning"],
      comment: section["Comment"],
      translation: section["Translation"],
      alternate: section["Alternate"],
      phonetic: parse_phonetic(section["Phonetic"]),
      inflections: parse_contents(section["Inflection"]),
      examples: parse_contents(section["Example"]),
      idioms: parse_contents(section["Idiom"]),
      compounds: parse_contents(section["Compound"]),
      illustrations: parse_illustrations(section["Illustration"]),
      antonyms: parse_strings(section["Antonym"]),
      synonyms: parse_strings(section["Synonym"])
    }
  end

  # A missing "Phonetic" section stays nil; otherwise it becomes a struct.
  defp parse_phonetic(phonetic) do
    case phonetic do
      nil ->
        nil

      _ ->
        %Phonetic{
          transcription: phonetic["Content"],
          audio_url: phonetic["File"]
        }
    end
  end

  # A missing list is normalised to []; otherwise every item is converted.
  defp parse_contents(nil), do: []
  defp parse_contents(items), do: Enum.map(items, &to_content/1)

  # Converts one raw content item into a Content struct.
  defp to_content(item) do
    %Content{
      id: parse_integer(item["ID"]),
      inflections: parse_strings(item["Inflection"]),
      value: item["Content"]
    }
  end

  # A missing list is normalised to []; otherwise every item is converted.
  defp parse_illustrations(nil), do: []
  defp parse_illustrations(items), do: Enum.map(items, &to_illustration/1)

  # Converts one raw illustration into an Illustration struct.
  defp to_illustration(item) do
    %Illustration{
      type: item["Type"],
      url: item["Value"]
    }
  end

  # nil and the empty string both mean "no id"; anything else is handed to
  # String.to_integer/1.
  defp parse_integer(blank) when blank in [nil, ""], do: nil
  defp parse_integer(digits), do: String.to_integer(digits)

  # nil and a list holding only one empty string are both treated as "no
  # values"; any other value is returned untouched.
  defp parse_strings(strings) do
    case strings do
      nil -> []
      [""] -> []
      other -> other
    end
  end
end

# Look up the word "katt" and parse the response into Lexin.Definition structs.
Lexin.Parser.run("katt")