Basic DataFrame Manipulation
Mix.install(
[
{:kino_explorer, "~> 0.1.20"},
{:explorer, "~> 0.10.0"},
{:httpoison, "~> 1.8"},
{:jason, "~> 1.4"},
{:vega_lite, "~> 0.1.7"},
{:exla, ">= 0.0.0"},
{:kino_vega_lite, "~> 0.1.11"}
],
config: [nx: [default_backend: EXLA.Backend]]
)
Section
Basic DataFrame Manipulation
Let’s dive into a simple data manipulation task. In Python, you might double the values in a column with `df["feature"] = df["feature"] * 2`. Here’s how we achieve the same in Elixir:
alias Explorer.{DataFrame, Series}

# Make the macros defined by Explorer.DataFrame available in this cell
require DataFrame

# Single-column sample data and the DataFrame built from it
data = %{feature: [1, 2, 3, 4, 5]}
df = DataFrame.new(data)

# Pull the "feature" column out as a Series and double every element.
# The pipeline below is the idiomatic spelling of the step-by-step form:
#   feature_series = DataFrame.pull(df, "feature")
#   doubled_feature = Series.multiply(feature_series, 2)
doubled_feature =
  df
  |> DataFrame.pull("feature")
  |> Series.multiply(2)

# DataFrames are immutable: `put/3` returns a new frame with the column replaced
df = DataFrame.put(df, "feature", doubled_feature)
df
Advanced Data Processing
Handling missing values is a common data science task. In Python (pandas) this is a one-liner, e.g. `s.interpolate(method='linear')`.
Let’s perform linear interpolation, which is a bit more involved in Elixir:
defmodule DataFrameProcessor do
  @moduledoc """
  Fills gaps (`nil` values) in numeric columns by linear interpolation and
  joins the resulting frames on their `"date"` column.
  """

  require Logger
  alias Explorer.DataFrame
  alias Explorer.Series

  @doc """
  Returns a new series in which the `nil` entries of `series` are filled by
  linear interpolation.

  The series is converted to a list, filled with `interpolate_list/1`, and
  converted back with `Series.from_list/1`. The original and the filled values
  are logged at `:info` level.
  """
  @spec interpolate(Series.t()) :: Series.t()
  def interpolate(series) do
    values = Series.to_list(series)
    Logger.info("Original series: #{inspect(values)}")

    interpolated = interpolate_list(values)
    Logger.info("Interpolated series: #{inspect(interpolated)}")

    Series.from_list(interpolated)
  end

  @doc """
  Linearly interpolates the `nil` entries of a plain list of numbers.

  The rules mirror the defaults of pandas' `interpolate(method='linear')`:

    * a run of `nil`s between two known values is filled with evenly spaced
      values on the straight line between them;
    * `nil`s before the first known value are left as `nil`, because there is
      no left anchor to interpolate from;
    * `nil`s after the last known value repeat that last value.

  Known values are returned unchanged; interpolated values are floats. The
  list is traversed once.

  ## Examples

      iex> DataFrameProcessor.interpolate_list([1.0, nil, nil, 4.0])
      [1.0, 2.0, 3.0, 4.0]

      iex> DataFrameProcessor.interpolate_list([nil, 8.0, nil, 16.0])
      [nil, 8.0, 12.0, 16.0]

  """
  @spec interpolate_list([number() | nil]) :: [number() | nil]
  def interpolate_list(values) when is_list(values) do
    # The accumulator is {filled values in reverse, last known value, number of
    # nils seen since that value}. Gaps are resolved only once their right-hand
    # anchor is reached, which keeps the whole pass linear in the list length.
    {filled, last_known, trailing} =
      Enum.reduce(values, {[], nil, 0}, fn
        nil, {filled, last_known, pending} ->
          {filled, last_known, pending + 1}

        value, {filled, last_known, pending} ->
          gap = gap_values(last_known, value, pending)
          {[value | Enum.reverse(gap, filled)], value, 0}
      end)

    # A trailing gap has no right-hand anchor: repeat the last known value
    # (or keep nil when the list contained no known value at all).
    Enum.reverse(List.duplicate(last_known, trailing) ++ filled)
  end

  @doc """
  Builds one frame per feature list (each paired with `date_list`),
  interpolates the missing values of `"feature1"` and `"feature2"`, and
  returns the left join of the two frames on `"date"`.

  Every intermediate frame is logged at `:info` level.
  """
  @spec process_and_join(list(), [number() | nil], [number() | nil]) :: DataFrame.t()
  def process_and_join(date_list, feature1, feature2) do
    df1 = DataFrame.new(%{"feature1" => feature1, "date" => date_list})
    df2 = DataFrame.new(%{"feature2" => feature2, "date" => date_list})

    Logger.info("Initial DataFrame df1: #{inspect(df1)}")
    Logger.info("Initial DataFrame df2: #{inspect(df2)}")

    df1 = interpolate_column(df1, "feature1")
    df2 = interpolate_column(df2, "feature2")

    Logger.info("DataFrame df1 after interpolation: #{inspect(df1)}")
    Logger.info("DataFrame df2 after interpolation: #{inspect(df2)}")

    joined_df = DataFrame.join(df1, df2, on: "date", how: :left)
    Logger.info("Joined DataFrame: #{inspect(joined_df)}")

    joined_df
  end

  # Replaces `column` in `df` with its interpolated version.
  defp interpolate_column(df, column) do
    DataFrame.put(df, column, interpolate(DataFrame.pull(df, column)))
  end

  # Values (in forward order) that fill `count` missing slots between the
  # anchors `left` and `right`.
  defp gap_values(_left, _right, 0), do: []

  # No left anchor yet (leading gap): nothing to interpolate from.
  defp gap_values(nil, _right, count), do: List.duplicate(nil, count)

  defp gap_values(left, right, count) do
    step = (right - left) / (count + 1)
    for offset <- 1..count//1, do: left + step * offset
  end
end
# Sample data for testing: five consecutive days, each feature with gaps
date_list = Enum.to_list(Date.range(~D[2023-01-01], ~D[2023-01-05]))

feature1 = [1.0, nil, 3.0, nil, 5.0]
feature2 = [nil, 8.0, nil, 16.0, 20.0]

# Interpolate both features and join them on "date"; the joined frame is the
# value of this cell
DataFrameProcessor.process_and_join(date_list, feature1, feature2)
Calculating Moving Averages
Moving averages offer insights into data trends. In Python/pandas this is, again, a one-liner: `df['feature'].rolling(9).mean()`.
Thankfully, Explorer's `Series` module has built-in support for this; see the `Explorer.Series.window_mean/3` documentation. Here’s how to calculate them in Elixir:
defmodule TemperatureAnalysis do
  @moduledoc """
  Computes a rolling (moving) average over a list of temperature readings.
  """

  alias Explorer.DataFrame
  alias Explorer.Series

  @doc """
  Builds a frame with `"date"` and `"temperature"` columns and adds a
  `"moving_average"` column computed with `Explorer.Series.window_mean/3`.

  ## Parameters

    * `date_list` - values for the `"date"` column
    * `temperature_list` - values for the `"temperature"` column
    * `window_size` - number of readings in each window
    * `weights` - optional list of per-position weights, forwarded as the
      `:weights` option. Both `nil` and the default `[]` mean "no weights"
      (plain, equally weighted mean).

  `min_periods: 1` is used, so the first rows are averaged over however many
  readings are available instead of being `nil`.

  The resulting frame is printed with `IO.inspect/2` and returned.
  """
  def moving_average(date_list, temperature_list, window_size, weights \\ []) do
    df = DataFrame.new(%{"date" => date_list, "temperature" => temperature_list})

    moving_avg_series =
      df
      |> DataFrame.pull("temperature")
      |> Series.window_mean(window_size,
        weights: normalize_weights(weights),
        min_periods: 1
      )

    df_with_moving_avg = DataFrame.put(df, "moving_average", moving_avg_series)

    # IO.inspect/2 returns its argument, so the frame is both printed and returned
    IO.inspect(df_with_moving_avg, label: "DataFrame with Moving Average")
  end

  # Explorer expects `:weights` to be either nil or a list as long as the
  # window; an empty list is neither, so treat it as "not provided".
  defp normalize_weights([]), do: nil
  defp normalize_weights(weights), do: weights
end
defmodule DateGenerator do
  @moduledoc """
  Helpers for generating runs of consecutive calendar dates.
  """

  @doc """
  Returns `days` consecutive dates beginning at `start_date`.

  Returns an empty list when `days` is zero or negative.

  ## Examples

      iex> DateGenerator.generate_dates(~D[2023-01-01], 3)
      [~D[2023-01-01], ~D[2023-01-02], ~D[2023-01-03]]

      iex> DateGenerator.generate_dates(~D[2023-01-01], 0)
      []

  """
  @spec generate_dates(Date.t(), integer()) :: [Date.t()]
  def generate_dates(start_date, days) do
    # The explicit `//1` step matters: without it `0..-1` (days == 0) is a
    # descending range and would yield two dates instead of none.
    Enum.map(0..(days - 1)//1, fn offset -> Date.add(start_date, offset) end)
  end
end
# Sample input: 21 consecutive days starting on 2023-01-01
date_list = DateGenerator.generate_dates(~D[2023-01-01], 21)

# Synthetic temperatures: a sine wave around 20 with integer noise drawn from -2..2
temperature_list =
  for day <- 1..21 do
    10.0 * :math.sin(day / 2) + Enum.random(-2..2) + 20
  end

window_size = 3

# Run the moving-average calculation; `nil` is passed explicitly as the weights argument
temperature =
  TemperatureAnalysis.moving_average(date_list, temperature_list, window_size, nil)
Visualization is apparently weak compared to Python, but it provides the basics. The pandas counterpart would be `df.plot()`.
alias VegaLite, as: Vl

# Line layer: the smoothed series over time
trend_layer =
  Vl.new()
  |> Vl.mark(:line)
  |> Vl.encode_field(:x, "date", type: :temporal)
  |> Vl.encode_field(:y, "moving_average", type: :quantitative)

# Point layer: the raw readings over time
readings_layer =
  Vl.new()
  |> Vl.mark(:point)
  |> Vl.encode_field(:x, "date", type: :temporal)
  |> Vl.encode_field(:y, "temperature", type: :quantitative)

# Both layers share the same data and are drawn on top of each other
Vl.new(width: 1080, title: "case study")
|> Vl.data_from_values(temperature, only: ["date", "moving_average", "temperature"])
|> Vl.layers([trend_layer, readings_layer])