getml.data.columns

Handlers for 1-d arrays storing the data of an individual variable.

Like the DataFrame, the columns do not contain any actual data themselves but are only handlers to objects within the getML Engine. These containers store data of a single variable in a one-dimensional array of a uniform type.

Columns are immutable and lazily evaluated.

Immutable means that there are no in-place operations on the columns. Any change to a column returns a new, changed column.
Lazy evaluation means that operations won't be executed until results are required. This is reflected in the column views: Column views do not exist until they are required.

Example

This is what some column operations might look like:

import numpy as np

import getml.data as data
import getml.engine as engine
import getml.data.roles as roles

# ----------------

engine.set_project("examples")

# ----------------
# Create a data frame from a JSON string

json_str = """{
    "names": ["patrick", "alex", "phil", "ulrike"],
    "column_01": [2.4, 3.0, 1.2, 1.4],
    "join_key": ["0", "1", "2", "3"],
    "time_stamp": ["2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04"]
}"""

my_df = data.DataFrame(
    "MY DF",
    roles={
        "unused_string": ["names", "join_key", "time_stamp"],
        "unused_float": ["column_01"]}
).read_json(
    json_str
)

# ----------------

col1 = my_df["column_01"]

# ----------------

# col2 is a column view.
# The operation is not executed yet.
col2 = 2.0 - col1

# This is when '2.0 - col1' is actually
# executed.
my_df["column_02"] = col2
my_df.set_role("column_02", roles.numerical)

# If you want to update column_01,
# you can't do that in-place.
# You need to replace it with a new column
col1 = col1 + col2
my_df["column_01"] = col1
my_df.set_role("column_01", roles.numerical)

BooleanColumnView

BooleanColumnView(
    operator: str,
    operand1: Optional[OperandType],
    operand2: Optional[OperandType],
)

Bases: _View

Handle for a lazily evaluated boolean column view.

Column views do not actually exist - they will be lazily evaluated when necessary.

They can be used to take a subselection of the data frame or to update other columns.

Example

import numpy as np

import getml.data as data
import getml.engine as engine
import getml.data.roles as roles

# ----------------

engine.set_project("examples")

# ----------------
# Create a data frame from a JSON string

json_str = """{
    "names": ["patrick", "alex", "phil", "ulrike"],
    "column_01": [2.4, 3.0, 1.2, 1.4],
    "join_key": ["0", "1", "2", "3"],
    "time_stamp": ["2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04"]
}"""

my_df = data.DataFrame(
    "MY DF",
    roles={
        "unused_string": ["names", "join_key", "time_stamp"],
        "unused_float": ["column_01"]}
).read_json(
    json_str
)

# ----------------

names = my_df["names"]

# This is a virtual boolean column.
a_or_p_in_names = names.contains("p") | names.contains("a")

# Creates a view containing
# only those entries, where "names" contains a or p.
my_view = my_df[a_or_p_in_names]

# ----------------

# Returns a new column, where all names
# containing "rick" are replaced by "Patrick".
# Again, columns are immutable - this returns an updated
# version, but leaves the original column unchanged.
new_names = names.update(names.contains("rick"), "Patrick")

my_df["new_names"] = new_names

# ----------------

# Boolean columns can also be used to
# create binary target variables.
target = (names == "phil")

my_df["target"] = target
# 'target' is only a lazy view; the column stored in the data frame
# is addressed by its name, as in the other set_role calls on this page.
my_df.set_role("target", roles.target)

# By the way, instead of using the
# __setitem__ operator and .set_role(...)
# you can just use .add(...).
my_df.add(target, "target", roles.target)

Source code in getml/data/columns/columns.py

def __init__(
    self,
    operator: str,
    operand1: Optional[OperandType],
    operand2: Optional[OperandType],
):
    """
    Assemble the command dictionary that describes this boolean view.

    Args:
        operator:
            Name of the operation; stored under ``"operator_"``.

        operand1:
            First operand. When not None it is run through
            ``self._parse_operand`` and stored under ``"operand1_"``;
            otherwise the key is left out.

        operand2:
            Second operand, handled like ``operand1`` but stored
            under ``"operand2_"``.
    """
    self.cmd: Dict[str, Any] = {
        "type_": BOOLEAN_COLUMN_VIEW,
        "operator_": operator,
    }

    # Operands are optional: a key is only written for operands
    # that were actually supplied.
    for key, operand in (("operand1_", operand1), ("operand2_", operand2)):
        if operand is not None:
            self.cmd[key] = self._parse_operand(operand)

is_false

is_false()

Whether an entry is False - effectively inverts the Boolean column.

Source code in getml/data/columns/columns.py

def is_false(self):
    """
    Whether an entry is False - effectively inverts the Boolean column.

    Returns a new BooleanColumnView with operator ``"not"`` and this
    column as its only operand; this column itself is not modified.
    """
    negated = BooleanColumnView("not", self, None)
    return negated

as_num

as_num()

Transforms the boolean column into a numerical column

Source code in getml/data/columns/columns.py

def as_num(self):
    """
    Transforms the boolean column into a numerical column.

    Returns a FloatColumnView with operator ``"boolean_as_num"`` and
    this column as its only operand.
    """
    numeric_view = FloatColumnView(
        operator="boolean_as_num", operand1=self, operand2=None
    )
    return numeric_view

FloatColumn

FloatColumn(
    name: str = "",
    role: str = "numerical",
    df_name: str = "",
)

Bases: _Column

Handle for numerical data in the Engine.

This is a handler for all numerical data in the getML Engine, including time stamps.

ATTRIBUTE	DESCRIPTION
`name`	Name of the numerical column.
`role`	Role that the column plays.
`df_name`	`name` instance variable of the `DataFrame` containing this column.

Example

import numpy as np

import getml.data as data
import getml.engine as engine
import getml.data.roles as roles

# ----------------

engine.set_project("examples")

# ----------------
# Create a data frame from a JSON string

json_str = """{
    "names": ["patrick", "alex", "phil", "ulrike"],
    "column_01": [2.4, 3.0, 1.2, 1.4],
    "join_key": ["0", "1", "2", "3"],
    "time_stamp": ["2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04"]
}"""

my_df = data.DataFrame(
    "MY DF",
    roles={
        "unused_string": ["names", "join_key", "time_stamp"],
        "unused_float": ["column_01"]}
).read_json(
    json_str
)

# ----------------

col1 = my_df["column_01"]

# ----------------

col2 = 2.0 - col1

my_df.add(col2, "name", roles.numerical)

# ----------------
# If you do not explicitly set a role,
# the assigned role will be
# roles.unused_float.

col3 = (col1 + 2.0*col2) / 3.0

my_df["column_03"] = col3
my_df.set_role("column_03", roles.numerical)

Source code in getml/data/columns/columns.py

def __init__(self, name: str = "", role: str = "numerical", df_name: str = ""):
    """
    Create the handle and the command dictionary identifying the column.

    Args:
        name:
            Name of the column. An empty string requests an
            auto-generated name built from ``FLOAT_COLUMN`` and the
            running class-level column counter.

        role:
            Role that the column plays; stored under ``"role_"``.

        df_name:
            Name of the data frame containing this column; stored
            under ``"df_name_"``.
    """
    super().__init__()

    # The class-wide counter advances for every instance, whether or
    # not the caller supplied a name.
    FloatColumn._num_columns += 1
    if name == "":
        name = f"{FLOAT_COLUMN} {FloatColumn._num_columns}"

    self.cmd: Dict[str, Any] = {
        "operator_": FLOAT_COLUMN,
        "df_name_": df_name,
        "name_": name,
        "role_": role,
        "type_": FLOAT_COLUMN,
    }

FloatColumnView

FloatColumnView(
    operator: str,
    operand1: Optional[FloatOperandType],
    operand2: Optional[FloatOperandType],
)

Bases: _View

Lazily evaluated view on a FloatColumn.

Column views do not actually exist - they will be lazily evaluated when necessary.

Source code in getml/data/columns/columns.py

def __init__(
    self,
    operator: str,
    operand1: Optional[FloatOperandType],
    operand2: Optional[FloatOperandType],
):
    """
    Assemble the command dictionary that describes this float view.

    Args:
        operator:
            Name of the operation; stored under ``"operator_"``.

        operand1:
            First operand. When not None it is run through
            ``self._parse_operand`` and stored under ``"operand1_"``;
            otherwise the key is left out.

        operand2:
            Second operand, handled like ``operand1`` but stored
            under ``"operand2_"``.
    """
    self.cmd: Dict[str, Any] = {
        "type_": FLOAT_COLUMN_VIEW,
        "operator_": operator,
    }

    # Only operands that were actually supplied get an entry.
    for key, operand in (("operand1_", operand1), ("operand2_", operand2)):
        if operand is not None:
            self.cmd[key] = self._parse_operand(operand)

StringColumn

StringColumn(
    name: str = "",
    role: str = "categorical",
    df_name: str = "",
)

Bases: _Column

Handle for categorical data that is kept in the getML Engine

ATTRIBUTE	DESCRIPTION
`name`	Name of the categorical column.
`role`	Role that the column plays.
`df_name`	`name` instance variable of the `DataFrame` containing this column.

Example

import numpy as np

import getml.data as data
import getml.engine as engine
import getml.data.roles as roles

# ----------------

engine.set_project("examples")

# ----------------
# Create a data frame from a JSON string

json_str = """{
    "names": ["patrick", "alex", "phil", "ulrike"],
    "column_01": [2.4, 3.0, 1.2, 1.4],
    "join_key": ["0", "1", "2", "3"],
    "time_stamp": ["2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04"]
}"""

my_df = data.DataFrame(
    "MY DF",
    roles={
        "unused_string": ["names", "join_key", "time_stamp"],
        "unused_float": ["column_01"]}
).read_json(
    json_str
)

# ----------------

col1 = my_df["names"]

# ----------------

col2 = col1.substr(4, 3)

my_df.add(col2, "short_names", roles.categorical)

# ----------------
# If you do not explicitly set a role,
# the assigned role will be
# roles.unused_string.

col3 = "user-" + col1 + "-" + col2

my_df["new_names"] = col3
my_df.set_role("new_names", roles.categorical)

Source code in getml/data/columns/columns.py

def __init__(self, name: str = "", role: str = "categorical", df_name: str = ""):
    """
    Create the handle and the command dictionary identifying the column.

    Args:
        name:
            Name of the column. An empty string requests an
            auto-generated name built from ``STRING_COLUMN`` and the
            running class-level column counter.

        role:
            Role that the column plays; stored under ``"role_"``.

        df_name:
            Name of the data frame containing this column; stored
            under ``"df_name_"``.
    """
    super().__init__()

    # The class-wide counter advances for every instance, whether or
    # not the caller supplied a name.
    StringColumn._num_columns += 1
    if name == "":
        name = f"{STRING_COLUMN} {StringColumn._num_columns}"

    self.cmd: Dict[str, Any] = {
        "operator_": STRING_COLUMN,
        "df_name_": df_name,
        "name_": name,
        "role_": role,
        "type_": STRING_COLUMN,
    }

StringColumnView

StringColumnView(
    operator: str,
    operand1: Optional[Union[str, _Column, _View]],
    operand2: Optional[Union[str, _Column, _View]],
)

Bases: _View

Lazily evaluated view on a StringColumn.

Column views do not actually exist - they will be lazily evaluated when necessary.

Example

import numpy as np

import getml.data as data
import getml.engine as engine
import getml.data.roles as roles

# ----------------

engine.set_project("examples")

# ----------------
# Create a data frame from a JSON string

json_str = """{
    "names": ["patrick", "alex", "phil", "ulrike"],
    "column_01": [2.4, 3.0, 1.2, 1.4],
    "join_key": ["0", "1", "2", "3"],
    "time_stamp": ["2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04"]
}"""

my_df = data.DataFrame(
    "MY DF",
    roles={
        "unused_string": ["names", "join_key", "time_stamp"],
        "unused_float": ["column_01"]}
).read_json(
    json_str
)

# ----------------

col1 = my_df["names"]

# ----------------

# col2 is a virtual column.
# The substring operation is not
# executed yet.
col2 = col1.substr(4, 3)

# This is where the Engine executes
# the substring operation.
my_df.add(col2, "short_names", roles.categorical)

# ----------------
# If you do not explicitly set a role,
# the assigned role will be
# roles.unused_string.

# col3 is a virtual column.
# The operation is not
# executed yet.
col3 = "user-" + col1 + "-" + col2

# This is where the operation
# is executed.
my_df["new_names"] = col3
my_df.set_role("new_names", roles.categorical)

Source code in getml/data/columns/columns.py

def __init__(
    self,
    operator: str,
    operand1: Optional[Union[str, _Column, _View]],
    operand2: Optional[Union[str, _Column, _View]],
):
    """
    Assemble the command dictionary that describes this string view.

    Args:
        operator:
            Name of the operation; stored under ``"operator_"``.

        operand1:
            First operand. When not None it is run through
            ``self._parse_operand`` and stored under ``"operand1_"``;
            otherwise the key is left out.

        operand2:
            Second operand, handled like ``operand1`` but stored
            under ``"operand2_"``.
    """
    self.cmd: Dict[str, Any] = {
        "type_": STRING_COLUMN_VIEW,
        "operator_": operator,
    }

    # Only operands that were actually supplied get an entry.
    for key, operand in (("operand1_", operand1), ("operand2_", operand2)):
        if operand is not None:
            self.cmd[key] = self._parse_operand(operand)

arange

arange(
    start: Union[Real, float] = 0.0,
    stop: Optional[Union[Real, float]] = None,
    step: Union[Real, float] = 1.0,
)

Returns evenly spaced values within a given interval.

PARAMETER	DESCRIPTION
`start`	The beginning of the interval. Defaults to 0. TYPE: `Union[Real, float]` DEFAULT: `0.0`
`stop`	The end of the interval. TYPE: `Optional[Union[Real, float]]` DEFAULT: `None`
`step`	The step taken. Defaults to 1. TYPE: `Union[Real, float]` DEFAULT: `1.0`

Source code in getml/data/columns/columns.py

def arange(
    start: Union[numbers.Real, float] = 0.0,
    stop: Optional[Union[numbers.Real, float]] = None,
    step: Union[numbers.Real, float] = 1.0,
):
    """
    Returns evenly spaced values within a given interval.

    When `stop` is omitted, the value passed as `start` is used as the
    end of the interval and the interval begins at 0.

    Args:
        start:
            The beginning of the interval. Defaults to 0.

        stop:
            The end of the interval.

        step:
            The step taken. Defaults to 1.

    Returns:
        A FloatColumnView with operator "arange" whose command carries
        `start_`, `stop_` and `step_` as floats.

    Raises:
        TypeError: If `start`, `stop` or `step` is not a real number.
    """
    # Single-bound call: the given value is the end, not the beginning.
    if stop is None:
        start, stop = 0.0, start

    if step is None:
        step = 1.0

    # Validated in this order so the first offending argument is the
    # one that gets reported.
    checks = (
        (start, "'start' must be a real number"),
        (stop, "'stop' must be a real number"),
        (step, "'step' must be a real number"),
    )
    for value, message in checks:
        if not isinstance(value, numbers.Real):
            raise TypeError(message)

    col = FloatColumnView(operator="arange", operand1=None, operand2=None)

    for key, value in (("start_", start), ("stop_", stop), ("step_", step)):
        col.cmd[key] = float(value)

    return col

rowid

rowid() -> FloatColumnView

Get the row numbers of the table.

RETURNS	DESCRIPTION
`FloatColumnView`	(numerical) column containing the row id, starting with 0

Source code in getml/data/columns/columns.py

def rowid() -> FloatColumnView:
    """
    Get the row numbers of the table.

    Returns:
            (numerical) column containing the row id, starting with 0
    """
    # No operands: the view is described by its operator alone.
    row_numbers = FloatColumnView("rowid", None, None)
    return row_numbers

collect_footer_data

Collects the data necessary for displaying the column footer.

Footer

Bases: NamedTuple

Contains the data to be shown in the footer of the data frame or column.

aggregation

Lazily evaluated aggregation over a column.

Aggregation

Aggregation(alias, col, agg_type)

Lazily evaluated aggregation over a column.

Example

my_data_frame["my_column"].avg()
3.0

Source code in getml/data/columns/aggregation.py

def __init__(self, alias, col, agg_type):
    """
    Record the aggregation as a command dictionary; nothing is computed here.

    Args:
        alias:
            Stored under ``"as_"``.

        col:
            Object whose ``cmd`` attribute is stored under ``"col_"``.

        agg_type:
            Stored under ``"type_"``.
    """
    self.cmd: Dict[str, Any] = {
        "as_": alias,
        "col_": col.cmd,
        "type_": agg_type,
    }

get

get()

Receives the value of the aggregation over the column.

Source code in getml/data/columns/aggregation.py

def get(self):
    """
    Retrieves the value of the aggregation over the column.

    Sends a "FloatColumn.aggregate" command wrapping this aggregation's
    own command and returns the first element of the float matrix that
    comes back.
    """
    request: Dict[str, Any] = {
        "name_": "",
        "type_": "FloatColumn.aggregate",
        "aggregation_": self.cmd,
    }

    with comm.send_and_get_socket(request) as sock:
        status = comm.recv_string(sock)
        # Any reply other than the success marker is handed to the
        # engine-exception handler.
        if status != "Success!":
            comm.handle_engine_exception(status)
        result = comm.recv_float_matrix(sock)

    return result.ravel()[0]

format

Format the column

last_change

Returns the last time a data frame has been changed.

last_change_from_col

The last time any of the underlying data frames has been changed.

length

Returns the length of the column

length_property

The length of the column (number of rows in the data frame).

make_iter

Factory function for a function that can be used to iterate through a column.

parse

Parses the columns from a cmd

repr

ASCII representation of the column.

repr_html

HTML representation of the column.

subroles

The subroles of this column.

to_arrow

Transform column to a pyarrow.ChunkedArray

to_numpy

Transform column to a numpy array.

unique

Transform column to numpy array containing unique values

unit

The unit of this column.

from_value

from_value(
    val: Union[bool, str, int, float, datetime64]
) -> ReturnType

Creates an infinite column that contains the same value in all of its elements.

PARAMETER	DESCRIPTION
`val`	The value you want to insert into your column. TYPE: `Union[bool, str, int, float, datetime64]`

RETURNS	DESCRIPTION
`ReturnType`	The column view containing the value.

Source code in getml/data/columns/from_value.py

def from_value(val: Union[bool, str, int, float, np.datetime64]) -> ReturnType:
    """
    Creates an infinite column that contains the same
    value in all of its elements.

    Args:
        val:
            The value you want to insert into your column.

    Returns:
        The column view containing the value.

    Raises:
        TypeError: If `val` is not a bool, str, number or
            numpy.datetime64.
    """
    cmd = _value_to_cmd(val)

    # bool has to be tested before the numeric branch because bool is
    # a subclass of int.
    if isinstance(val, bool):
        view_type, operand = BooleanColumnView, None
    elif isinstance(val, str):
        view_type, operand = StringColumnView, val
    elif isinstance(val, (int, float, numbers.Number)):
        view_type, operand = FloatColumnView, val
    elif isinstance(val, np.datetime64):
        # Time stamps are passed on as a float derived from
        # second resolution.
        view_type = FloatColumnView
        operand = np.datetime64(val, "s").astype(float)
    else:
        raise TypeError("val must be bool, str or a number.")

    col: ReturnType = view_type(operator="const", operand1=operand, operand2=None)

    # The command built from the raw value replaces the one the
    # constructor assembled.
    col.cmd = cmd
    return col

random

random(seed: int = 5849) -> FloatColumnView

Create random column.

The numbers will be uniformly distributed from 0.0 to 1.0. This can be used to randomly split a population table into a training and a test set.

PARAMETER	DESCRIPTION
`seed`	Seed used for the random number generator. TYPE: `int` DEFAULT: `5849`

RETURNS	DESCRIPTION
`FloatColumnView`	FloatColumn containing random numbers

Example

population = getml.DataFrame('population')
population.add(numpy.zeros(100), 'column_01')

idx = random(seed=42)
population_train = population[idx > 0.7]
population_test = population[idx <= 0.7]

Source code in getml/data/columns/random.py

def random(seed: int = 5849) -> FloatColumnView:
    """
    Create random column.

    The numbers will be uniformly distributed from 0.0 to 1.0. This can be
    used to randomly split a population table into a training and a test
    set.

    Args:
        seed:
            Seed used for the random number generator. Any real number
            passes the type check; the value is forwarded unchanged.

    Returns:
            FloatColumn containing random numbers

    Raises:
        TypeError: If `seed` is not a real number.

    ??? example
        ```python
        population = getml.DataFrame('population')
        population.add(numpy.zeros(100), 'column_01')

        idx = random(seed=42)
        population_train = population[idx > 0.7]
        population_test = population[idx <= 0.7]
        ```
    """
    if isinstance(seed, numbers.Real):
        view = FloatColumnView(operator="random", operand1=None, operand2=None)
        view.cmd["seed_"] = seed
        return view

    raise TypeError("'seed' must be a real number")

getml.data.columns

BooleanColumnView

is_false

as_num

FloatColumn

FloatColumnView

StringColumn

StringColumnView

arange

rowid

collect_footer_data

Footer

aggregation

Aggregation

get

format

last_change

last_change_from_col

length

length_property

make_iter

parse

repr

repr_html

subroles

to_arrow

to_numpy

unique

unit

from_value

random