Skip to content

getml.data.columns

Handlers for 1-d arrays storing the data of an individual variable.

Like the DataFrame, the columns do not contain any actual data themselves but are only handlers to objects within the getML Engine. These containers store data of a single variable in a one-dimensional array of a uniform type.

Columns are immutable and lazily evaluated.

  • Immutable means that there are no in-place operations on the columns. Any change to a column returns a new, changed column.

  • Lazy evaluation means that operations won't be executed until results are required. This is reflected in the column views: Column views do not exist until they are required.

Example

This is what some column operations might look like:

import numpy as np

import getml.data as data
import getml.engine as engine
import getml.data.roles as roles

# ----------------

engine.set_project("examples")

# ----------------
# Create a data frame from a JSON string

json_str = """{
    "names": ["patrick", "alex", "phil", "ulrike"],
    "column_01": [2.4, 3.0, 1.2, 1.4],
    "join_key": ["0", "1", "2", "3"],
    "time_stamp": ["2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04"]
}"""

my_df = data.DataFrame(
    "MY DF",
    roles={
        "unused_string": ["names", "join_key", "time_stamp"],
        "unused_float": ["column_01"]}
).read_json(
    json_str
)

# ----------------

col1 = my_df["column_01"]

# ----------------

# col2 is a column view.
# The operation is not executed yet.
col2 = 2.0 - col1

# This is when '2.0 - col1' is actually
# executed.
my_df["column_02"] = col2
my_df.set_role("column_02", roles.numerical)

# If you want to update column_01,
# you can't do that in-place.
# You need to replace it with a new column
col1 = col1 + col2
my_df["column_01"] = col1
my_df.set_role("column_01", roles.numerical)

BooleanColumnView

BooleanColumnView(
    operator: str,
    operand1: Optional[OperandType],
    operand2: Optional[OperandType],
)

Bases: _View

Handle for a lazily evaluated boolean column view.

Column views do not actually exist - they will be lazily evaluated when necessary.

They can be used to take a subselection of the data frame or to update other columns.

Example
import numpy as np

import getml.data as data
import getml.engine as engine
import getml.data.roles as roles

# ----------------

engine.set_project("examples")

# ----------------
# Create a data frame from a JSON string

json_str = """{
    "names": ["patrick", "alex", "phil", "ulrike"],
    "column_01": [2.4, 3.0, 1.2, 1.4],
    "join_key": ["0", "1", "2", "3"],
    "time_stamp": ["2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04"]
}"""

my_df = data.DataFrame(
    "MY DF",
    roles={
        "unused_string": ["names", "join_key", "time_stamp"],
        "unused_float": ["column_01"]}
).read_json(
    json_str
)

# ----------------

names = my_df["names"]

# This is a virtual boolean column.
a_or_p_in_names = names.contains("p") | names.contains("a")

# Creates a view containing
# only those entries, where "names" contains a or p.
my_view = my_df[a_or_p_in_names]

# ----------------

# Returns a new column, where all names
# containing "rick" are replaced by "Patrick".
# Again, columns are immutable - this returns an updated
# version, but leaves the original column unchanged.
new_names = names.update(names.contains("rick"), "Patrick")

my_df["new_names"] = new_names

# ----------------

# Boolean columns can also be used to
# create binary target variables.
target = (names == "phil")

my_df["target"] = target
# Refer to the column by the name it was just
# stored under, not by the column view object.
my_df.set_role("target", roles.target)

# By the way, instead of using the
# __setitem__ operator and .set_role(...)
# you can just use .add(...).
my_df.add(target, "target", roles.target)
Source code in getml/data/columns/columns.py
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
def __init__(
    self,
    operator: str,
    operand1: Optional[OperandType],
    operand2: Optional[OperandType],
):
    """
    Assemble the command dictionary describing this boolean view.

    Args:
        operator:
            Name of the operation the view stands for.

        operand1:
            First operand, or None if the operation has none.

        operand2:
            Second operand, or None if the operation has none.
    """
    # The view type and the operator are always part of the command.
    self.cmd: Dict[str, Any] = {
        "type_": BOOLEAN_COLUMN_VIEW,
        "operator_": operator,
    }

    # An operand key is only written when an operand was supplied;
    # each operand is passed through self._parse_operand first.
    for key, operand in (("operand1_", operand1), ("operand2_", operand2)):
        if operand is not None:
            self.cmd[key] = self._parse_operand(operand)

is_false

is_false()

Whether an entry is False - effectively inverts the Boolean column.

Source code in getml/data/columns/columns.py
388
389
390
391
392
393
394
def is_false(self):
    """
    Whether an entry is False - effectively inverts the Boolean column.

    Returns:
        A BooleanColumnView with operator "not" whose only operand
        is this column.
    """
    # Arguments are (operator, operand1, operand2); "not" is unary,
    # so there is no second operand.
    negated = BooleanColumnView("not", self, None)
    return negated

as_num

as_num()

Transforms the boolean column into a numerical column

Source code in getml/data/columns/columns.py
398
399
400
401
402
403
404
def as_num(self):
    """
    Transforms the boolean column into a numerical column.

    Returns:
        A FloatColumnView with operator "boolean_as_num" whose only
        operand is this column.
    """
    # Arguments are (operator, operand1, operand2); the conversion is
    # unary, so there is no second operand.
    numerical = FloatColumnView("boolean_as_num", self, None)
    return numerical

FloatColumn

FloatColumn(
    name: str = "",
    role: str = "numerical",
    df_name: str = "",
)

Bases: _Column

Handle for numerical data in the Engine.

This is a handler for all numerical data in the getML Engine, including time stamps.

ATTRIBUTE DESCRIPTION
name

Name of the numerical column.

role

Role that the column plays.

df_name

name instance variable of the DataFrame containing this column.

Example
import numpy as np

import getml.data as data
import getml.engine as engine
import getml.data.roles as roles

# ----------------

engine.set_project("examples")

# ----------------
# Create a data frame from a JSON string

json_str = """{
    "names": ["patrick", "alex", "phil", "ulrike"],
    "column_01": [2.4, 3.0, 1.2, 1.4],
    "join_key": ["0", "1", "2", "3"],
    "time_stamp": ["2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04"]
}"""

my_df = data.DataFrame(
    "MY DF",
    roles={
        "unused_string": ["names", "join_key", "time_stamp"],
        "unused_float": ["column_01"]}
).read_json(
    json_str
)

# ----------------

col1 = my_df["column_01"]

# ----------------

col2 = 2.0 - col1

my_df.add(col2, "name", roles.numerical)

# ----------------
# If you do not explicitly set a role,
# the assigned role will be
# roles.unused_float.

col3 = (col1 + 2.0*col2) / 3.0

my_df["column_03"] = col3
my_df.set_role("column_03", roles.numerical)
Source code in getml/data/columns/columns.py
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
def __init__(self, name: str = "", role: str = "numerical", df_name: str = ""):
    """
    Set up the command dictionary identifying a float column.

    Args:
        name:
            Name of the column. An empty string triggers an
            auto-generated name based on the instance counter.

        role:
            Role that the column plays.

        df_name:
            Name of the data frame containing this column.
    """
    super().__init__()

    # The class-wide counter is bumped on every instantiation, even when
    # a name was supplied; it only feeds the auto-generated default name.
    FloatColumn._num_columns += 1
    column_name = (
        FLOAT_COLUMN + " " + str(FloatColumn._num_columns) if name == "" else name
    )

    self.cmd: Dict[str, Any] = {
        "operator_": FLOAT_COLUMN,
        "df_name_": df_name,
        "name_": column_name,
        "role_": role,
        "type_": FLOAT_COLUMN,
    }

FloatColumnView

FloatColumnView(
    operator: str,
    operand1: Optional[FloatOperandType],
    operand2: Optional[FloatOperandType],
)

Bases: _View

Lazily evaluated view on a FloatColumn.

Column views do not actually exist - they will be lazily evaluated when necessary.

Source code in getml/data/columns/columns.py
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
def __init__(
    self,
    operator: str,
    operand1: Optional[FloatOperandType],
    operand2: Optional[FloatOperandType],
):
    """
    Assemble the command dictionary describing this float view.

    Args:
        operator:
            Name of the operation the view stands for.

        operand1:
            First operand, or None if the operation has none.

        operand2:
            Second operand, or None if the operation has none.
    """
    # The view type and the operator are always part of the command.
    self.cmd: Dict[str, Any] = {
        "type_": FLOAT_COLUMN_VIEW,
        "operator_": operator,
    }

    # An operand key is only written when an operand was supplied;
    # each operand is passed through self._parse_operand first.
    for key, operand in (("operand1_", operand1), ("operand2_", operand2)):
        if operand is not None:
            self.cmd[key] = self._parse_operand(operand)

StringColumn

StringColumn(
    name: str = "",
    role: str = "categorical",
    df_name: str = "",
)

Bases: _Column

Handle for categorical data that is kept in the getML Engine

ATTRIBUTE DESCRIPTION
name

Name of the categorical column.

role

Role that the column plays.

df_name

name instance variable of the DataFrame containing this column.

Example
import numpy as np

import getml.data as data
import getml.engine as engine
import getml.data.roles as roles

# ----------------

engine.set_project("examples")

# ----------------
# Create a data frame from a JSON string

json_str = """{
    "names": ["patrick", "alex", "phil", "ulrike"],
    "column_01": [2.4, 3.0, 1.2, 1.4],
    "join_key": ["0", "1", "2", "3"],
    "time_stamp": ["2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04"]
}"""

my_df = data.DataFrame(
    "MY DF",
    roles={
        "unused_string": ["names", "join_key", "time_stamp"],
        "unused_float": ["column_01"]}
).read_json(
    json_str
)

# ----------------

col1 = my_df["names"]

# ----------------

col2 = col1.substr(4, 3)

my_df.add(col2, "short_names", roles.categorical)

# ----------------
# If you do not explicitly set a role,
# the assigned role will be
# roles.unused_string.

col3 = "user-" + col1 + "-" + col2

my_df["new_names"] = col3
my_df.set_role("new_names", roles.categorical)
Source code in getml/data/columns/columns.py
479
480
481
482
483
484
485
486
487
488
489
490
491
492
def __init__(self, name: str = "", role: str = "categorical", df_name: str = ""):
    """
    Set up the command dictionary identifying a string column.

    Args:
        name:
            Name of the column. An empty string triggers an
            auto-generated name based on the instance counter.

        role:
            Role that the column plays.

        df_name:
            Name of the data frame containing this column.
    """
    super().__init__()

    # The class-wide counter is bumped on every instantiation, even when
    # a name was supplied; it only feeds the auto-generated default name.
    StringColumn._num_columns += 1
    column_name = (
        STRING_COLUMN + " " + str(StringColumn._num_columns) if name == "" else name
    )

    self.cmd: Dict[str, Any] = {
        "operator_": STRING_COLUMN,
        "df_name_": df_name,
        "name_": column_name,
        "role_": role,
        "type_": STRING_COLUMN,
    }

StringColumnView

StringColumnView(
    operator: str,
    operand1: Optional[Union[str, _Column, _View]],
    operand2: Optional[Union[str, _Column, _View]],
)

Bases: _View

Lazily evaluated view on a StringColumn.

Column views do not actually exist - they will be lazily evaluated when necessary.

Example
import numpy as np

import getml.data as data
import getml.engine as engine
import getml.data.roles as roles

# ----------------

engine.set_project("examples")

# ----------------
# Create a data frame from a JSON string

json_str = """{
    "names": ["patrick", "alex", "phil", "ulrike"],
    "column_01": [2.4, 3.0, 1.2, 1.4],
    "join_key": ["0", "1", "2", "3"],
    "time_stamp": ["2019-01-01", "2019-01-02", "2019-01-03", "2019-01-04"]
}"""

my_df = data.DataFrame(
    "MY DF",
    roles={
        "unused_string": ["names", "join_key", "time_stamp"],
        "unused_float": ["column_01"]}
).read_json(
    json_str
)

# ----------------

col1 = my_df["names"]

# ----------------

# col2 is a virtual column.
# The substring operation is not
# executed yet.
col2 = col1.substr(4, 3)

# This is where the Engine executes
# the substring operation.
my_df.add(col2, "short_names", roles.categorical)

# ----------------
# If you do not explicitly set a role,
# the assigned role will be
# roles.unused_string.

# col3 is a virtual column.
# The operation is not
# executed yet.
col3 = "user-" + col1 + "-" + col2

# This is where the operation is
# executed.
my_df["new_names"] = col3
my_df.set_role("new_names", roles.categorical)
Source code in getml/data/columns/columns.py
568
569
570
571
572
573
574
575
576
577
578
579
580
581
def __init__(
    self,
    operator: str,
    operand1: Optional[Union[str, _Column, _View]],
    operand2: Optional[Union[str, _Column, _View]],
):
    """
    Assemble the command dictionary describing this string view.

    Args:
        operator:
            Name of the operation the view stands for.

        operand1:
            First operand, or None if the operation has none.

        operand2:
            Second operand, or None if the operation has none.
    """
    # The view type and the operator are always part of the command.
    self.cmd: Dict[str, Any] = {
        "type_": STRING_COLUMN_VIEW,
        "operator_": operator,
    }

    # An operand key is only written when an operand was supplied;
    # each operand is passed through self._parse_operand first.
    for key, operand in (("operand1_", operand1), ("operand2_", operand2)):
        if operand is not None:
            self.cmd[key] = self._parse_operand(operand)

arange

arange(
    start: Union[Real, float] = 0.0,
    stop: Optional[Union[Real, float]] = None,
    step: Union[Real, float] = 1.0,
)

Returns evenly spaced values within a given interval.

PARAMETER DESCRIPTION
start

The beginning of the interval. Defaults to 0.

TYPE: Union[Real, float] DEFAULT: 0.0

stop

The end of the interval.

TYPE: Optional[Union[Real, float]] DEFAULT: None

step

The step taken. Defaults to 1.

TYPE: Union[Real, float] DEFAULT: 1.0

Source code in getml/data/columns/columns.py
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def arange(
    start: Union[numbers.Real, float] = 0.0,
    stop: Optional[Union[numbers.Real, float]] = None,
    step: Union[numbers.Real, float] = 1.0,
):
    """
    Returns evenly spaced values within a given interval.

    If `stop` is omitted, the value passed as `start` is used as the
    end of the interval and the interval begins at 0.

    Args:
        start:
            The beginning of the interval. Defaults to 0.

        stop:
            The end of the interval.

        step:
            The step taken. Defaults to 1. None is treated as 1.

    Returns:
        A FloatColumnView with operator "arange" whose command holds
        `start`, `stop` and `step` converted to float.

    Raises:
        TypeError: If `start`, `stop` or `step` is not a real number.
    """
    # Single-bound call: the given value is the end, not the beginning.
    if stop is None:
        start, stop = 0.0, start

    if step is None:
        step = 1.0

    # Validate in the order start, stop, step so the first offending
    # argument is the one reported.
    for label, bound in (("start", start), ("stop", stop), ("step", step)):
        if not isinstance(bound, numbers.Real):
            raise TypeError(f"'{label}' must be a real number")

    view = FloatColumnView(operator="arange", operand1=None, operand2=None)

    for key, bound in (("start_", start), ("stop_", stop), ("step_", step)):
        view.cmd[key] = float(bound)

    return view

rowid

rowid() -> FloatColumnView

Get the row numbers of the table.

RETURNS DESCRIPTION
FloatColumnView

(numerical) column containing the row id, starting with 0

Source code in getml/data/columns/columns.py
162
163
164
165
166
167
168
169
def rowid() -> FloatColumnView:
    """
    Get the row numbers of the table.

    Returns:
        (numerical) column containing the row id, starting with 0
    """
    # Arguments are (operator, operand1, operand2); "rowid" takes no operands.
    row_numbers = FloatColumnView("rowid", None, None)
    return row_numbers

Collects the data necessary for displaying the column footer.

Bases: NamedTuple

Contains the data to be shown in the footer of the data frame or column.

aggregation

Lazily evaluated aggregation over a column.

Aggregation

Aggregation(alias, col, agg_type)

Lazily evaluated aggregation over a column.

Example
my_data_frame["my_column"].avg()
3.0
Source code in getml/data/columns/aggregation.py
29
30
31
32
33
def __init__(self, alias, col, agg_type):
    """
    Assemble the command dictionary describing the aggregation.

    Args:
        alias:
            Stored under the "as_" key of the command.

        col:
            Column to aggregate over; its `cmd` attribute is embedded
            under the "col_" key.

        agg_type:
            Stored under the "type_" key of the command.
    """
    self.cmd: Dict[str, Any] = {
        "as_": alias,
        "col_": col.cmd,
        "type_": agg_type,
    }

get

get()

Receives the value of the aggregation over the column.

Source code in getml/data/columns/aggregation.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def get(self):
    """
    Receives the value of the aggregation over the column.

    Sends a "FloatColumn.aggregate" command wrapping this aggregation's
    own command, then reads the reply from the same socket.

    Returns:
        The first element of the float matrix received in reply.
    """
    request: Dict[str, Any] = {
        "name_": "",
        "type_": "FloatColumn.aggregate",
        "aggregation_": self.cmd,
    }

    with comm.send_and_get_socket(request) as sock:
        status = comm.recv_string(sock)
        # Any reply other than the success marker is handed to the
        # engine exception handler (presumably raises - confirm in comm).
        if status != "Success!":
            comm.handle_engine_exception(status)
        result = comm.recv_float_matrix(sock)

    return result.ravel()[0]

format

Format the column

last_change

Returns the last time a data frame has been changed.

last_change_from_col

The last time any of the underlying data frames has been changed.

length

Returns the length of the column

length_property

The length of the column (number of rows in the data frame).

make_iter

Factory function for a function that can be used to iterate through a column.

parse

Parses the columns from a cmd

repr

ASCII representation of the column.

repr_html

HTML representation of the column.

subroles

The subroles of this column.

to_arrow

Transform column to a pyarrow.ChunkedArray

to_numpy

Transform column to a numpy array.

unique

Transform column to numpy array containing unique values

unit

The unit of this column.

from_value

from_value(
    val: Union[bool, str, int, float, datetime64]
) -> ReturnType

Creates an infinite column that contains the same value in all of its elements.

PARAMETER DESCRIPTION
val

The value you want to insert into your column.

TYPE: Union[bool, str, int, float, datetime64]

RETURNS DESCRIPTION
ReturnType

The column view containing the value.

Source code in getml/data/columns/from_value.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
def from_value(val: Union[bool, str, int, float, np.datetime64]) -> ReturnType:
    """
    Creates an infinite column that contains the same
    value in all of its elements.

    Args:
        val:
            The value you want to insert into your column.

    Returns:
        The column view containing the value.

    Raises:
        TypeError: If `val` is none of the supported types.
    """
    cmd = _value_to_cmd(val)

    # Pick the view class and its first operand. bool must be tested
    # before the numeric types because bool is a subclass of int.
    if isinstance(val, bool):
        view_type, operand = BooleanColumnView, None
    elif isinstance(val, str):
        view_type, operand = StringColumnView, val
    elif isinstance(val, (int, float, numbers.Number)):
        view_type, operand = FloatColumnView, val
    elif isinstance(val, np.datetime64):
        # Cast to second resolution, then to a plain float.
        view_type, operand = FloatColumnView, np.datetime64(val, "s").astype(float)
    else:
        raise TypeError("val must be bool, str or a number.")

    view: ReturnType = view_type(operator="const", operand1=operand, operand2=None)

    # The command built by the constructor is replaced wholesale by the
    # one derived from the value.
    view.cmd = cmd
    return view

random

random(seed: int = 5849) -> FloatColumnView

Create random column.

The numbers will be uniformly distributed from 0.0 to 1.0. This can be used to randomly split a population table into a training and a test set

PARAMETER DESCRIPTION
seed

Seed used for the random number generator.

TYPE: int DEFAULT: 5849

RETURNS DESCRIPTION
FloatColumnView

FloatColumnView containing random numbers

Example
population = getml.DataFrame('population')
population.add(numpy.zeros(100), 'column_01')

idx = random(seed=42)
population_train = population[idx > 0.7]
population_test = population[idx <= 0.7]
Source code in getml/data/columns/random.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def random(seed: int = 5849) -> FloatColumnView:
    """
    Create random column.

    The numbers will be uniformly distributed from 0.0 to 1.0. This can be
    used to randomly split a population table into a training and a test
    set.

    Args:
        seed:
            Seed used for the random number generator.

    Returns:
        FloatColumnView with operator "random" whose command carries
        the seed.

    Raises:
        TypeError: If `seed` is not a real number.

    ??? example
        ```python
        population = getml.DataFrame('population')
        population.add(numpy.zeros(100), 'column_01')

        idx = random(seed=42)
        population_train = population[idx > 0.7]
        population_test = population[idx <= 0.7]
        ```
    """
    if isinstance(seed, numbers.Real):
        view = FloatColumnView(operator="random", operand1=None, operand2=None)
        # The seed is stored as given, without conversion.
        view.cmd["seed_"] = seed
        return view

    raise TypeError("'seed' must be a real number")