getml.data.split

Splits data into training, testing, validation or other sets.

concat

concat(
    name: str, **kwargs: DataFrame
) -> Tuple[DataFrame, StringColumnView]

Concatenates several data frames into one and produces a split column that keeps track of their origin.

PARAMETER	DESCRIPTION
`name`	The name of the data frame you would like to create. TYPE: `str`
`kwargs`	The data frames you would like to concatenate, each passed under the name with which it should appear in the split column. TYPE: `DataFrame` DEFAULT: `{}`

RETURNS	DESCRIPTION
`Tuple[DataFrame, StringColumnView]`	A tuple containing the concatenated data frame and the split column.

Example

A common use case for this functionality is TimeSeries:

# Wrap each pandas data frame in a getML DataFrame.
data_train = getml.DataFrame.from_pandas(
    datatraining_pandas, name='data_train')

data_validate = getml.DataFrame.from_pandas(
    datatest_pandas, name='data_validate')

data_test = getml.DataFrame.from_pandas(
    datatest2_pandas, name='data_test')

# Stack the three frames into "population"; `split` records under which
# keyword (train / validate / test) each row was passed in.
population, split = getml.data.split.concat(
    "population", train=data_train, validate=data_validate, test=data_test)

...

# Hand the combined frame and its split column to the TimeSeries container.
time_series = getml.data.TimeSeries(
    population=population, split=split)

# Fit the pipeline on the training subset.
my_pipeline.fit(time_series.train)

Source code in getml/data/split/concat.py

def concat(name: str, **kwargs: DataFrame) -> Tuple[DataFrame, StringColumnView]:
    """
    Concatenates several data frames into one and produces a split
    column that keeps track of their origin.

    Args:
        name:
            The name of the data frame you would like to create.

        kwargs:
            The data frames (or views) you would like
            to concatenate, each passed under the name with which
            it should appear in the split column. They are stacked
            in the order in which they are passed.

    Returns:
        A tuple containing the concatenated data frame and the split column.

    Raises:
        ValueError:
            If `kwargs` is empty or contains anything other than
            data frames or views.

    ??? example
        A common use case for this functionality is [`TimeSeries`][getml.data.TimeSeries]:
        ```python
        data_train = getml.DataFrame.from_pandas(
            datatraining_pandas, name='data_train')

        data_validate = getml.DataFrame.from_pandas(
            datatest_pandas, name='data_validate')

        data_test = getml.DataFrame.from_pandas(
            datatest2_pandas, name='data_test')

        population, split = getml.data.split.concat(
            "population", train=data_train, validate=data_validate, test=data_test)

        ...

        time_series = getml.data.TimeSeries(
            population=population, split=split)

        my_pipeline.fit(time_series.train)
        ```
    """

    if not _is_non_empty_typed_list(list(kwargs.values()), [DataFrame, View]):
        raise ValueError(
            "'kwargs' must be non-empty and contain getml.DataFrames "
            + "or getml.data.Views."
        )

    # kwargs preserves the call order; the first entry seeds the result.
    first_name, *remaining_names = kwargs

    first = kwargs[first_name]

    # Work on a fresh data frame called `name`; a view has to be
    # materialized via to_df, a data frame is copied.
    population = first.copy(name) if isinstance(first, DataFrame) else first.to_df(name)

    # Start with every row labelled as the first set.
    split = from_value(first_name)

    assert isinstance(split, StringColumnView), "Should be a StringColumnView"

    for new_df_name in remaining_names:
        # Row ids are zero-based, so the rows about to be appended get the
        # ids nrows, nrows + 1, ... - the comparison must therefore include
        # nrows itself, otherwise the first appended row would keep the
        # label of the previous set. Each pass relabels the whole tail;
        # later passes overwrite the part that belongs to later sets.
        split = split.update(rowid() >= population.nrows(), new_df_name)  # type: ignore
        population = _concat(name, [population, kwargs[new_df_name]])

    # Cut the split column to the final length of the population.
    return population, split[: population.nrows()]  # type: ignore

random

random(
    seed: int = 5849,
    train: float = 0.8,
    test: float = 0.2,
    validation: float = 0,
    **kwargs: float
) -> StringColumnView

Returns a StringColumnView that can be used to randomly divide data into training, testing, validation or other sets.

PARAMETER	DESCRIPTION
`seed`	Seed used for the random number generator. TYPE: `int` DEFAULT: `5849`
`train`	The share of random samples assigned to the training set. TYPE: `float` DEFAULT: `0.8`
`validation`	The share of random samples assigned to the validation set. TYPE: `float` DEFAULT: `0`
`test`	The share of random samples assigned to the test set. TYPE: `float` DEFAULT: `0.2`
`kwargs`	Any other sets you would like to assign. You can name these sets whatever you want to (in our example, we called it 'other'). TYPE: `float` DEFAULT: `{}`

Example

# Assign a share to each of the four sets; the shares must add up to 1.
split = getml.data.split.random(
    train=0.8, test=0.1, validation=0.05, other=0.05
)

# Select each subset by comparing the split column with the set's name.
train_set = data_frame[split=='train']
validation_set = data_frame[split=='validation']
test_set = data_frame[split=='test']
other_set = data_frame[split=='other']

Source code in getml/data/split/random.py

def random(
    seed: int = 5849,
    train: float = 0.8,
    test: float = 0.2,
    validation: float = 0,
    **kwargs: float,
) -> StringColumnView:
    """
    Returns a [`StringColumnView`][getml.data.columns.StringColumnView] that
    can be used to randomly divide data into training, testing,
    validation or other sets.

    Args:
        seed:
            Seed used for the random number generator.

        train:
            The share of random samples assigned to
            the training set.

        validation:
            The share of random samples assigned to
            the validation set.

        test:
            The share of random samples assigned to
            the test set.

        kwargs:
            Any other sets you would like to assign.
            You can name these sets whatever you want to (in our example,
            we called it 'other').

    Returns:
        A string column view holding the name of the set each row
        is assigned to.

    Raises:
        ValueError:
            If any share is not a real number, if any share is negative,
            or if the shares do not add up to 1 (within a tolerance
            of 0.0001).

    ??? example
        ```python
        split = getml.data.split.random(
            train=0.8, test=0.1, validation=0.05, other=0.05
        )

        train_set = data_frame[split=='train']
        validation_set = data_frame[split=='validation']
        test_set = data_frame[split=='test']
        other_set = data_frame[split=='other']
        ```

    """

    # Keep `names` and `values` in the same order - they are paired up below.
    names = ["train", "validation", "test"] + list(kwargs.keys())
    values = np.asarray([train, validation, test] + list(kwargs.values()))

    if not _is_typed_list(values.tolist(), numbers.Real):
        raise ValueError("All values must be real numbers.")

    # A negative share would produce an interval whose lower bound lies
    # above its upper bound and silently distort the neighbouring sets,
    # even when the total still happens to be 1.
    if np.any(values < 0):
        raise ValueError(
            "'train', 'validation', 'test' and all other sets must be non-negative."
        )

    total = np.sum(values)

    if np.abs(total - 1.0) > 0.0001:
        raise ValueError(
            "'train', 'validation', 'test' and all other sets must add up to 1, "
            + "but add up to "
            + str(total)
            + "."
        )

    # Consecutive half-open intervals [lower, upper), one per set.
    # A share of 0 yields an empty interval, so no row is assigned to it.
    upper_bounds = np.cumsum(values)
    lower_bounds = upper_bounds - values

    col: StringColumnView = from_value("train")  # type: ignore

    assert isinstance(col, StringColumnView), "Should be a StringColumnView"

    for set_name, lower, upper in zip(names, lower_bounds, upper_bounds):
        # Both random columns are built from the same seed, so they are
        # expected to describe the same random numbers.
        col = col.update(  # type: ignore
            (random_col(seed=seed) >= lower)  # type: ignore
            & (random_col(seed=seed) < upper),
            set_name,
        )

    return col

time

time(
    population: DataFrame,
    time_stamp: Union[str, FloatColumn, FloatColumnView],
    validation: Optional[
        Union[float, int, datetime64]
    ] = None,
    test: Optional[Union[float, int, datetime64]] = None,
    **kwargs: Union[float, int, datetime64]
) -> StringColumnView

Returns a StringColumnView that can be used to divide data into training, testing, validation or other sets.

The arguments are key=value pairs of names (key) and starting points (value). The starting point defines the left endpoint of the subset. Intervals are left closed and right open, such that \([value, next value)\). The (unnamed) subset left from the first named starting point, i.e. \([0, first value)\), is always considered to be the training set.

PARAMETER	DESCRIPTION
`population`	The population table you would like to split. TYPE: `DataFrame`
`time_stamp`	The name of the time stamp column in the population table you want to use. Ideally, the role of said column would be `time_stamp`. If you want to split on the rowid, then pass "rowid" to `time_stamp`. TYPE: `Union[str, FloatColumn, FloatColumnView]`
`validation`	The start date of the validation set. TYPE: `Optional[Union[float, int, datetime64]]` DEFAULT: `None`
`test`	The start date of the test set. TYPE: `Optional[Union[float, int, datetime64]]` DEFAULT: `None`
`kwargs`	Any other sets you would like to assign. You can name these sets whatever you want to (in our example, we called it 'other'). TYPE: `Union[float, int, datetime64]` DEFAULT: `{}`

Example

# Starting points of the validation, test and 'other' sets.
validation_begin = getml.data.time.datetime(2010, 1, 1)
test_begin = getml.data.time.datetime(2011, 1, 1)
other_begin = getml.data.time.datetime(2012, 1, 1)

# The keyword order does not matter; the starting points are sorted internally.
split = getml.data.split.time(
    population=data_frame,
    time_stamp="ds",
    test=test_begin,
    validation=validation_begin,
    other=other_begin
)

# Contains all data before 2010-01-01 (not included)
train_set = data_frame[split=='train']

# Contains all data between 2010-01-01 (included) and 2011-01-01 (not included)
validation_set = data_frame[split=='validation']

# Contains all data between 2011-01-01 (included) and 2012-01-01 (not included)
test_set = data_frame[split=='test']

# Contains all data from 2012-01-01 (included) onwards
other_set = data_frame[split=='other']

Source code in getml/data/split/time.py

def time(
    population: DataFrame,
    time_stamp: Union[str, FloatColumn, FloatColumnView],
    validation: Optional[Union[float, int, np.datetime64]] = None,
    test: Optional[Union[float, int, np.datetime64]] = None,
    **kwargs: Union[float, int, np.datetime64],
) -> StringColumnView:
    """
    Returns a [`StringColumnView`][getml.data.columns.StringColumnView] that can be used to divide
    data into training, testing, validation or other sets.

    The arguments are
    `key=value` pairs of names (`key`) and starting points (`value`).
    The starting point defines the left endpoint of the subset. Intervals are left
    closed and right open, such that $[value, next value)$.  The (unnamed) subset
    left from the first named starting point, i.e.  $[0, first value)$, is always
    considered to be the training set.

    Args:
        population:
            The population table you would like to split.

        time_stamp:
            The name of the time stamp column in the population table
            you want to use. Ideally, the role of said column would be
            [`time_stamp`][getml.data.roles.time_stamp]. If you want to split on the rowid,
            then pass "rowid" to `time_stamp`.

        validation:
            The start date of the validation set.

        test:
            The start date of the test set.

        kwargs:
            Any other sets you would like to assign.
            You can name these sets whatever you want to (in our example,
            we called it 'other').

    Returns:
        A string column view holding the name of the set each row
        is assigned to.

    Raises:
        ValueError:
            If `population` is neither a data frame nor a view, if
            `time_stamp` has an unsupported type, if no starting point
            is supplied, or if a starting point is not a real number.

    ??? example
        ```python
        validation_begin = getml.data.time.datetime(2010, 1, 1)
        test_begin = getml.data.time.datetime(2011, 1, 1)
        other_begin = getml.data.time.datetime(2012, 1, 1)

        split = getml.data.split.time(
            population=data_frame,
            time_stamp="ds",
            test=test_begin,
            validation=validation_begin,
            other=other_begin
        )

        # Contains all data before 2010-01-01 (not included)
        train_set = data_frame[split=='train']

        # Contains all data between 2010-01-01 (included) and 2011-01-01 (not included)
        validation_set = data_frame[split=='validation']

        # Contains all data between 2011-01-01 (included) and 2012-01-01 (not included)
        test_set = data_frame[split=='test']

        # Contains all data from 2012-01-01 (included) onwards
        other_set = data_frame[split=='other']
        ```
    """
    if not isinstance(population, (DataFrame, View)):
        raise ValueError("'population' must be a DataFrame or a View.")

    if not isinstance(time_stamp, (str, FloatColumn, FloatColumnView)):
        raise ValueError(
            "'time_stamp' must be a string, a FloatColumn, or a FloatColumnView."
        )

    # Compare against None explicitly: 0 is a legitimate starting point
    # (e.g. rowid 0), and a truthiness test would treat it as "not supplied".
    if test is None and validation is None and not kwargs:
        raise ValueError("You have to supply at least one starting point.")

    defaults: Dict[str, Optional[Union[float, int, np.datetime64]]] = {
        "test": test,
        "validation": validation,
    }

    sets = {name: value for name, value in defaults.items() if value is not None}

    sets.update(kwargs)

    # Sort the starting points in ascending order and keep the names aligned.
    values = np.asarray(list(sets.values()))
    index = np.argsort(values)
    values = values[index]

    # NOTE(review): depending on its unit, tolist() turns an np.datetime64
    # into a datetime object rather than a number, which would fail this
    # check - confirm whether np.datetime64 starting points are meant to pass.
    if not _is_typed_list(values.tolist(), numbers.Real):
        raise ValueError("All values must be real numbers.")

    names = np.asarray(list(sets.keys()))
    names = names[index]

    if isinstance(time_stamp, str):
        time_stamp_col = (
            population[time_stamp] if time_stamp != "rowid" else population.rowid
        )
    else:
        time_stamp_col = time_stamp

    # Everything starts out as 'train' ...
    col: StringColumnView = from_value("train")  # type: ignore

    assert isinstance(col, StringColumnView), "Should be a StringColumnView"

    # ... and because the starting points are visited in ascending order,
    # each update overwrites the tail labelled by the previous one, which
    # yields the half-open intervals [value, next value).
    for set_name, start in zip(names, values):
        col = col.update(  # type: ignore
            time_stamp_col >= start,
            set_name,
        )

    return col