getml.data.split

Splits data into training, testing, validation, or other sets.

concat

concat(
    name: str, **kwargs: DataFrame
) -> Tuple[DataFrame, StringColumnView]

Concatenates several data frames into one and produces a split column that keeps track of their origin.

PARAMETER DESCRIPTION
name

The name of the data frame you would like to create.

TYPE: str

kwargs

The data frames you would like to concatenate, passed under the names by which they should appear in the split column.

TYPE: DataFrame DEFAULT: {}

RETURNS DESCRIPTION
Tuple[DataFrame, StringColumnView]

A tuple containing the concatenated data frame and the split column.

Example

A common use case for this functionality is TimeSeries:

data_train = getml.DataFrame.from_pandas(
    datatraining_pandas, name='data_train')

data_validate = getml.DataFrame.from_pandas(
    datatest_pandas, name='data_validate')

data_test = getml.DataFrame.from_pandas(
    datatest2_pandas, name='data_test')

population, split = getml.data.split.concat(
    "population", train=data_train, validate=data_validate, test=data_test)

...

time_series = getml.data.TimeSeries(
    population=population, split=split)

my_pipeline.fit(time_series.train)
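
The returned split column can also be used for plain subsetting, just like the splits produced by `random` and `time` below. A minimal sketch, reusing the `population` and `split` objects from the example above:

```python
# The split column holds the keyword names passed to concat
# ("train", "validate", "test"), so filtering on it recovers
# each original subset of the concatenated data frame.
train_set = population[split == "train"]
validation_set = population[split == "validate"]
test_set = population[split == "test"]
```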

Source code in getml/data/split/concat.py
def concat(name: str, **kwargs: DataFrame) -> Tuple[DataFrame, StringColumnView]:
    """
    Concatenates several data frames into one and produces a split
    column that keeps track of their origin.

    Args:
        name:
            The name of the data frame you would like to create.

        kwargs:
            The data frames you would like to concatenate,
            passed under the names by which they should appear
            in the split column.

    Returns:
        A tuple containing the concatenated data frame and the split column.

    ??? example
        A common use case for this functionality is [`TimeSeries`][getml.data.TimeSeries]:
        ```python
        data_train = getml.DataFrame.from_pandas(
            datatraining_pandas, name='data_train')

        data_validate = getml.DataFrame.from_pandas(
            datatest_pandas, name='data_validate')

        data_test = getml.DataFrame.from_pandas(
            datatest2_pandas, name='data_test')

        population, split = getml.data.split.concat(
            "population", train=data_train, validate=data_validate, test=data_test)

        ...

        time_series = getml.data.TimeSeries(
            population=population, split=split)

        my_pipeline.fit(time_series.train)
        ```
    """

    if not _is_non_empty_typed_list(list(kwargs.values()), [DataFrame, View]):
        raise ValueError(
            "'kwargs' must be non-empty and contain getml.DataFrames "
            + "or getml.data.Views."
        )

    names = list(kwargs.keys())

    first = kwargs[names[0]]

    population = first.copy(name) if isinstance(first, DataFrame) else first.to_df(name)

    split = from_value(names[0])

    assert isinstance(split, StringColumnView), "Should be a StringColumnView"

    for new_df_name in names[1:]:
        split = split.update(rowid() > population.nrows(), new_df_name)  # type: ignore
        population = _concat(name, [population, kwargs[new_df_name]])

    return population, split[: population.nrows()]  # type: ignore

random

random(
    seed: int = 5849,
    train: float = 0.8,
    test: float = 0.2,
    validation: float = 0,
    **kwargs: float
) -> StringColumnView

Returns a StringColumnView that can be used to randomly divide data into training, testing, validation or other sets.

PARAMETER DESCRIPTION
seed

Seed used for the random number generator.

TYPE: int DEFAULT: 5849

train

The share of random samples assigned to the training set.

TYPE: float DEFAULT: 0.8

validation

The share of random samples assigned to the validation set.

TYPE: float DEFAULT: 0

test

The share of random samples assigned to the test set.

TYPE: float DEFAULT: 0.2

kwargs

Any other sets you would like to assign. You can name these sets whatever you want (in our example, we called the additional set 'other').

TYPE: float DEFAULT: {}

Example
split = getml.data.split.random(
    train=0.8, test=0.1, validation=0.05, other=0.05
)

train_set = data_frame[split=='train']
validation_set = data_frame[split=='validation']
test_set = data_frame[split=='test']
other_set = data_frame[split=='other']
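
Because the assignment is driven by a seeded random number generator, the realized set sizes only approximate the requested shares, and the same seed should reproduce the same assignment. A minimal sketch of checking the realized shares, assuming the data_frame from the example above and that the subsets are materialized via to_pandas():

```python
split = getml.data.split.random(
    seed=5849, train=0.8, test=0.1, validation=0.05, other=0.05
)

# Materialize each subset and compare its share of rows with the
# requested proportion; the counts will only match approximately.
total = len(data_frame.to_pandas())
for subset in ["train", "validation", "test", "other"]:
    share = len(data_frame[split == subset].to_pandas()) / total
    print(subset, round(share, 3))
```
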
Source code in getml/data/split/random.py
def random(
    seed: int = 5849,
    train: float = 0.8,
    test: float = 0.2,
    validation: float = 0,
    **kwargs: float,
) -> StringColumnView:
    """
    Returns a [`StringColumnView`][getml.data.columns.StringColumnView] that
    can be used to randomly divide data into training, testing,
    validation or other sets.

    Args:
        seed:
            Seed used for the random number generator.

        train:
            The share of random samples assigned to
            the training set.

        validation:
            The share of random samples assigned to
            the validation set.

        test:
            The share of random samples assigned to
            the test set.

        kwargs:
            Any other sets you would like to assign.
            You can name these sets whatever you want to (in our example,
            we called it 'other').

    ??? example
        ```python
        split = getml.data.split.random(
            train=0.8, test=0.1, validation=0.05, other=0.05
        )

        train_set = data_frame[split=='train']
        validation_set = data_frame[split=='validation']
        test_set = data_frame[split=='test']
        other_set = data_frame[split=='other']
        ```

    """

    values = np.asarray([train, validation, test] + list(kwargs.values()))

    if not _is_typed_list(values.tolist(), numbers.Real):
        raise ValueError("All values must be real numbers.")

    if np.abs(np.sum(values) - 1.0) > 0.0001:
        raise ValueError(
            "'train', 'validation', 'test' and all other sets must add up to 1, "
            + "but add up to "
            + str(np.sum(values))
            + "."
        )

    upper_bounds = np.cumsum(values)
    lower_bounds = upper_bounds - values

    names = ["train", "validation", "test"] + list(kwargs.keys())

    col: StringColumnView = from_value("train")  # type: ignore

    assert isinstance(col, StringColumnView), "Should be a StringColumnView"

    for i in range(len(names)):
        col = col.update(  # type: ignore
            (random_col(seed=seed) >= lower_bounds[i])  # type: ignore
            & (random_col(seed=seed) < upper_bounds[i]),
            names[i],
        )

    return col

time

time(
    population: DataFrame,
    time_stamp: Union[str, FloatColumn, FloatColumnView],
    validation: Optional[
        Union[float, int, datetime64]
    ] = None,
    test: Optional[Union[float, int, datetime64]] = None,
    **kwargs: Union[float, int, datetime64]
) -> StringColumnView

Returns a StringColumnView that can be used to divide data into training, testing, validation or other sets.

The arguments are key=value pairs of names (key) and starting points (value). The starting point defines the left endpoint of the subset. Intervals are left closed and right open, such that [value, next value). The (unnamed) subset left of the first named starting point, i.e. [0, first value), is always considered to be the training set.

PARAMETER DESCRIPTION
population

The population table you would like to split.

TYPE: DataFrame

time_stamp

The name of the time stamp column in the population table you want to use. Ideally, the role of said column would be time_stamp. If you want to split on the rowid, then pass "rowid" to time_stamp.

TYPE: Union[str, FloatColumn, FloatColumnView]

validation

The start date of the validation set.

TYPE: Optional[Union[float, int, datetime64]] DEFAULT: None

test

The start date of the test set.

TYPE: Optional[Union[float, int, datetime64]] DEFAULT: None

kwargs

Any other sets you would like to assign. You can name these sets whatever you want (in our example, we called the additional set 'other').

TYPE: Union[float, int, datetime64] DEFAULT: {}

Example
validation_begin = getml.data.time.datetime(2010, 1, 1)
test_begin = getml.data.time.datetime(2011, 1, 1)
other_begin = getml.data.time.datetime(2012, 1, 1)

split = getml.data.split.time(
    population=data_frame,
    time_stamp="ds",
    test=test_begin,
    validation=validation_begin,
    other=other_begin
)

# Contains all data before 2010-01-01 (not included)
train_set = data_frame[split=='train']

# Contains all data between 2010-01-01 (included) and 2011-01-01 (not included)
validation_set = data_frame[split=='validation']

# Contains all data between 2011-01-01 (included) and 2012-01-01 (not included)
test_set = data_frame[split=='test']

# Contains all data after 2012-01-01 (included)
other_set = data_frame[split=='other']
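
As noted above, passing "rowid" as time_stamp splits by row position instead of a time stamp column. A minimal sketch, assuming a data_frame with a few thousand rows; the cut-off values 2000 and 2500 are purely illustrative:

```python
# Rows with rowid below 2000 form the training set, rows in
# [2000, 2500) the validation set, and rows from 2500 onward
# the test set (intervals are left closed and right open).
split = getml.data.split.time(
    population=data_frame,
    time_stamp="rowid",
    validation=2000,
    test=2500,
)

train_set = data_frame[split == "train"]
validation_set = data_frame[split == "validation"]
test_set = data_frame[split == "test"]
```
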
Source code in getml/data/split/time.py
def time(
    population: DataFrame,
    time_stamp: Union[str, FloatColumn, FloatColumnView],
    validation: Optional[Union[float, int, np.datetime64]] = None,
    test: Optional[Union[float, int, np.datetime64]] = None,
    **kwargs: Union[float, int, np.datetime64],
) -> StringColumnView:
    """
    Returns a [`StringColumnView`][getml.data.columns.StringColumnView] that can be used to divide
    data into training, testing, validation or other sets.

    The arguments are
    `key=value` pairs of names (`key`) and starting points (`value`).
    The starting point defines the left endpoint of the subset. Intervals are left
    closed and right open, such that $[value, next value)$.  The (unnamed) subset
    left from the first named starting point, i.e.  $[0, first value)$, is always
    considered to be the training set.

    Args:
        population:
            The population table you would like to split.

        time_stamp:
            The name of the time stamp column in the population table
            you want to use. Ideally, the role of said column would be
            [`time_stamp`][getml.data.roles.time_stamp]. If you want to split on the rowid,
            then pass "rowid" to `time_stamp`.

        validation:
            The start date of the validation set.

        test:
            The start date of the test set.

        kwargs:
            Any other sets you would like to assign.
            You can name these sets whatever you want to (in our example,
            we called it 'other').

    ??? example
        ```python
        validation_begin = getml.data.time.datetime(2010, 1, 1)
        test_begin = getml.data.time.datetime(2011, 1, 1)
        other_begin = getml.data.time.datetime(2012, 1, 1)

        split = getml.data.split.time(
            population=data_frame,
            time_stamp="ds",
            test=test_begin,
            validation=validation_begin,
            other=other_begin
        )

        # Contains all data before 2010-01-01 (not included)
        train_set = data_frame[split=='train']

        # Contains all data between 2010-01-01 (included) and 2011-01-01 (not included)
        validation_set = data_frame[split=='validation']

        # Contains all data between 2011-01-01 (included) and 2012-01-01 (not included)
        test_set = data_frame[split=='test']

        # Contains all data after 2012-01-01 (included)
        other_set = data_frame[split=='other']
        ```
    """
    if not isinstance(population, (DataFrame, View)):
        raise ValueError("'population' must be a DataFrame or a View.")

    if not isinstance(time_stamp, (str, FloatColumn, FloatColumnView)):
        raise ValueError(
            "'time_stamp' must be a string, a FloatColumn, or a FloatColumnView."
        )

    if not test and not validation and not kwargs:
        raise ValueError("You have to supply at least one starting point.")

    defaults: Dict[str, Optional[Union[float, int, np.datetime64]]] = {
        "test": test,
        "validation": validation,
    }

    sets = {name: value for name, value in defaults.items() if value is not None}

    sets.update({**kwargs})

    values = np.asarray(list(sets.values()))
    index = np.argsort(values)
    values = values[index]

    if not _is_typed_list(values.tolist(), numbers.Real):
        raise ValueError("All values must be real numbers.")

    names = np.asarray(list(sets.keys()))
    names = names[index]

    if isinstance(time_stamp, str):
        time_stamp_col = (
            population[time_stamp] if time_stamp != "rowid" else population.rowid
        )
    else:
        time_stamp_col = time_stamp

    col: StringColumnView = from_value("train")  # type: ignore

    assert isinstance(col, StringColumnView), "Should be a StringColumnView"

    for i in range(len(names)):
        col = col.update(  # type: ignore
            time_stamp_col >= values[i],
            names[i],
        )

    return col