Skip to content

getml.data.Roles

Roles dataclass

Roles(
    categorical: Iterable[str] = tuple(),
    join_key: Iterable[str] = tuple(),
    numerical: Iterable[str] = tuple(),
    target: Iterable[str] = tuple(),
    text: Iterable[str] = tuple(),
    time_stamp: Iterable[str] = tuple(),
    unused_float: Iterable[str] = tuple(),
    unused_string: Iterable[str] = tuple(),
)

Roles can be passed to DataFrame to predefine the roles assigned to certain columns.

ATTRIBUTE DESCRIPTION
categorical

Names of the categorical columns.

TYPE: Iterable[str]

join_key

Names of the join key columns.

TYPE: Iterable[str]

numerical

Names of the numerical columns.

TYPE: Iterable[str]

target

Names of the target columns.

TYPE: Iterable[str]

text

Names of the text columns.

TYPE: Iterable[str]

time_stamp

Names of the time stamp columns.

TYPE: Iterable[str]

unused_float

Names of the unused float columns.

TYPE: Iterable[str]

unused_string

Names of the unused string columns.

TYPE: Iterable[str]

Example
import getml

# Predefine which columns take which role before any data is loaded.
roles = getml.data.Roles(
    categorical=["col1", "col2"], target=["col3"]
)

# Hand the roles object to the DataFrame constructor via `roles`.
df_expd = getml.data.DataFrame.from_csv(
    fnames=["file1.csv", "file2.csv"],
    name="MY DATA FRAME",
    sep=';',
    quotechar='"',
    roles=roles
)

columns property

columns: Tuple[str, ...]

The names of all columns contained in the roles object.

RETURNS DESCRIPTION
Tuple[str, ...]

The names of all columns.

unused property

unused: List[str]

Names of all unused columns (unused_float + unused_string).

RETURNS DESCRIPTION
List[str]

A list of column names that are categorized as unused, combining both float and string types.

column

column(colname: str) -> Role

Gets the role of a column by its column name.

PARAMETER DESCRIPTION
colname

The name of the column.

TYPE: str

RETURNS DESCRIPTION
Role

The role of the column as a string.

Source code in getml/data/roles/container.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def column(self, colname: str) -> Role:
    """
    Looks up the role that a given column has been assigned to.

    Args:
        colname:
            The name of the column.

    Returns:
        The role of the column as a string.

    Raises:
        ValueError:
            If none of the roles lists a column named `colname`.
    """
    # Roles are scanned in iteration order; the first role whose columns
    # contain `colname` is the answer.
    owner = next((role for role in self if colname in self[role]), None)
    if owner is None:
        raise ValueError("Column named '" + colname + "' not found.")
    return owner

from_dict classmethod

from_dict(
    roles_dict: Mapping[Union[Role, str], Iterable[str]]
) -> Roles

Creates a roles object from a dictionary.

PARAMETER DESCRIPTION
roles_dict

A dictionary where keys are role names and values are lists of column names.

TYPE: Mapping[Union[Role, str], Iterable[str]]

RETURNS DESCRIPTION
Roles

A roles object.

Source code in getml/data/roles/container.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
@classmethod
def from_dict(cls, roles_dict: Mapping[Union[Role, str], Iterable[str]]) -> Roles:
    """
    Builds a roles object out of a role-to-columns dictionary.

    Args:
        roles_dict:
            A dictionary where keys are role names and values are lists of column names.

    Returns:
        A roles object.

    Raises:
        ValueError:
            If a key of `roles_dict` is not contained in `roles_sets.all_`.
    """
    columns_by_role: Dict[Role, List[str]] = {}
    for role_name, column_names in roles_dict.items():
        # Reject unknown role names before they are forwarded as keyword
        # arguments to the constructor.
        if role_name not in roles_sets.all_:
            raise ValueError(
                INVALID_ROLE_ERROR_MESSAGE_TEMPLATE.format(candidate_role=role_name)
            )
        # Materialise the iterable so every role holds its own list.
        columns_by_role[role_name] = list(column_names)

    return cls(**columns_by_role)
from_mapping classmethod

from_mapping(roles_mapping: Mapping[str, Role]) -> Roles

Creates a roles object from a mapping of column names to roles.

PARAMETER DESCRIPTION
roles_mapping

A dictionary where keys are column names and values are role names.

TYPE: Mapping[str, Role]

RETURNS DESCRIPTION
Roles

A roles object.

Source code in getml/data/roles/container.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
@classmethod
def from_mapping(cls, roles_mapping: Mapping[str, Role]) -> Roles:
    """
    Creates a roles object from a mapping of column names to roles.

    Args:
        roles_mapping:
            A dictionary where keys are column names and values are role names.

    Returns:
        A roles object.

    Raises:
        ValueError:
            If `roles_mapping` contains a role name that `from_dict`
            rejects as invalid.
    """
    # Start with an empty bucket for every field of the dataclass so that
    # roles without any column are still passed on explicitly.
    roles: Dict[Role, List[str]] = {
        cast(Role, field.name): [] for field in fields(cls)
    }
    for column, role in roles_mapping.items():
        # A role that is not a field gets its own bucket rather than
        # failing here with a bare KeyError; `from_dict` then performs the
        # role validation and reports the offending name.
        roles.setdefault(role, []).append(column)
    return cls.from_dict(roles)

infer

infer(colname: str) -> Role

Infers the role of a column by its name.

PARAMETER DESCRIPTION
colname

The name of the column to be inferred.

TYPE: str

RETURNS DESCRIPTION
Role

The role of the column as a string.

Source code in getml/data/roles/container.py
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
def infer(self, colname: str) -> Role:
    """
    Infers the role of a column by its name.

    Deprecated: emits a `DeprecationWarning` and then delegates to
    `column`.

    Args:
        colname:
            The name of the column to be inferred.

    Returns:
        The role of the column as a string.
    """
    warnings.warn(
        "The 'infer' method is deprecated and will be removed in a future "
        "release. To get a specific column's role, use 'column' instead.",
        DeprecationWarning,
        # Attribute the warning to the caller of `infer`, not to this
        # method, so it points at the code that needs to be migrated.
        stacklevel=2,
    )
    return self.column(colname)

to_dict

to_dict() -> Dict[Role, List[str]]

Expresses the roles object as a dictionary.

RETURNS DESCRIPTION
Dict[Role, List[str]]

A dictionary where keys are role names and values are lists of column names.

Source code in getml/data/roles/container.py
213
214
215
216
217
218
219
220
def to_dict(self) -> Dict[Role, List[str]]:
    """
    Converts the roles object into a plain dictionary.

    Returns:
        A dictionary where keys are role names and values are lists of column names.
    """
    as_dict: Dict[Role, List[str]] = {}
    for role in self:
        # Copy the columns into a fresh list for every role.
        as_dict[role] = list(self[role])
    return as_dict

to_list

to_list() -> List[Role]

Returns a list containing the roles, without the corresponding column names.

RETURNS DESCRIPTION
List[Role]

A list where each element is a role name, repeated by the number of columns in that role.

Source code in getml/data/roles/container.py
222
223
224
225
226
227
228
229
230
def to_list(self) -> List[Role]:
    """
    Lists the roles only, dropping the corresponding column names.

    Returns:
        A list where each element is a role name, repeated by the number of columns in that role.
    """
    flattened: List[Role] = []
    for role in self:
        # One entry per column held by this role.
        flattened.extend(role for _ in self[role])
    return flattened

to_mapping

to_mapping() -> Dict[str, Role]

Maps column names to their roles.

RETURNS DESCRIPTION
Dict[str, Role]

A dictionary where keys are column names and values are role names.

Source code in getml/data/roles/container.py
232
233
234
235
236
237
238
239
def to_mapping(self) -> Dict[str, Role]:
    """
    Builds the column-to-role view of the roles object.

    Returns:
        A dictionary where keys are column names and values are role names.
    """
    role_by_column: Dict[str, Role] = {}
    for role in self:
        # Every column listed under this role points back to it.
        role_by_column.update(dict.fromkeys(self[role], role))
    return role_by_column

update

update(other: Roles) -> Roles

Merges the roles of two roles objects.

PARAMETER DESCRIPTION
other

The roles object to be merged with the current one.

TYPE: Roles

RETURNS DESCRIPTION
Roles

A new roles object containing the merged roles.

Source code in getml/data/roles/container.py
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
def update(self, other: Roles) -> Roles:
    """
    Combines this roles object with another one.

    Both objects are expressed as column-to-role mappings; where a column
    appears in both, the role from `other` replaces the current one.

    Args:
        other:
            The roles object to be merged with the current one.

    Returns:
        A new roles object containing the merged roles.
    """
    merged: dict[str, Role] = self.to_mapping()
    # Entries of `other` override entries of `self` for shared columns.
    merged.update(other.to_mapping())
    return Roles.from_mapping(merged)

validate

validate() -> None

Checks if the roles are consistent.

RAISES DESCRIPTION
ValueError

If the roles are inconsistent, i.e. the same column name is listed more than once across the roles.

Source code in getml/data/roles/container.py
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
def validate(self) -> None:
    """
    Verifies that the roles are well-formed and consistent.

    Raises:
        TypeError:
            If `_is_iterable_not_str_of_type` rejects the value stored
            for a role, i.e. it is not an iterable of column names.
        ValueError:
            If a column name is listed more than once, so that it would
            end up with duplicate roles.
    """
    # Maps every column name seen so far to the role it was found under.
    role_by_column = {}

    for role in self:
        columns = self[role]
        if not _is_iterable_not_str_of_type(columns, type=str):
            raise TypeError(
                f"Argument for '{role}' must be an iterable of column names "
                "(strings): Iterable[str]."
            )

        for column in columns:
            previous_role = role_by_column.get(column)
            if previous_role is not None:
                raise ValueError(
                    f"Column names must be unique across all roles. Found "
                    f"duplicate roles set for column '{column}': '{role}' and "
                    f"'{previous_role}'."
                )
            role_by_column[column] = role