Skip to content

Feature Definitions

FeatureDefinitions(perfdb)

Class used for handling feature definitions. Can be accessed via perfdb.features.definitions.

Parameters:

  • perfdb

    (PerfDB) –

    Top level object carrying all functionality and the connection handler.

Source code in echo_postgres/feature_definitions.py
Python
def __init__(self, perfdb: e_pg.PerfDB) -> None:
    """Handler class for feature definitions, reachable through `perfdb.features.definitions`.

    Parameters
    ----------
    perfdb : PerfDB
        Top level object carrying all functionality and the connection handler.
    """
    # imported at call time instead of module level
    # NOTE(review): presumably done to avoid a circular import - confirm
    from .feature_definition_attributes import FeatureDefinitionAttributes

    super().__init__(perfdb)

    # * subclasses
    # handler for the attributes of the feature definitions, built on the same perfdb
    self.attributes = FeatureDefinitionAttributes(perfdb)

get(object_names=None, object_models=None, feature_names=None, data_source_types=None, attributes=None, filter_type='and', regex_feature_names=False, get_attributes=False, attribute_names=None, output_type='DataFrame')

Gets all feature definitions.

The most useful keys/columns returned are:

  • display_name
  • description
  • object_model_name
  • data_source_type_name
  • name_in_data_source
  • id_in_data_source

Parameters:

  • object_names

    (list[str] | None, default: None ) –

    List of object names to filter the results. If set, will add the models of these objects to the object_models filter. By default None

  • object_models

    (list[str] | None, default: None ) –

    List of object model names to filter the results. By default None

  • feature_names

    (list[str] | None, default: None ) –

    List of feature names to filter the results. By default None

  • data_source_types

    (list[str] | None, default: None ) –

    List of data source type names to filter the results. By default None

  • attributes

    (dict[str, Any] | None, default: None ) –

    Dictionary with the attributes to filter the results. It must be in the format {attribute_name: attribute_value, ...} By default None

  • filter_type

    (Literal['and', 'or'], default: 'and' ) –

    How to treat multiple filters. Can be one of ["and", "or"]. By default "and"

  • regex_feature_names

    (bool, default: False ) –

    If True, will treat the feature_names as regex patterns.

    For example, if feature_names=['^active.*', '^reactive.*'], it will return all features that start with 'active' or 'reactive'.

    By default False

  • get_attributes

    (bool, default: False ) –

    If True, will also get the attributes of the features. It's highly recommended to specify the attribute names to get to speed up the query.

    By default False

  • attribute_names

    (list[str] | None, default: None ) –

    List of attribute names to get if get_attributes is True. In case get_attributes is False, this parameter is ignored. If None, all attributes are returned.

    By default None

  • output_type

    (Literal['dict', 'DataFrame', 'pl.DataFrame'], default: 'DataFrame' ) –

    Output type of the data. Can be one of ["dict", "DataFrame", "pl.DataFrame"]. By default "DataFrame"

Returns:

  • DataFrame

    If output_type is "DataFrame", returns a pandas DataFrame with the following format: index=MultiIndex[object_model_name, name], columns=[attribute, ...]

  • DataFrame

    If output_type is "pl.DataFrame", returns a Polars DataFrame

  • dict[str, dict[str, dict[str, Any]]]

    If output_type is "dict", returns a dictionary with the following format: {object_model_name: {feature_name: {attribute: value, ...}, ...}, ...}

Source code in echo_postgres/feature_definitions.py
Python
@validate_call
def get(
    self,
    object_names: list[str] | None = None,
    object_models: list[str] | None = None,
    feature_names: list[str] | None = None,
    data_source_types: list[str] | None = None,
    attributes: dict[str, Any] | None = None,
    filter_type: Literal["and", "or"] = "and",
    regex_feature_names: bool = False,
    get_attributes: bool = False,
    attribute_names: list[str] | None = None,
    output_type: Literal["dict", "DataFrame", "pl.DataFrame"] = "DataFrame",
) -> pd.DataFrame | pl.DataFrame | dict[str, dict[str, dict[str, Any]]]:
    """Gets all feature definitions.

    Rows are read from the `performance.v_features` view, ordered by object model name and feature name.

    The most useful keys/columns returned are:

    - display_name
    - description
    - object_model_name
    - data_source_type_name
    - name_in_data_source
    - id_in_data_source

    Parameters
    ----------
    object_names : list[str] | None, optional
        Object names used to filter the results. If set, the models of these objects are added to the object_models filter.
        By default None
    object_models : list[str] | None, optional
        Object model names used to filter the results.
        By default None
    feature_names : list[str] | None, optional
        Feature names used to filter the results.
        By default None
    data_source_types : list[str] | None, optional
        Data source type names used to filter the results.
        By default None
    attributes : dict[str, Any] | None, optional
        Attributes used to filter the results, in the format {attribute_name: attribute_value, ...}
        By default None
    filter_type : Literal["and", "or"], optional
        How multiple filters are combined. Can be one of ["and", "or"].
        By default "and"
    regex_feature_names : bool, optional
        If True, the entries of feature_names are treated as regex patterns.

        For example, if feature_names=[`'^active.*'`, `'^reactive.*'`], all features that start with 'active' or 'reactive' are returned.

        By default False
    get_attributes : bool, optional
        If True, the attributes of the features are fetched as well. It's highly recommended to specify the attribute
        names to get to speed up the query.

        By default False
    attribute_names : list[str] | None, optional
        Attribute names to get when get_attributes is True. Ignored when get_attributes is False. If None, all attributes are returned.

        By default None
    output_type : Literal["dict", "DataFrame", "pl.DataFrame"], optional
        Output type of the data. Can be one of ["dict", "DataFrame", "pl.DataFrame"].
        By default "DataFrame"

    Returns
    -------
    pd.DataFrame
        If output_type is "DataFrame", returns a pandas DataFrame with the following format: index=MultiIndex[object_model_name, name], columns=[attribute, ...]
    pl.DataFrame
        If output_type is "pl.DataFrame", returns a Polars DataFrame
    dict[str, dict[str, dict[str, Any]]]
        If output_type is "dict", returns a dictionary with the following format: {object_model_name: {feature_name: {attribute: value, ...}, ...}, ...}
    """
    # columns that identify a feature definition; used to pivot and join the attributes
    key_cols = ["object_model_name", "name"]

    # validating the filters; the result is appended to the query only when it is truthy
    where_clause = self._check_get_args(
        object_names=object_names,
        object_models=object_models,
        feature_names=feature_names,
        data_source_types=data_source_types,
        attributes=attributes,
        filter_type=filter_type,
        regex_feature_names=regex_feature_names,
    )

    # assembling and running the query for the feature definitions
    select_clause = sql.SQL("SELECT * FROM performance.v_features ")
    order_clause = sql.SQL(" ORDER BY object_model_name, name")
    query = sql.Composed(
        [select_clause, where_clause, order_clause] if where_clause else [select_clause, order_clause],
    )
    definitions = self._perfdb.conn.read_to_polars(query, schema_overrides=self._cols_schema)

    if get_attributes:
        # one attributes request per model found, restricted to the features that model returned
        per_model_attrs = [
            self._perfdb.features.definitions.attributes.get(
                object_models=[model_name],
                feature_names=definitions.filter(pl.col("object_model_name") == model_name)["name"].to_list(),
                attribute_names=attribute_names,
                output_type="pl.DataFrame",
                values_only=True,
            )
            for model_name in definitions["object_model_name"].unique().to_list()
        ]
        if per_model_attrs:
            # stack all models, align the feature column name with the main frame and
            # spread the attributes into one column per attribute name
            wide_attrs = (
                pl.concat(per_model_attrs)
                .rename({"feature_name": "name"})
                .pivot(
                    index=key_cols,
                    on="attribute_name",
                    values="attribute_value",
                )
            )
            # left join so features without attributes are kept
            definitions = definitions.join(wide_attrs, on=key_cols, how="left")

    return convert_output(
        definitions,
        output_type,
        index_col=["object_model_name", "name"],
        drop_id_cols=True,
        nest_by_index=True,
    )

get_details(object_name)

Gets where (which table) each feature of an object is located in the database.

The most useful keys/columns returned are:

  • id
  • name
  • table_name
  • column_name

Parameters:

  • object_name

    (str) –

    Name of the desired object.

Returns:

  • DataFrame

    DataFrame where index is the name of the features.

Source code in echo_postgres/feature_definitions.py
Python
@validate_call
def get_details(self, object_name: str) -> pd.DataFrame:
    """Gets where (which table) each feature of an object is located in the database.

    The data comes from the database function `performance.fn_get_object_features_and_tables`.

    The most useful keys/columns returned are:

    - id
    - name
    - table_name
    - column_name

    Parameters
    ----------
    object_name : str
        Name of the desired object.

    Returns
    -------
    pd.DataFrame
        DataFrame where index is the name of the features.
    """
    # the object name is passed as a SQL literal so it is quoted by the driver
    details_query = sql.SQL("SELECT * FROM performance.fn_get_object_features_and_tables({object_name});").format(
        object_name=sql.Literal(object_name),
    )
    features = self._perfdb.conn.read_to_polars(details_query)

    # converting to pandas (pyarrow-backed columns) and indexing by feature name
    return features.to_pandas(use_pyarrow_extension_array=True).set_index("name")

get_ids(object_names=None, object_models=None, feature_names=None, data_source_types=None, attributes=None, filter_type='and', regex_feature_names=False)

Gets all feature definitions and their respective ids.

Parameters:

  • object_names

    (list[str] | None, default: None ) –

    List of object names to filter the results. If set, will add the models of these objects to the object_models filter. By default None

  • object_models

    (list[str] | None, default: None ) –

    List of object model names to filter the results. By default None

  • feature_names

    (list[str] | None, default: None ) –

    List of feature names to filter the results. By default None

  • data_source_types

    (list[str] | None, default: None ) –

    List of data source type names to filter the results. By default None

  • attributes

    (dict[str, Any] | None, default: None ) –

    Dictionary with the attributes to filter the results. It must be in the format {attribute_name: attribute_value, ...} By default None

  • filter_type

    (Literal['and', 'or'], default: 'and' ) –

    How to treat multiple filters. Can be one of ["and", "or"]. By default "and"

  • regex_feature_names

    (bool, default: False ) –

    If True, will treat the feature_names as regex patterns.

    For example, if feature_names=['^active.*', '^reactive.*'], it will return all features that start with 'active' or 'reactive'.

    By default False

Returns:

  • dict[str, dict[str, int]]

    Dictionary with the following format: {object_model_name: {feature_name: feature_id, ...}, ...}

Source code in echo_postgres/feature_definitions.py
Python
@validate_call
def get_ids(
    self,
    object_names: list[str] | None = None,
    object_models: list[str] | None = None,
    feature_names: list[str] | None = None,
    data_source_types: list[str] | None = None,
    attributes: dict[str, Any] | None = None,
    filter_type: Literal["and", "or"] = "and",
    regex_feature_names: bool = False,
) -> dict[str, dict[str, int]]:
    """Gets all feature definitions and their respective ids.

    Only the model name, feature name and id are read from the `performance.v_features` view.

    Parameters
    ----------
    object_names : list[str] | None, optional
        Object names used to filter the results. If set, the models of these objects are added to the object_models filter.
        By default None
    object_models : list[str] | None, optional
        Object model names used to filter the results.
        By default None
    feature_names : list[str] | None, optional
        Feature names used to filter the results.
        By default None
    data_source_types : list[str] | None, optional
        Data source type names used to filter the results.
        By default None
    attributes : dict[str, Any] | None, optional
        Attributes used to filter the results, in the format {attribute_name: attribute_value, ...}
        By default None
    filter_type : Literal["and", "or"], optional
        How multiple filters are combined. Can be one of ["and", "or"].
        By default "and"
    regex_feature_names : bool, optional
        If True, the entries of feature_names are treated as regex patterns.

        For example, if feature_names=[`'^active.*'`, `'^reactive.*'`], all features that start with 'active' or 'reactive' are returned.

        By default False

    Returns
    -------
    dict[str, dict[str, int]]
        Dictionary with the following format:
        {object_model_name: {feature_name: feature_id, ...}, ...}
    """
    # validating the filters; the result is appended to the query only when it is truthy
    where_clause = self._check_get_args(
        object_names=object_names,
        object_models=object_models,
        feature_names=feature_names,
        data_source_types=data_source_types,
        attributes=attributes,
        filter_type=filter_type,
        regex_feature_names=regex_feature_names,
    )

    # assembling and running the query for the feature ids
    select_clause = sql.SQL("SELECT object_model_name, name, id FROM performance.v_features ")
    order_clause = sql.SQL(" ORDER BY object_model_name, name")
    query = sql.Composed(
        [select_clause, where_clause, order_clause] if where_clause else [select_clause, order_clause],
    )
    rows_df = self._perfdb.conn.read_to_polars(query, schema_overrides=self._cols_schema)

    # nesting the ids as {model: {feature: id}}
    ids_by_model = {}
    for record in rows_df.iter_rows(named=True):
        model_ids = ids_by_model.setdefault(record["object_model_name"], {})
        model_ids[record["name"]] = record["id"]

    return ids_by_model