Feature Definitions¶
FeatureDefinitions(perfdb)
¶
Class used for handling feature definitions. Can be accessed via perfdb.features.definitions.
Parameters:
-
(perfdb¶PerfDB) –Top level object carrying all functionality and the connection handler.
Source code in echo_postgres/feature_definitions.py
def __init__(self, perfdb: e_pg.PerfDB) -> None:
    """Handler for feature definitions, reachable through `perfdb.features.definitions`.

    Parameters
    ----------
    perfdb : PerfDB
        Top level object carrying all functionality and the connection handler.
    """
    # imported at call time (function scope) instead of at module level
    from .feature_definition_attributes import FeatureDefinitionAttributes

    super().__init__(perfdb)

    # * subclasses
    # exposes the attribute handler as `<this object>.attributes`
    self.attributes = FeatureDefinitionAttributes(perfdb)
get(object_names=None, object_models=None, feature_names=None, data_source_types=None, attributes=None, filter_type='and', regex_feature_names=False, get_attributes=False, attribute_names=None, output_type='DataFrame')
¶
Gets all feature definitions.
The most useful keys/columns returned are:
- display_name
- description
- object_model_name
- data_source_type_name
- name_in_data_source
- id_in_data_source
Parameters:
-
(object_names¶list[str] | None, default:None) –List of object names to filter the results. If set, will add the models of these objects to the object_models filter. By default None
-
(object_models¶list[str] | None, default:None) –List of object model names to filter the results. By default None
-
(feature_names¶list[str] | None, default:None) –List of feature names to filter the results. By default None
-
(data_source_types¶list[str] | None, default:None) –List of data source type names to filter the results. By default None
-
(attributes¶dict[str, Any] | None, default:None) –Dictionary with the attributes to filter the results. It must be in the format {attribute_name: attribute_value, ...} By default None
-
(filter_type¶Literal['and', 'or'], default:'and') –How to treat multiple filters. Can be one of ["and", "or"]. By default "and"
-
(regex_feature_names¶bool, default:False) –If True, will treat the feature_names as regex patterns.
For example, if feature_names=['^active.*', '^reactive.*'], it will return all features that start with 'active' or 'reactive'.
By default False
-
(get_attributes¶bool, default:False) –If True, will also get the attributes of the features. It's highly recommended to specify the attribute names to get to speed up the query.
By default False
-
(attribute_names¶list[str] | None, default:None) –List of attribute names to get if get_attributes is True. In case get_attributes is False, this parameter is ignored. If None, all attributes are returned.
By default None
-
(output_type¶Literal['dict', 'DataFrame', 'pl.DataFrame'], default:'DataFrame') –Output type of the data. Can be one of ["dict", "DataFrame", "pl.DataFrame"] By default "DataFrame"
Returns:
-
pd.DataFrame – If output_type is "DataFrame", returns a pandas DataFrame with the following format: index=MultiIndex[object_model_name, name], columns=[attribute, ...]
-
pl.DataFrame – If output_type is "pl.DataFrame", returns a Polars DataFrame
-
dict[str, dict[str, dict[str, Any]]]–If output_type is "dict", returns a dictionary with the following format: {object_model_name: {feature_name: {attribute: value, ...}, ...}, ...}
Source code in echo_postgres/feature_definitions.py
@validate_call
def get(
    self,
    object_names: list[str] | None = None,
    object_models: list[str] | None = None,
    feature_names: list[str] | None = None,
    data_source_types: list[str] | None = None,
    attributes: dict[str, Any] | None = None,
    filter_type: Literal["and", "or"] = "and",
    regex_feature_names: bool = False,
    get_attributes: bool = False,
    attribute_names: list[str] | None = None,
    output_type: Literal["dict", "DataFrame", "pl.DataFrame"] = "DataFrame",
) -> pd.DataFrame | pl.DataFrame | dict[str, dict[str, dict[str, Any]]]:
    """Fetch feature definitions from `performance.v_features`, optionally with their attributes.

    The most useful keys/columns returned are:

    - display_name
    - description
    - object_model_name
    - data_source_type_name
    - name_in_data_source
    - id_in_data_source

    Parameters
    ----------
    object_names : list[str] | None, optional
        Object names used as a filter. If set, the models of these objects are added to the object_models filter.
        By default None
    object_models : list[str] | None, optional
        Object model names used as a filter.
        By default None
    feature_names : list[str] | None, optional
        Feature names used as a filter.
        By default None
    data_source_types : list[str] | None, optional
        Data source type names used as a filter.
        By default None
    attributes : dict[str, Any] | None, optional
        Attributes used as a filter, in the format {attribute_name: attribute_value, ...}
        By default None
    filter_type : Literal["and", "or"], optional
        How multiple filters are combined. Can be one of ["and", "or"].
        By default "and"
    regex_feature_names : bool, optional
        If True, the entries of feature_names are treated as regex patterns.
        For example, `feature_names=['^active.*', '^reactive.*']` returns all features that start
        with 'active' or 'reactive'.
        By default False
    get_attributes : bool, optional
        If True, the attributes of the features are fetched as well. Specifying attribute_names is
        highly recommended to speed up the query.
        By default False
    attribute_names : list[str] | None, optional
        Attribute names to fetch when get_attributes is True; ignored when get_attributes is False.
        If None, all attributes are returned.
        By default None
    output_type : Literal["dict", "DataFrame", "pl.DataFrame"], optional
        Output type of the data. Can be one of ["dict", "DataFrame", "pl.DataFrame"]
        By default "DataFrame"

    Returns
    -------
    pd.DataFrame
        If output_type is "DataFrame", a pandas DataFrame with the following format:
        index=MultiIndex[object_model_name, name], columns=[attribute, ...]
    pl.DataFrame
        If output_type is "pl.DataFrame", a Polars DataFrame
    dict[str, dict[str, dict[str, Any]]]
        If output_type is "dict", a dictionary with the following format:
        {object_model_name: {feature_name: {attribute: value, ...}, ...}, ...}
    """
    # validate the filters and turn them into a WHERE clause (falsy when there is nothing to filter)
    where = self._check_get_args(
        object_names=object_names,
        object_models=object_models,
        feature_names=feature_names,
        data_source_types=data_source_types,
        attributes=attributes,
        filter_type=filter_type,
        regex_feature_names=regex_feature_names,
    )

    # assemble SELECT [+ WHERE] + ORDER BY
    query_parts = [sql.SQL("SELECT * FROM performance.v_features ")]
    if where:
        query_parts.append(where)
    query_parts.append(sql.SQL(" ORDER BY object_model_name, name"))

    df = self._perfdb.conn.read_to_polars(sql.Composed(query_parts), schema_overrides=self._cols_schema)

    if get_attributes:
        # one attribute lookup per model, restricted to the features found for that model
        model_names = df["object_model_name"].unique().to_list()
        attr_frames = [
            self._perfdb.features.definitions.attributes.get(
                object_models=[model_name],
                feature_names=df.filter(pl.col("object_model_name") == model_name)["name"].to_list(),
                attribute_names=attribute_names,
                output_type="pl.DataFrame",
                values_only=True,
            )
            for model_name in model_names
        ]

        if attr_frames:
            # stack all models, align the key column name with the main frame,
            # then spread attributes into one column per attribute name
            wide_attrs = (
                pl.concat(attr_frames)
                .rename({"feature_name": "name"})
                .pivot(
                    index=["object_model_name", "name"],
                    on="attribute_name",
                    values="attribute_value",
                )
            )
            # left join keeps features that have no attributes
            df = df.join(wide_attrs, on=["object_model_name", "name"], how="left")

    return convert_output(
        df,
        output_type,
        index_col=["object_model_name", "name"],
        drop_id_cols=True,
        nest_by_index=True,
    )
get_details(object_name)
¶
Gets where (which table) each feature of an object is located in the database.
The most useful keys/columns returned are:
- id
- name
- table_name
- column_name
Parameters:
-
(object_name¶str) –Name of the desired object.
Returns:
-
DataFrame–DataFrame where index is the name of the features.
Source code in echo_postgres/feature_definitions.py
@validate_call
def get_details(self, object_name: str) -> pd.DataFrame:
    """Look up in which table and column each feature of an object is stored in the database.

    The most useful keys/columns returned are:

    - id
    - name
    - table_name
    - column_name

    Parameters
    ----------
    object_name : str
        Name of the desired object.

    Returns
    -------
    pd.DataFrame
        DataFrame where index is the name of the features.
    """
    # the object name is injected as an SQL literal, not concatenated into the query text
    details_query = sql.SQL("SELECT * FROM performance.fn_get_object_features_and_tables({object_name});").format(
        object_name=sql.Literal(object_name),
    )
    details = self._perfdb.conn.read_to_polars(details_query)

    # convert to pandas and index the result by feature name
    return details.to_pandas(use_pyarrow_extension_array=True).set_index("name")
get_ids(object_names=None, object_models=None, feature_names=None, data_source_types=None, attributes=None, filter_type='and', regex_feature_names=False)
¶
Gets all feature definitions and their respective ids.
Parameters:
-
(object_names¶list[str] | None, default:None) –List of object names to filter the results. If set, will add the models of these objects to the object_models filter. By default None
-
(object_models¶list[str] | None, default:None) –List of object model names to filter the results. By default None
-
(feature_names¶list[str] | None, default:None) –List of feature names to filter the results. By default None
-
(data_source_types¶list[str] | None, default:None) –List of data source type names to filter the results. By default None
-
(attributes¶dict[str, Any] | None, default:None) –Dictionary with the attributes to filter the results. It must be in the format {attribute_name: attribute_value, ...} By default None
-
(filter_type¶Literal['and', 'or'], default:'and') –How to treat multiple filters. Can be one of ["and", "or"]. By default "and"
-
(regex_feature_names¶bool, default:False) –If True, will treat the feature_names as regex patterns.
For example, if feature_names=['^active.*', '^reactive.*'], it will return all features that start with 'active' or 'reactive'.
By default False
Returns:
-
dict[str, dict[str, int]]–Dictionary with the following format: {object_model_name: {feature_name: feature_id, ...}, ...}
Source code in echo_postgres/feature_definitions.py
@validate_call
def get_ids(
    self,
    object_names: list[str] | None = None,
    object_models: list[str] | None = None,
    feature_names: list[str] | None = None,
    data_source_types: list[str] | None = None,
    attributes: dict[str, Any] | None = None,
    filter_type: Literal["and", "or"] = "and",
    regex_feature_names: bool = False,
) -> dict[str, dict[str, int]]:
    """Fetch the ids of the feature definitions, grouped by object model.

    Parameters
    ----------
    object_names : list[str] | None, optional
        Object names used as a filter. If set, the models of these objects are added to the object_models filter.
        By default None
    object_models : list[str] | None, optional
        Object model names used as a filter.
        By default None
    feature_names : list[str] | None, optional
        Feature names used as a filter.
        By default None
    data_source_types : list[str] | None, optional
        Data source type names used as a filter.
        By default None
    attributes : dict[str, Any] | None, optional
        Attributes used as a filter, in the format {attribute_name: attribute_value, ...}
        By default None
    filter_type : Literal["and", "or"], optional
        How multiple filters are combined. Can be one of ["and", "or"].
        By default "and"
    regex_feature_names : bool, optional
        If True, the entries of feature_names are treated as regex patterns.
        For example, `feature_names=['^active.*', '^reactive.*']` returns all features that start
        with 'active' or 'reactive'.
        By default False

    Returns
    -------
    dict[str, dict[str, int]]
        Dictionary with the following format:
        {object_model_name: {feature_name: feature_id, ...}, ...}
    """
    # validate the filters and turn them into a WHERE clause (falsy when there is nothing to filter)
    where = self._check_get_args(
        object_names=object_names,
        object_models=object_models,
        feature_names=feature_names,
        data_source_types=data_source_types,
        attributes=attributes,
        filter_type=filter_type,
        regex_feature_names=regex_feature_names,
    )

    # SELECT [+ WHERE] + ORDER BY, composed in one go
    select_clause = sql.SQL("SELECT object_model_name, name, id FROM performance.v_features ")
    order_clause = sql.SQL(" ORDER BY object_model_name, name")
    statement = sql.Composed([select_clause, where, order_clause] if where else [select_clause, order_clause])

    df = self._perfdb.conn.read_to_polars(statement, schema_overrides=self._cols_schema)

    # fold the rows into {model: {feature: id}}
    ids_by_model: dict[str, dict[str, int]] = {}
    for record in df.iter_rows(named=True):
        model_ids = ids_by_model.setdefault(record["object_model_name"], {})
        model_ids[record["name"]] = record["id"]
    return ids_by_model