from numpy import number
from pandas import DataFrame, Series
from fairxai.data.descriptor.base_descriptor import BaseDatasetDescriptor
from fairxai.logger import logger
[docs]
class TabularDatasetDescriptor(BaseDatasetDescriptor):
    """
    Describe a tabular dataset by splitting its columns into categorical,
    ordinal, and numeric groups and computing summary statistics for each.

    All non-numeric columns must be declared **explicitly** through the
    `categorical_columns` and `ordinal_columns` parameters. A column that is
    neither declared there nor of a numeric dtype makes `describe()` raise a
    `ValueError`.

    Attributes:
        data (DataFrame): The tabular dataset being described.
        categorical_columns (list): Column names declared as categorical.
        ordinal_columns (list): Column names declared as ordinal.
        numeric (dict): Statistics per numeric column, filled by `describe()`.
        categorical (dict): Statistics per categorical column, filled by
            `describe()`.
        ordinal (dict): Statistics per ordinal column, filled by `describe()`.
        target_desc (dict | None): Target summary from the last `describe()`
            call, or None when no target was supplied.

    Methods:
        describe():
            Classifies every column and computes its summary statistics.
        get_numeric_columns():
            Returns the names of the described numeric columns.
        get_categorical_columns():
            Returns the names of the described categorical columns.
        get_ordinal_columns():
            Returns the names of the described ordinal columns.
    """

    def __init__(self, data: DataFrame, categorical_columns: list = None, ordinal_columns: list = None):
        """
        Store the dataset and the declared non-numeric columns.

        Args:
            data: The tabular dataset to describe.
            categorical_columns: Names of the categorical columns; None is
                treated as an empty list.
            ordinal_columns: Names of the ordinal columns; None is treated as
                an empty list.
        """
        super().__init__(data)
        self.data = data
        self.categorical_columns = categorical_columns or []
        self.ordinal_columns = ordinal_columns or []

        # Internal storage for column descriptions (populated by describe()).
        self.numeric = {}
        self.categorical = {}
        self.ordinal = {}
        self.target_desc = None  # Holds target statistics if provided

    def describe(self, target: Series = None, target_name: str = None) -> dict:
        """
        Compute column descriptors for numeric, categorical, and ordinal features.

        Args:
            target: Optional target column (Series). If provided, its summary
                is included under 'target' in the returned descriptor.
            target_name: Optional target column name. When omitted, the name
                of the `target` Series is used, or 'target' if it has none.

        Returns:
            dict: Descriptor dictionary including features and optional target.

        Raises:
            ValueError: If a column is neither declared as categorical/ordinal
                nor of a numeric dtype.
        """
        df = self.data

        # Rebind fresh dicts instead of clearing in place: descriptors handed
        # out by an earlier call keep their content.
        self.numeric = {}
        self.categorical = {}
        self.ordinal = {}
        self.target_desc = None

        try:
            for index, feature in enumerate(df.columns):
                col_type = self._get_column_type(feature, df)
                if col_type == 'categorical':
                    self.categorical[feature] = self._create_categorical_description(df[feature], index)
                elif col_type == 'ordinal':
                    self.ordinal[feature] = self._create_categorical_description(df[feature], index)
                else:
                    self.numeric[feature] = self._create_numeric_description(df[feature], index)
        except ValueError as e:
            logger.error(f"Error during column type determination: {e}")
            raise

        # --- Add target description if provided ---
        if target is not None:
            self.target_desc = self._create_target_description(target, target_name)

        return self._as_dict()

    def _get_column_type(self, feature: str, df: DataFrame) -> str:
        """
        Determine the type of a column (categorical, ordinal, or numeric).

        Explicit declarations win over the dtype: a column listed in
        `categorical_columns` or `ordinal_columns` is reported as such even if
        its dtype is numeric. Categorical takes precedence over ordinal.

        Args:
            feature: Column name.
            df: DataFrame containing the data.

        Returns:
            One of "categorical", "ordinal", or "numeric".

        Raises:
            ValueError: If the column is not declared and is not numeric.
        """
        if feature in self.categorical_columns:
            return "categorical"
        if feature in self.ordinal_columns:
            return "ordinal"

        # dtype selection does not depend on the rows, so probe a zero-row
        # slice: same rule as on the full frame, but no column data is copied.
        numeric_names = df.iloc[:0].select_dtypes(include=number).columns.tolist()
        if feature in numeric_names:
            return "numeric"

        raise ValueError(f"Unknown column type for column '{feature}'.")

    def _create_categorical_description(self, series: Series, index: int) -> dict:
        """
        Create the description for a categorical or ordinal column.

        Args:
            series: Pandas series containing the column data.
            index: Column index in the DataFrame.

        Returns:
            Dictionary with the column index, the distinct values in order of
            first appearance, and the number of occurrences of each value.
        """
        unique_values = series.unique()

        # Missing values (NaN/None/NaT/pd.NA) never compare equal to anything,
        # so `series == x` would always report 0 for them. Count them through
        # isna() instead.
        missing_flags = Series(unique_values).isna().tolist()
        missing_total = int(series.isna().sum())

        counts = {}
        for value, is_missing in zip(unique_values, missing_flags):
            if is_missing:
                counts[value] = missing_total
            else:
                counts[value] = int((series == value).sum())

        return {
            'index': index,
            'distinct_values': list(unique_values),
            'count': counts
        }

    def _create_numeric_description(self, series: Series, index: int) -> dict:
        """
        Create the description for a numeric column.

        Args:
            series: Pandas series containing the column data.
            index: Column index in the DataFrame.

        Returns:
            Dictionary with the column index and its min, max, mean, standard
            deviation, median, and first/third quartiles.
        """
        return {
            'index': index,
            'min': series.min(),
            'max': series.max(),
            'mean': series.mean(),
            'std': series.std(),
            'median': series.median(),
            'q1': series.quantile(0.25),
            'q3': series.quantile(0.75)
        }

    def _create_target_description(self, target: Series, target_name: str = None) -> dict:
        """
        Create the description of the target column.

        The target is summarised like a categorical column and indexed right
        after the last feature column of `self.data`.

        Args:
            target: Series holding the target values.
            target_name: Name to file the target under. Falls back to the
                Series name, then to 'target'.

        Returns:
            Single-entry dictionary mapping the target name to its statistics.
        """
        if target_name is None:
            target_name = target.name if target.name is not None else 'target'
        return {
            target_name: self._create_categorical_description(target, len(self.data.columns))
        }

    # --- Utility methods ---

    def get_numeric_columns(self):
        """
        Returns the names of numeric columns.

        Only columns processed by the last `describe()` call are reported.

        Returns:
            list: A list containing the names of numeric columns.
        """
        return list(self.numeric.keys())

    def get_categorical_columns(self):
        """
        Returns the list of categorical column names.

        Only columns processed by the last `describe()` call are reported.

        Returns:
            list: A list containing the names of categorical columns.
        """
        return list(self.categorical.keys())

    def get_ordinal_columns(self):
        """
        Retrieves the list of ordinal column names.

        Only columns processed by the last `describe()` call are reported.

        Returns:
            list: A list of column names corresponding to ordinal data.
        """
        return list(self.ordinal.keys())

    def _as_dict(self, target: Series = None, target_name: str = None):
        """
        Convert the dataset descriptor to a dict.

        Args:
            target: Optional target series. When given, its summary is
                computed on the fly and placed under 'target' (the stored
                `target_desc` is left untouched).
            target_name: Name of the target column; falls back to the Series
                name, then to 'target'.

        Returns:
            dict: The 'numeric', 'categorical', and 'ordinal' descriptions,
            plus 'target' when a target is passed here or was stored by
            `describe()`.
        """
        descriptor = {
            'numeric': self.numeric,
            'categorical': self.categorical,
            'ordinal': self.ordinal
        }

        # Target info is optional; an explicit argument wins over stored state.
        if target is not None:
            descriptor['target'] = self._create_target_description(target, target_name)
        elif self.target_desc is not None:
            descriptor['target'] = self.target_desc

        return descriptor