Source code for lore_sa.dataset.tabular_dataset

from lore_sa.dataset import Dataset

from lore_sa.logger import logger
import pandas as pd
from pandas import DataFrame
import numpy as np

__all__ = ["TabularDataset","Dataset"]


[docs]class TabularDataset(Dataset): """ It provides an interface to handle datasets, including some essential information on the structure and semantic of the dataset. Attributes: df (pandas.DataFrame): dataframe containing the whole dataset descriptor (dict): it contains the essential informationregarding each feature. Format: >>> {'numeric': {'feature name' : { 'index' : <index of feature column>, 'min' : <min value>, 'max' : <max value>, 'mean': <mean value>, 'std': <standard deviation>, 'median': <median value>, 'q1': <first quartile of the distribution>, 'q3': <third quartile of the distribution, }, ..., ..., }, 'categorical: {'feature name': { 'index' : <index of feature column>, 'distinct_values' : <distinct categorical values>, 'value_counts' : {'distinct value' : <elements count>, ... } } }, ... ... ... } """
[docs] def __init__(self,data: DataFrame, class_name:str = None): self.class_name = class_name self.df = data #target columns forced to be the last column of the dataset if class_name is not None: self.df = self.df[[x for x in self.df.columns if x != class_name] + [class_name]] self.descriptor = {'numeric':{}, 'categorical':{}} #creation of a default version of descriptor self.update_descriptor()
[docs] def update_descriptor(self): """ it creates the dataset descriptor dictionary """ self.descriptor = {'numeric':{}, 'categorical':{}} for feature in self.df.columns: index = self.df.columns.get_loc(feature) if feature in self.df.select_dtypes(include=np.number).columns.tolist(): #numerical desc = {'index': index, 'min' : self.df[feature].min(), 'max' : self.df[feature].max(), 'mean':self.df[feature].mean(), 'std':self.df[feature].std(), 'median':self.df[feature].median(), 'q1':self.df[feature].quantile(0.25), 'q3':self.df[feature].quantile(0.75), } self.descriptor['numeric'][feature] = desc else: #categorical feature desc = {'index': index, 'distinct_values' : list(self.df[feature].unique()), 'count' : {x : len(self.df[self.df[feature] == x]) for x in list(self.df[feature].unique())}} self.descriptor['categorical'][feature] = desc self.descriptor = self.set_target_label(self.descriptor)
[docs] def set_target_label(self, descriptor): """ Set the target column into the dataset descriptor :param descriptor: :return: """ if self.class_name is None: logger.warning("No target class is defined") return descriptor for type in descriptor: for k in descriptor[type]: if k == self.class_name: descriptor['target'] = {k:descriptor[type][k]} descriptor[type].pop(k) return descriptor return descriptor
[docs] @classmethod def from_csv(cls, filename: str, class_name: str=None): """ Read a comma-separated values (csv) file into Dataset object. :param [str] filename: :param class_name: optional :return: """ df = pd.read_csv(filename, skipinitialspace=True, na_values='?', keep_default_na=True) dataset_obj = cls(df, class_name=class_name) dataset_obj.filename = filename logger.info('{0} file imported'.format(filename)) return dataset_obj
[docs] @classmethod def from_dict(cls, data: dict, class_name: str=None): """ From dicts of Series, arrays, or dicts. :param [dict] data: :param class_name: optional :return: """ return cls(pd.DataFrame(data), class_name=class_name)
[docs] def set_class_name(self,class_name: str): """ Set the class name. Only the column name string :param [str] class_name: :return: """ self.class_name = class_name
[docs] def get_class_values(self): """ return the list of values of the target column :return: """ if self.class_name is None: raise Exception("ERR: class_name is None. Set class_name with set_class_name('<column name>')") return self.descriptor['target'][self.class_name]['distinct_values']
def get_numeric_columns(self): numeric_columns = list(self.df._get_numeric_data().columns) return numeric_columns def get_features_names(self): return list(self.df.columns)