Source code for fairxai.data.descriptor.text_descriptor

from .base_descriptor import BaseDatasetDescriptor


class TextDatasetDescriptor(BaseDatasetDescriptor):
    """
    Descriptor for text datasets that analyzes and describes textual data.

    This class extends BaseDatasetDescriptor to provide specific functionality
    for text-based datasets, supporting both raw text strings and dictionary
    formats.
    """

    def describe(self) -> dict:
        """
        Analyze the text dataset and return a dictionary describing it.

        The input format is detected from the first element of ``self.data``
        only; the remaining elements are assumed to share that format.
        ``self.data`` must support ``len()``, indexing with ``0`` and
        iteration (it is presumably set by the base class, which is not
        visible here).

        Returns:
            dict: A dictionary containing:
                - type: Always "text"
                - n_documents: Total number of documents
                - input_format: Either "dict" or "raw_text"
                - structure (dict format): Keys of the first document
                - has_timestamp (dict format): Whether the first document
                  has a "timestamp" key
                - avg_length_words (raw text format): Mean number of
                  whitespace-separated words per document

        Raises:
            ValueError: If the dataset is empty
            TypeError: If the data format is not supported (first element
                is neither a string nor a dict), or if a raw-text dataset
                contains an element that cannot be split into words
        """
        # Retrieve data and count documents
        data = self.data
        n_docs = len(data)

        # Check that the dataset is not empty
        if n_docs == 0:
            raise ValueError("Empty dataset")

        # Analyze the first element to determine the format
        sample = data[0]
        desc = {"type": "text", "n_documents": n_docs}

        if isinstance(sample, dict):
            # Handle dictionary format: metadata comes from the first
            # document only, other documents are not inspected.
            keys = list(sample)
            desc.update({
                "input_format": "dict",
                "structure": keys,  # List of keys in the dictionary
                "has_timestamp": "timestamp" in keys,  # Whether timestamp field exists
            })
        elif isinstance(sample, str):
            # Handle raw text format: average length in words across all
            # documents. Only the first element was type-checked above, so a
            # later non-text element would otherwise surface as a bare
            # AttributeError from ``.split()``; report it as the documented
            # TypeError instead, keeping the original error as the cause.
            try:
                total_words = sum(len(text.split()) for text in data)
            except AttributeError as exc:
                raise TypeError(
                    "Unsupported text format (use string or dict)"
                ) from exc
            desc.update({
                "input_format": "raw_text",
                "avg_length_words": total_words / n_docs,
            })
        else:
            # Reject unsupported formats
            raise TypeError("Unsupported text format (use string or dict)")

        return desc