Source code for fairxai.data.descriptor.text_descriptor

from .base_descriptor import BaseDatasetDescriptor



[docs]
class TextDatasetDescriptor(BaseDatasetDescriptor):
    """
    Descriptor for text datasets that analyzes and describes textual data.

    This class extends BaseDatasetDescriptor to provide specific functionality
    for text-based datasets. Two input formats are supported: a collection of
    raw text strings, or a collection of dictionaries (one per document).

    The format is inferred by ``describe`` from the first element of
    ``self.data`` only, so the dataset is expected to be homogeneous.
    """


[docs]
    def describe(self) -> dict:
        """
        Analyzes the text dataset and returns a dictionary with descriptive information.

        Returns:
            dict: A dictionary containing:
                - type: Always "text"
                - n_documents: Total number of documents
                - input_format: Either "dict" or "raw_text"
                - Additional format-specific metadata

        Raises:
            ValueError: If the dataset is empty
            TypeError: If the data format is not supported (not string or dict)
        """
        # Retrieve data and count documents
        data = self.data
        n_docs = len(data)

        # Check that the dataset is not empty
        if n_docs == 0:
            raise ValueError("Empty dataset")

        # Analyze the first element to determine the format
        sample = data[0]
        desc = {"type": "text", "n_documents": n_docs}

        # Handle dictionary format
        if isinstance(sample, dict):
            keys = list(sample.keys())
            has_timestamp = "timestamp" in keys
            desc.update({
                "input_format": "dict",
                "structure": keys,  # List of keys in the dictionary
                "has_timestamp": has_timestamp  # Whether timestamp field exists
            })

        # Handle raw text format
        elif isinstance(sample, str):
            # Calculate the average length in words across all documents
            avg_len = sum(len(t.split()) for t in data) / n_docs
            desc.update({
                "input_format": "raw_text",
                "avg_length_words": avg_len
            })

        # Reject unsupported formats
        else:
            raise TypeError("Unsupported text format (use string or dict)")

        return desc