Skip to content

Config Module

Configuration and customization for the textxtract package.

Classes:

Name Description
ExtractorConfig

Enhanced configuration options for text extraction with validation.

Classes

ExtractorConfig

Enhanced configuration options for text extraction with validation.

Methods:

Name Description
__init__
__repr__
from_file

Load configuration from a file (JSON, YAML, or TOML).

get_handler

Retrieve a handler for a given file extension.

get_handler_config

Get configuration specific to a handler.

register_handler

Register a custom file type handler.

to_dict

Convert configuration to dictionary.

Attributes:

Name Type Description
custom_handlers
encoding
extra_config
logging_format
logging_level
max_file_size
max_memory_usage
timeout
Source code in textxtract/core/config.py
class ExtractorConfig:
    """Enhanced configuration options for text extraction with validation.

    Values come from the constructor arguments and are then overridden by the
    ``TEXT_EXTRACTOR_ENCODING``, ``TEXT_EXTRACTOR_LOG_LEVEL``,
    ``TEXT_EXTRACTOR_TIMEOUT`` and ``TEXT_EXTRACTOR_MAX_FILE_SIZE`` environment
    variables when those are set. Any extra keyword arguments are kept in
    ``extra_config`` and can be read per handler via ``get_handler_config``.
    """

    def __init__(
        self,
        encoding: str = "utf-8",
        logging_level: str = "INFO",
        logging_format: Optional[str] = None,
        timeout: Optional[float] = None,
        max_file_size: Optional[int] = None,
        max_memory_usage: Optional[int] = None,
        custom_handlers: Optional[Dict[str, Callable]] = None,
        **kwargs,
    ):
        """Validate and store the options, then apply environment overrides.

        Raises:
            ValueError: If ``encoding``, ``logging_level``, ``timeout`` or
                ``max_file_size`` fails validation.
        """
        # Validate and set basic options
        self.encoding = self._validate_encoding(encoding)
        self.logging_level = self._validate_logging_level(logging_level)
        self.logging_format = (
            logging_format or "%(asctime)s %(levelname)s %(name)s: %(message)s"
        )
        self.timeout = self._validate_timeout(timeout)
        self.max_file_size = self._validate_max_file_size(max_file_size)
        self.max_memory_usage = max_memory_usage
        self.custom_handlers = custom_handlers or {}

        # Environment variables take precedence over constructor arguments
        self._load_from_env()

        # Store additional kwargs for handler-specific config
        self.extra_config = kwargs

    def _validate_encoding(self, encoding: str) -> str:
        """Return ``encoding`` if it names a usable text codec.

        Raises:
            ValueError: If ``encoding`` is not a string or cannot be used to
                encode text.
        """
        if not isinstance(encoding, str):
            raise ValueError("Encoding must be a string")

        # Encoding a probe string is the simplest way to prove the codec exists
        try:
            "test".encode(encoding)
        except LookupError as exc:
            raise ValueError(f"Invalid encoding: {encoding}") from exc

        return encoding

    def _validate_logging_level(self, level: str) -> str:
        """Return the upper-cased logging level name.

        Raises:
            ValueError: If ``level`` is not a string or is not one of the
                standard level names.
        """
        valid_levels = {"DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"}
        # Reject non-strings explicitly instead of failing on ``.upper()``
        if not isinstance(level, str) or level.upper() not in valid_levels:
            raise ValueError(
                f"Invalid logging level: {level}. Must be one of {valid_levels}"
            )
        return level.upper()

    def _validate_timeout(self, timeout: Optional[float]) -> Optional[float]:
        """Return ``timeout`` unchanged if it is ``None`` or a positive number.

        Raises:
            ValueError: If ``timeout`` is not a number or is not positive.
        """
        if timeout is not None:
            if not isinstance(timeout, (int, float)) or timeout <= 0:
                raise ValueError("Timeout must be a positive number")
        return timeout

    def _validate_max_file_size(self, size: Optional[int]) -> Optional[int]:
        """Return ``size`` unchanged if it is ``None`` or a positive integer.

        Raises:
            ValueError: If ``size`` is not an integer or is not positive.
        """
        if size is not None:
            if not isinstance(size, int) or size <= 0:
                raise ValueError("Max file size must be a positive integer")
        return size

    def _load_from_env(self):
        """Override options with ``TEXT_EXTRACTOR_*`` environment variables.

        An invalid encoding or log level raises ``ValueError``. An unparsable
        or out-of-range timeout / max file size is ignored, leaving the value
        already set on the instance untouched.
        """
        env_encoding = os.getenv("TEXT_EXTRACTOR_ENCODING")
        if env_encoding:
            self.encoding = self._validate_encoding(env_encoding)

        env_logging = os.getenv("TEXT_EXTRACTOR_LOG_LEVEL")
        if env_logging:
            self.logging_level = self._validate_logging_level(env_logging)

        env_timeout = os.getenv("TEXT_EXTRACTOR_TIMEOUT")
        if env_timeout:
            try:
                # Run the same validator as the constructor so that e.g. "-5"
                # cannot sneak in through the environment.
                self.timeout = self._validate_timeout(float(env_timeout))
            except ValueError:
                pass  # Ignore invalid values

        env_max_size = os.getenv("TEXT_EXTRACTOR_MAX_FILE_SIZE")
        if env_max_size:
            try:
                self.max_file_size = self._validate_max_file_size(int(env_max_size))
            except ValueError:
                pass  # Ignore invalid values

    def register_handler(self, extension: str, handler: Callable):
        """Register a custom file type handler.

        The extension is stored lower-cased and with a leading dot, so
        ``"PDF"`` and ``".pdf"`` register the same key.
        """
        if not extension.startswith("."):
            extension = f".{extension}"
        self.custom_handlers[extension.lower()] = handler

    def get_handler(self, extension: str) -> Optional[Callable]:
        """Retrieve a handler for a given file extension.

        The lookup is case-insensitive. The extension is tried exactly as
        given first; if that misses and it has no leading dot, the dotted form
        is tried as well so that it matches keys written by
        ``register_handler``.

        Returns:
            The registered handler, or ``None`` if there is none.
        """
        key = extension.lower()
        handler = self.custom_handlers.get(key)
        if handler is None and not key.startswith("."):
            handler = self.custom_handlers.get(f".{key}")
        return handler

    def get_handler_config(self, handler_name: str) -> Dict[str, Any]:
        """Get configuration specific to a handler.

        Returns:
            A new dict with ``encoding``, ``timeout``, ``max_file_size`` and
            ``max_memory_usage``, updated with the ``<handler_name>_config``
            entry of ``extra_config`` when one exists (name lower-cased).
        """
        base_config = {
            "encoding": self.encoding,
            "timeout": self.timeout,
            "max_file_size": self.max_file_size,
            "max_memory_usage": self.max_memory_usage,
        }

        # Add handler-specific config
        handler_config_key = f"{handler_name.lower()}_config"
        if handler_config_key in self.extra_config:
            base_config.update(self.extra_config[handler_config_key])

        return base_config

    def to_dict(self) -> Dict[str, Any]:
        """Convert configuration to dictionary.

        Handlers are represented by ``str(handler)``; the entries of
        ``extra_config`` are merged in at the top level.
        """
        return {
            "encoding": self.encoding,
            "logging_level": self.logging_level,
            "logging_format": self.logging_format,
            "timeout": self.timeout,
            "max_file_size": self.max_file_size,
            "max_memory_usage": self.max_memory_usage,
            "custom_handlers": {k: str(v) for k, v in self.custom_handlers.items()},
            **self.extra_config,
        }

    @classmethod
    def from_file(cls, config_path: Union[str, Path]) -> "ExtractorConfig":
        """Load configuration from a file (JSON, YAML, or TOML).

        The format is chosen from the file suffix (case-insensitive). The file
        is read as UTF-8. An empty document (parsed as ``None``) yields a
        default configuration.

        Raises:
            FileNotFoundError: If ``config_path`` does not exist.
            ImportError: If the parser for a YAML or TOML file is not installed.
            ValueError: If the suffix is not a supported format.
            TypeError: If the top level of the document is not a mapping.
        """
        config_path = Path(config_path)

        if not config_path.exists():
            raise FileNotFoundError(f"Configuration file not found: {config_path}")

        # Read as UTF-8 explicitly; the platform default encoding is
        # locale-dependent.
        content = config_path.read_text(encoding="utf-8")
        suffix = config_path.suffix.lower()

        if suffix == ".json":
            import json

            config_data = json.loads(content)
        elif suffix in (".yaml", ".yml"):
            # Only the import is guarded, so parser errors are not mistaken
            # for a missing dependency.
            try:
                import yaml
            except ImportError as exc:
                raise ImportError(
                    "PyYAML is required to load YAML configuration files"
                ) from exc

            config_data = yaml.safe_load(content)
        elif suffix == ".toml":
            # Prefer the stdlib parser (Python 3.11+), fall back to tomli.
            try:
                import tomllib as toml_parser
            except ImportError:
                try:
                    import tomli as toml_parser
                except ImportError as exc:
                    raise ImportError(
                        "tomli is required to load TOML configuration files"
                    ) from exc

            config_data = toml_parser.loads(content)
        else:
            raise ValueError(
                f"Unsupported configuration file format: {config_path.suffix}"
            )

        if config_data is None:
            # An empty YAML document parses to None; treat it as "no options".
            config_data = {}
        if not isinstance(config_data, dict):
            raise TypeError(
                "Configuration file must contain a mapping at the top level, "
                f"got {type(config_data).__name__}: {config_path}"
            )

        return cls(**config_data)

    def __repr__(self) -> str:
        """Return a short summary showing encoding, logging level and timeout."""
        return f"ExtractorConfig(encoding='{self.encoding}', logging_level='{self.logging_level}', timeout={self.timeout})"

Attributes

custom_handlers instance-attribute
custom_handlers = custom_handlers or {}
encoding instance-attribute
encoding = _validate_encoding(encoding)
extra_config instance-attribute
extra_config = kwargs
logging_format instance-attribute
logging_format = logging_format or '%(asctime)s %(levelname)s %(name)s: %(message)s'
logging_level instance-attribute
logging_level = _validate_logging_level(logging_level)
max_file_size instance-attribute
max_file_size = _validate_max_file_size(max_file_size)
max_memory_usage instance-attribute
max_memory_usage = max_memory_usage
timeout instance-attribute
timeout = _validate_timeout(timeout)

Functions

__init__
__init__(encoding='utf-8', logging_level='INFO', logging_format=None, timeout=None, max_file_size=None, max_memory_usage=None, custom_handlers=None, **kwargs)
Source code in textxtract/core/config.py
def __init__(
    self,
    encoding: str = "utf-8",
    logging_level: str = "INFO",
    logging_format: Optional[str] = None,
    timeout: Optional[float] = None,
    max_file_size: Optional[int] = None,
    max_memory_usage: Optional[int] = None,
    custom_handlers: Optional[Dict[str, Callable]] = None,
    **kwargs,
):
    """Validate and store the options, then apply environment overrides.

    ``encoding``, ``logging_level``, ``timeout`` and ``max_file_size`` are
    passed through the matching ``_validate_*`` method; ``max_memory_usage``
    is stored as given. A falsy ``logging_format`` or ``custom_handlers``
    is replaced by the default format string or a new empty dict. Extra
    keyword arguments end up in ``extra_config``.
    """
    fallback_format = "%(asctime)s %(levelname)s %(name)s: %(message)s"

    # Validated options first, in the same order as the signature.
    self.encoding = self._validate_encoding(encoding)
    self.logging_level = self._validate_logging_level(logging_level)
    self.logging_format = logging_format if logging_format else fallback_format
    self.timeout = self._validate_timeout(timeout)
    self.max_file_size = self._validate_max_file_size(max_file_size)

    # Options stored without validation.
    self.max_memory_usage = max_memory_usage
    self.custom_handlers = custom_handlers if custom_handlers else {}

    # Environment overrides run before extra_config is assigned.
    self._load_from_env()

    # Whatever keyword arguments remain are kept for handler-specific config.
    self.extra_config = kwargs
__repr__
__repr__()
Source code in textxtract/core/config.py
def __repr__(self) -> str:
    """Return a short summary showing encoding, logging level and timeout."""
    summary = ", ".join(
        (
            f"encoding='{self.encoding}'",
            f"logging_level='{self.logging_level}'",
            f"timeout={self.timeout}",
        )
    )
    return f"ExtractorConfig({summary})"
from_file classmethod
from_file(config_path)

Load configuration from a file (JSON, YAML, or TOML).

Source code in textxtract/core/config.py
@classmethod
def from_file(cls, config_path: Union[str, Path]) -> "ExtractorConfig":
    """Load configuration from a file (JSON, YAML, or TOML).

    The format is chosen from the file suffix (case-insensitive). The file
    is read as UTF-8. An empty document (parsed as ``None``) yields a
    default configuration.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
        ImportError: If the parser for a YAML or TOML file is not installed.
        ValueError: If the suffix is not a supported format.
        TypeError: If the top level of the document is not a mapping.
    """
    config_path = Path(config_path)

    if not config_path.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    # Read as UTF-8 explicitly; the platform default encoding is
    # locale-dependent.
    content = config_path.read_text(encoding="utf-8")
    suffix = config_path.suffix.lower()

    if suffix == ".json":
        import json

        config_data = json.loads(content)
    elif suffix in (".yaml", ".yml"):
        # Only the import is guarded, so parser errors are not mistaken
        # for a missing dependency.
        try:
            import yaml
        except ImportError as exc:
            raise ImportError(
                "PyYAML is required to load YAML configuration files"
            ) from exc

        config_data = yaml.safe_load(content)
    elif suffix == ".toml":
        # Prefer the stdlib parser (Python 3.11+), fall back to tomli.
        try:
            import tomllib as toml_parser
        except ImportError:
            try:
                import tomli as toml_parser
            except ImportError as exc:
                raise ImportError(
                    "tomli is required to load TOML configuration files"
                ) from exc

        config_data = toml_parser.loads(content)
    else:
        raise ValueError(
            f"Unsupported configuration file format: {config_path.suffix}"
        )

    if config_data is None:
        # An empty YAML document parses to None; treat it as "no options".
        config_data = {}
    if not isinstance(config_data, dict):
        raise TypeError(
            "Configuration file must contain a mapping at the top level, "
            f"got {type(config_data).__name__}: {config_path}"
        )

    return cls(**config_data)
get_handler
get_handler(extension)

Retrieve a handler for a given file extension.

Source code in textxtract/core/config.py
def get_handler(self, extension: str) -> Optional[Callable]:
    """Retrieve a handler for a given file extension.

    The lookup is case-insensitive. The extension is tried exactly as given
    first; if that misses and it has no leading dot, the dotted form is
    tried as well, because ``register_handler`` always stores keys with a
    leading dot.

    Returns:
        The registered handler, or ``None`` if there is none.
    """
    key = extension.lower()
    handler = self.custom_handlers.get(key)
    if handler is None and not key.startswith("."):
        handler = self.custom_handlers.get(f".{key}")
    return handler
get_handler_config
get_handler_config(handler_name)

Get configuration specific to a handler.

Source code in textxtract/core/config.py
def get_handler_config(self, handler_name: str) -> Dict[str, Any]:
    """Build the settings dict handed to a single handler.

    The result always holds ``encoding``, ``timeout``, ``max_file_size``
    and ``max_memory_usage``. If ``extra_config`` has an entry named
    ``<handler_name>_config`` (name lower-cased), its items are merged on
    top and may replace the shared values.
    """
    shared_fields = ("encoding", "timeout", "max_file_size", "max_memory_usage")
    settings = {field: getattr(self, field) for field in shared_fields}

    override_key = handler_name.lower() + "_config"
    if override_key not in self.extra_config:
        return settings

    settings.update(self.extra_config[override_key])
    return settings
register_handler
register_handler(extension, handler)

Register a custom file type handler.

Source code in textxtract/core/config.py
def register_handler(self, extension: str, handler: Callable):
    """Store ``handler`` as the custom handler for ``extension``.

    The key written to ``custom_handlers`` is lower-cased and gets a
    leading dot if ``extension`` does not already start with one.
    """
    dotted = extension if extension.startswith(".") else "." + extension
    self.custom_handlers[dotted.lower()] = handler
to_dict
to_dict()

Convert configuration to dictionary.

Source code in textxtract/core/config.py
def to_dict(self) -> Dict[str, Any]:
    """Return the configuration as a plain dictionary.

    Each handler is represented by ``str(handler)``. The entries of
    ``extra_config`` are merged in last, at the top level.
    """
    handler_names = {}
    for ext, handler in self.custom_handlers.items():
        handler_names[ext] = str(handler)

    snapshot: Dict[str, Any] = {}
    snapshot["encoding"] = self.encoding
    snapshot["logging_level"] = self.logging_level
    snapshot["logging_format"] = self.logging_format
    snapshot["timeout"] = self.timeout
    snapshot["max_file_size"] = self.max_file_size
    snapshot["max_memory_usage"] = self.max_memory_usage
    snapshot["custom_handlers"] = handler_names
    snapshot.update(self.extra_config)
    return snapshot