Skip to content

Extractor Module

Synchronous text extraction logic with support for file paths and bytes.

Classes:

Name Description
SyncTextExtractor

Synchronous text extractor with support for file paths and bytes.

Attributes:

Name Type Description
logger

Attributes

logger module-attribute

logger = getLogger('textxtract.sync')

Classes

SyncTextExtractor

Bases: TextExtractor

Synchronous text extractor with support for file paths and bytes.

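Provides synchronous text extraction from various file types. Logs debug- and info-level messages for tracing and diagnostics. Supports the context manager protocol; the synchronous extractor holds no resources of its own, and temporary files created for bytes input are removed at the end of each extract call.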

Methods:

Name Description
__enter__

Context manager entry.

__exit__

Context manager exit.

__init__
extract

Extract text synchronously from file path or bytes.

Attributes:

Name Type Description
config
Source code in textxtract/sync/extractor.py
class SyncTextExtractor(TextExtractor):
    """
    Synchronous text extractor with support for file paths and bytes.

    Provides synchronous text extraction from various file types.
    Logs debug and info level messages for tracing and diagnostics.
    Supports the context manager protocol (entering returns the extractor,
    exiting is a no-op because no resources are held between calls).

    Per-call ``config`` dictionaries are layered on top of the instance
    configuration, so a partial override keeps every other instance setting.
    """

    def __init__(self, config: Optional[ExtractorConfig] = None):
        """
        Create an extractor.

        Args:
            config: Extractor configuration. A fresh ``ExtractorConfig()`` is
                used when this is omitted (or falsy).
        """
        self.config = config or ExtractorConfig()
        logger.debug(
            "SyncTextExtractor initialized with config: %s", self.config.__dict__
        )

    def _effective_config(self, overrides: Optional[dict]) -> dict:
        """
        Return the instance configuration with ``overrides`` applied on top.

        A new dict is built on every call, so neither the instance
        configuration nor the caller's ``overrides`` dict is mutated.
        """
        merged = dict(self.config.__dict__)
        if overrides:
            merged.update(overrides)
        return merged

    def extract(
        self,
        source: Union[Path, str, bytes],
        filename: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> str:
        """
        Extract text synchronously from file path or bytes.

        Args:
            source: File path (Path/str) or file bytes
            filename: Required if source is bytes, optional for file paths
            config: Optional configuration overrides. Keys given here take
                precedence over the instance configuration; keys not given
                fall back to the instance configuration.

        Returns:
            str: Extracted text.

        Raises:
            ValueError: If filename is missing when source is bytes
            FileTypeNotSupportedError: If the file has no extension or the
                extension is not supported.
            ExtractionError: If extraction fails.
            InvalidFileError: If the path does not exist or is not a file, or
                the handler fails with a ValueError/OSError.
        """
        # Get file info for logging
        file_info = get_file_info(source, filename)
        logger.debug("Processing file: %s", file_info)

        # Prepare file path (create temp file if needed)
        file_path, temp_path = self._prepare_file_path(source, filename, config)

        try:
            # Validate file extension
            suffix = file_info.extension
            if not suffix:
                raise FileTypeNotSupportedError(
                    f"File has no extension: {file_info.filename}"
                )

            logger.debug("Detected file suffix: %s", suffix)

            # Get handler
            handler = registry.get_handler(suffix)
            handler_name = handler.__class__.__name__

            logger.info(
                "Using handler %s for file %s (size: %s MB, temp: %s)",
                handler_name,
                file_info.filename,
                file_info.size_mb,
                file_info.is_temp,
            )

            # Merge rather than replace: a partial override must not discard
            # the remaining instance-level settings.
            handler_config = self._effective_config(config)

            # Extract text
            try:
                result = handler.extract(file_path, handler_config)
            except Exception as e:
                logger.error(
                    "Extraction failed for file %s (handler: %s): %s",
                    file_info.filename,
                    handler_name,
                    e,
                )

                # Re-raise custom extraction errors
                if isinstance(e, ExtractionError):
                    raise
                # Wrap known invalid file errors
                if isinstance(e, (ValueError, OSError)):
                    raise InvalidFileError(
                        f"Invalid file: {file_info.filename} (handler: {handler_name}, error: {e})"
                    ) from e
                # Wrap as general extraction error
                raise ExtractionError(
                    f"Extraction failed for file {file_info.filename} using {handler_name}: {e}"
                ) from e

            logger.info(
                "Extraction successful for file %s (extracted %d characters)",
                file_info.filename,
                len(result),
            )
            return result

        finally:
            # Clean up temporary file if created; runs on success and failure
            if temp_path is not None:
                safe_unlink(temp_path)
                logger.debug("Temporary file %s deleted", temp_path)

    def _prepare_file_path(
        self,
        source: Union[Path, str, bytes],
        filename: Optional[str],
        config: Optional[dict],
    ) -> tuple[Path, Optional[Path]]:
        """
        Prepare file path for extraction.

        Bytes input is written to a temporary file via ``create_temp_file``;
        path input is checked to exist and to be a regular file.

        Args:
            source: File path (Path/str) or file bytes
            filename: Name associated with bytes input; required for bytes
            config: Optional per-call configuration overrides

        Returns:
            tuple: (file_path, temp_path_if_created). The second element is
            ``None`` when ``source`` is an existing path, so the caller knows
            there is nothing to delete.

        Raises:
            ValueError: If source is bytes and filename is missing
            InvalidFileError: If the path does not exist or is not a file
        """
        if isinstance(source, bytes):
            # Handle bytes input - create temporary file
            if not filename:
                raise ValueError("filename is required when source is bytes")

            # Resolve the limit from the merged config so that an instance-level
            # "max_file_size" is honoured and an empty override dict is never
            # forwarded in place of a size.
            # NOTE(review): assumes create_temp_file treats None as "no explicit
            # limit" - confirm against its definition.
            max_file_size = self._effective_config(config).get("max_file_size")
            temp_path = create_temp_file(source, filename, max_file_size)
            logger.debug(
                "Temporary file created at %s for filename %s", temp_path, filename
            )
            return temp_path, temp_path

        # Handle file path input
        file_path = Path(source)
        if not file_path.exists():
            raise InvalidFileError(f"File not found: {file_path}")
        if not file_path.is_file():
            raise InvalidFileError(f"Path is not a file: {file_path}")

        logger.debug("Using existing file: %s", file_path)
        return file_path, None

    def __enter__(self):
        """Context manager entry; returns this extractor."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit; exceptions are not suppressed."""
        pass  # No resources to clean up for sync extractor

Attributes

config instance-attribute
config = config or ExtractorConfig()

Functions

__enter__
__enter__()

Context manager entry.

Source code in textxtract/sync/extractor.py
def __enter__(self):
    """Context manager entry: return this extractor for use in the ``with`` body."""
    return self
__exit__
__exit__(exc_type, exc_val, exc_tb)

Context manager exit.

Source code in textxtract/sync/extractor.py
def __exit__(self, exc_type, exc_val, exc_tb):
    """Context manager exit: nothing is released and exceptions propagate."""
    # The sync extractor keeps no resources open between calls, so there is
    # nothing to clean up; returning None lets any in-flight exception continue.
    return None
__init__
__init__(config=None)
Source code in textxtract/sync/extractor.py
def __init__(self, config: Optional[ExtractorConfig] = None):
    """
    Create an extractor.

    Args:
        config: Extractor configuration. A fresh ``ExtractorConfig()`` is
            used when this is omitted (or falsy).
    """
    settings = config or ExtractorConfig()
    self.config = settings
    logger.debug(
        "SyncTextExtractor initialized with config: %s",
        settings.__dict__,
    )
extract
extract(source, filename=None, config=None)

Extract text synchronously from file path or bytes.

Parameters:

Name Type Description Default
source
Union[Path, str, bytes]

File path (Path/str) or file bytes

required
filename
Optional[str]

Required if source is bytes, optional for file paths

None
config
Optional[dict]

Optional configuration overrides

None

Returns:

Name Type Description
str str

Extracted text.

Raises:

Type Description
ValueError

If filename is missing when source is bytes

FileTypeNotSupportedError

If the file extension is not supported.

ExtractionError

If extraction fails.

InvalidFileError

If the file is invalid or corrupted.

Source code in textxtract/sync/extractor.py
def extract(
    self,
    source: Union[Path, str, bytes],
    filename: Optional[str] = None,
    config: Optional[dict] = None,
) -> str:
    """
    Extract text synchronously from file path or bytes.

    Args:
        source: File path (Path/str) or file bytes
        filename: Required if source is bytes, optional for file paths
        config: Optional configuration overrides. Keys given here take
            precedence over the instance configuration; keys not given
            fall back to the instance configuration.

    Returns:
        str: Extracted text.

    Raises:
        ValueError: If filename is missing when source is bytes
        FileTypeNotSupportedError: If the file has no extension or the
            extension is not supported.
        ExtractionError: If extraction fails.
        InvalidFileError: If the file is invalid or corrupted (the handler
            failed with a ValueError/OSError).
    """
    # Get file info for logging
    file_info = get_file_info(source, filename)
    logger.debug("Processing file: %s", file_info)

    # Prepare file path (create temp file if needed)
    file_path, temp_path = self._prepare_file_path(source, filename, config)

    try:
        # Validate file extension
        suffix = file_info.extension
        if not suffix:
            raise FileTypeNotSupportedError(
                f"File has no extension: {file_info.filename}"
            )

        logger.debug("Detected file suffix: %s", suffix)

        # Get handler
        handler = registry.get_handler(suffix)
        handler_name = handler.__class__.__name__

        logger.info(
            "Using handler %s for file %s (size: %s MB, temp: %s)",
            handler_name,
            file_info.filename,
            file_info.size_mb,
            file_info.is_temp,
        )

        # Layer the per-call overrides on top of the instance configuration.
        # Passing only the overrides would drop every instance setting the
        # caller did not repeat. A new dict is built so neither input is mutated.
        handler_config = {**self.config.__dict__, **(config or {})}

        # Extract text
        try:
            result = handler.extract(file_path, handler_config)
        except Exception as e:
            logger.error(
                "Extraction failed for file %s (handler: %s): %s",
                file_info.filename,
                handler_name,
                e,
            )

            # Re-raise custom extraction errors
            if isinstance(e, ExtractionError):
                raise
            # Wrap known invalid file errors
            if isinstance(e, (ValueError, OSError)):
                raise InvalidFileError(
                    f"Invalid file: {file_info.filename} (handler: {handler_name}, error: {e})"
                ) from e
            # Wrap as general extraction error
            raise ExtractionError(
                f"Extraction failed for file {file_info.filename} using {handler_name}: {e}"
            ) from e

        logger.info(
            "Extraction successful for file %s (extracted %d characters)",
            file_info.filename,
            len(result),
        )
        return result

    finally:
        # Clean up temporary file if created; runs on success and failure
        if temp_path is not None:
            safe_unlink(temp_path)
            logger.debug("Temporary file %s deleted", temp_path)

Functions