Skip to content

Docx Module

DOCX file handler for text extraction.

Classes:

| Name | Description |
| --- | --- |
| DOCXHandler | Handler for extracting text from DOCX files. |

Classes

DOCXHandler

Bases: FileTypeHandler

Handler for extracting text from DOCX files.

Methods:

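| Name | Description |
| --- | --- |
| extract | Return the text of the document's paragraphs, joined with newlines. |
| extract_async | Awaitable variant that runs `extract` in a worker thread. |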
Source code in textxtract/handlers/docx.py
class DOCXHandler(FileTypeHandler):
    """Handler for extracting text from DOCX files.

    The document is opened with ``docx.Document`` and only the entries of
    ``doc.paragraphs`` are read.
    """

    def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
        """Return the text of the document's paragraphs joined by newlines.

        Args:
            file_path: Location of the DOCX file to read.
            config: Optional settings mapping; this method does not read it.

        Returns:
            The ``text`` of every paragraph in ``doc.paragraphs``, separated
            by newline characters.

        Raises:
            ExtractionError: If importing ``docx``, opening the document, or
                reading its paragraphs fails. The underlying exception is
                attached as ``__cause__``.
        """
        try:
            # Imported inside the try block so that a missing ``docx``
            # package is also reported as an ExtractionError.
            from docx import Document

            doc = Document(file_path)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:
            # Chain explicitly so the traceback marks the original error as
            # the direct cause rather than as incidental context.
            raise ExtractionError(f"DOCX extraction failed: {e}") from e

    async def extract_async(
        self, file_path: Path, config: Optional[dict] = None
    ) -> str:
        """Run :meth:`extract` in a worker thread and await its result.

        Args:
            file_path: Location of the DOCX file to read.
            config: Optional settings mapping, forwarded unchanged to
                :meth:`extract`.

        Returns:
            Whatever :meth:`extract` returns for the same arguments.
        """
        import asyncio

        # to_thread keeps the blocking extraction off the event loop thread.
        return await asyncio.to_thread(self.extract, file_path, config)

Functions

extract
extract(file_path, config=None)
Source code in textxtract/handlers/docx.py
def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
    """Return the text of the document's paragraphs joined by newlines.

    Args:
        file_path: Location of the DOCX file to read.
        config: Optional settings mapping; this method does not read it.

    Returns:
        The ``text`` of every paragraph in ``doc.paragraphs``, separated by
        newline characters.

    Raises:
        ExtractionError: If importing ``docx``, opening the document, or
            reading its paragraphs fails. The underlying exception is
            attached as ``__cause__``.
    """
    try:
        # Imported inside the try block so that a missing ``docx`` package is
        # also reported as an ExtractionError.
        from docx import Document

        doc = Document(file_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)
    except Exception as e:
        # Chain explicitly so the traceback marks the original error as the
        # direct cause rather than as incidental context.
        raise ExtractionError(f"DOCX extraction failed: {e}") from e
extract_async async
extract_async(file_path, config=None)
Source code in textxtract/handlers/docx.py
async def extract_async(
    self, file_path: Path, config: Optional[dict] = None
) -> str:
    """Await the result of ``self.extract`` executed in a worker thread.

    Args:
        file_path: Location of the DOCX file to read.
        config: Optional settings mapping, forwarded unchanged to
            ``self.extract``.

    Returns:
        Whatever ``self.extract(file_path, config)`` returns.
    """
    from asyncio import to_thread

    # Hand the blocking call to a thread so the event loop stays responsive.
    pending = to_thread(self.extract, file_path, config)
    return await pending