Bases: FileTypeHandler
Handler for extracting text from XML files.
Methods: extract (synchronous extraction), extract_async (runs extract in a worker thread).
Source code in textxtract/handlers/xml.py
class XMLHandler(FileTypeHandler):
    """Handler for extracting text from XML files."""

    def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
        """Return every text node of an XML file joined by single spaces.

        Args:
            file_path: Path of the XML file to read.
            config: Optional settings mapping; only the ``"encoding"`` key
                is read here (defaults to ``"utf-8"``).

        Returns:
            All nodes matched by the XPath ``//text()``, joined with ``" "``.

        Raises:
            ExtractionError: If lxml is not installed, or if reading,
                parsing or querying the document fails.
        """
        # Keep the dependency check outside the catch-all below so the
        # install hint is not re-wrapped as "XML extraction failed: ...".
        try:
            from lxml import etree
        except ImportError as exc:
            raise ExtractionError(
                "lxml package is not installed. Install with 'pip install text-extractor[xml]'"
            ) from exc

        try:
            encoding = (config or {}).get("encoding", "utf-8")
            with open(file_path, "r", encoding=encoding) as f:
                # NOTE(review): parsed with lxml's default parser; if inputs
                # may be untrusted, confirm its entity handling is acceptable.
                tree = etree.parse(f)
                return " ".join(tree.xpath("//text()"))
        except Exception as e:
            # Chain the cause so the original traceback stays available.
            raise ExtractionError(f"XML extraction failed: {e}") from e

    async def extract_async(
        self, file_path: Path, config: Optional[dict] = None
    ) -> str:
        """Run :meth:`extract` in a worker thread and await its result.

        Args:
            file_path: Path of the XML file to read.
            config: Optional settings mapping forwarded to :meth:`extract`.

        Returns:
            The string produced by :meth:`extract`.
        """
        import asyncio

        return await asyncio.to_thread(self.extract, file_path, config)
|
Functions
extract
extract(file_path, config=None)
Source code in textxtract/handlers/xml.py
def extract(self, file_path: Path, config: Optional[dict] = None) -> str:
    """Return every text node of an XML file joined by single spaces.

    Args:
        file_path: Path of the XML file to read.
        config: Optional settings mapping; only the ``"encoding"`` key
            is read here (defaults to ``"utf-8"``).

    Returns:
        All nodes matched by the XPath ``//text()``, joined with ``" "``.

    Raises:
        ExtractionError: If lxml is not installed, or if reading,
            parsing or querying the document fails.
    """
    # Keep the dependency check outside the catch-all below so the
    # install hint is not re-wrapped as "XML extraction failed: ...".
    try:
        from lxml import etree
    except ImportError as exc:
        raise ExtractionError(
            "lxml package is not installed. Install with 'pip install text-extractor[xml]'"
        ) from exc

    try:
        encoding = (config or {}).get("encoding", "utf-8")
        with open(file_path, "r", encoding=encoding) as f:
            # NOTE(review): parsed with lxml's default parser; if inputs
            # may be untrusted, confirm its entity handling is acceptable.
            tree = etree.parse(f)
            return " ".join(tree.xpath("//text()"))
    except Exception as e:
        # Chain the cause so the original traceback stays available.
        raise ExtractionError(f"XML extraction failed: {e}") from e
|
extract_async
async
extract_async(file_path, config=None)
Source code in textxtract/handlers/xml.py
async def extract_async(
    self, file_path: Path, config: Optional[dict] = None
) -> str:
    """Run the synchronous ``extract`` in a worker thread and await it.

    Args:
        file_path: Path of the XML file to read.
        config: Optional settings mapping forwarded to ``extract``.

    Returns:
        The string produced by ``extract``.
    """
    import asyncio

    # to_thread keeps the blocking file read and parse off the event loop.
    pending = asyncio.to_thread(self.extract, file_path, config)
    return await pending
|