Source code for pdfnaut.cos.objects.base
from __future__ import annotations
from binascii import hexlify, unhexlify
from codecs import BOM_UTF8, BOM_UTF16_BE
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Generic, Union, cast
from typing_extensions import Self, TypeVar
from ...exceptions import PdfResolutionError
if TYPE_CHECKING:
from .containers import PdfArray, PdfDictionary
from .stream import PdfStream
# Type variable for the value wrapped by PdfName; an unparameterized PdfName defaults to bytes.
T = TypeVar("T", default=bytes)
[docs]
class PdfNull:
    """A PDF 'null' object, distinct from all other PDF objects (see ISO 32000-2:2020
    § 7.3.9 "Null Object").

    The null object carries no state, so all instances are interchangeable: they
    compare equal to each other (and to nothing else), share a single hash value
    and are falsy.
    """

    def __repr__(self) -> str:
        return "null"

    def __bool__(self) -> bool:
        return False

    def __eq__(self, other: object) -> bool:
        # Without this, identity comparison would make two parsed nulls unequal,
        # which also breaks comparisons of containers holding them.
        if isinstance(other, PdfNull):
            return True
        return NotImplemented

    def __hash__(self) -> int:
        # Must agree with __eq__: every null hashes alike.
        return hash(PdfNull)
[docs]
@dataclass(order=True)
class PdfName(Generic[T]):
    """A name object: an atomic symbol uniquely identified by its sequence of
    8-bit characters (see ISO 32000-2:2020 § 7.3.5 "Name Objects")."""

    value: T
    """The value carried by this name."""

    def __hash__(self) -> int:
        # Defined explicitly so that names stay hashable alongside the
        # dataclass-generated equality; the class is part of the key.
        identity = (self.__class__, self.value)
        return hash(identity)
[docs]
@dataclass(order=True)
class PdfHexString:
    """A string of characters encoded in hexadecimal useful for including arbitrary
    binary data in a PDF (see ISO 32000-2:2020 § 7.3.4.3 "Hexadecimal Strings").

    Equality, ordering and hashing are based on :attr:`raw` exactly as given.
    """

    raw: bytes
    """The hex value of the string."""

    @classmethod
    def from_raw(cls, data: bytes) -> Self:
        """Creates a hexadecimal string from ``data``."""
        return cls(hexlify(data))

    @property
    def value(self) -> bytes:
        """The decoded value of the hex string.

        As permitted by § 7.3.4.3, white-space between digits is ignored and a
        missing final digit is taken to be ``0`` (``901FA`` decodes like ``901FA0``).

        Raises:
            binascii.Error: If :attr:`raw` contains non-hexadecimal characters.
        """
        # Drop the PDF white-space characters (NUL, HT, LF, FF, CR, SP), which
        # unhexlify would otherwise reject.
        digits = self.raw.translate(None, b"\x00\t\n\x0c\r ")
        if len(digits) % 2:
            # Odd digit count: the final digit is assumed to be 0.
            digits += b"0"
        return unhexlify(digits)

    def __hash__(self) -> int:
        return hash((self.__class__, self.raw))
# Rebinds T without a default; from here on it is the type a PdfReference resolves to.
T = TypeVar("T")
[docs]
@dataclass
class PdfReference(Generic[T]):
    """A reference to a PDF indirect object (see ISO 32000-2:2020 § 7.3.10 "Indirect objects").

    Equality and hashing depend only on the object number and generation; the
    attached resolver is not part of a reference's identity.
    """

    object_number: int
    """The object number of the object being referenced."""

    generation: int
    """The generation of the object being referenced."""

    def __post_init__(self) -> None:
        # Kept out of the dataclass fields so it does not take part in
        # __init__, __repr__ or __eq__.
        self._resolver: ObjectGetter | None = None

    def with_resolver(self, resolver: ObjectGetter) -> Self:
        """Sets a resolution method ``resolver`` for this reference.

        Returns:
            This same reference, so the call can be chained.
        """
        self._resolver = resolver
        return self

    def get(self) -> T:
        """Returns the object this reference points to.

        Raises:
            PdfResolutionError: If no resolver has been set on this reference.
        """
        # Compare against None rather than relying on truthiness: a callable
        # object may legitimately evaluate as falsy.
        if self._resolver is not None:
            return self._resolver(self)

        raise PdfResolutionError("No resolution method available.")

    def __hash__(self) -> int:
        return hash((self.__class__, self.object_number, self.generation))

    def __str__(self) -> str:
        return f"{self.object_number} {self.generation} R"
[docs]
@dataclass
class PdfOperator:
    """An operator found in a content stream, together with its operands
    (see ISO 32000-2:2020 § 7.8.2 "Content streams")."""

    name: bytes
    """The operator's name."""

    args: list[PdfObject] | list[PdfInlineImage]
    """The operands (arguments) supplied to the operator."""
# TODO: convert this into a PdfStream-like class
[docs]
@dataclass
class PdfInlineImage:
    """An inline image embedded in a content stream
    (see ISO 32000-2:2020 § 8.9.7 "Inline images")."""

    details: PdfDictionary
    """The dictionary describing the inline image."""

    raw: bytes = field(repr=False)
    """The image's raw contents; left out of the repr."""
[docs]
def parse_text_string(encoded: PdfHexString | bytes) -> str:
    """Parses a text string as described in ISO 32000-2:2020 § 7.9.2.2 "Text string type".

    Text strings may either be encoded in PDFDocEncoding, UTF-16BE, or (PDF 2.0) UTF-8.
    Each encoding is indicated by a byte-order mark at the beginning (``FE FF`` for
    UTF-16BE and ``EF BB BF`` for UTF-8). PDFDocEncoded strings have no such mark.

    Args:
        encoded: The encoded text, either as bytes or as a hex string whose
            decoded value is used.

    Returns:
        The decoded text, without the byte-order mark.

    Raises:
        UnicodeDecodeError: If the bytes are not valid in the detected encoding.
    """
    value = cast(bytes, encoded.value if isinstance(encoded, PdfHexString) else encoded)

    if value.startswith(BOM_UTF16_BE):
        # The generic "utf-16" codec reads the mark to select big-endian and
        # does not include it in the result.
        return value.decode("utf-16")

    if value.startswith(BOM_UTF8):
        # Plain "utf-8" would keep the mark as a leading U+FEFF character;
        # "utf-8-sig" strips it.
        return value.decode("utf-8-sig")

    # NOTE: "pdfdoc" is not a standard-library codec; presumably it is
    # registered elsewhere in the package.
    return value.decode("pdfdoc")
[docs]
def encode_text_string(text: str, *, utf8: bool = False) -> bytes:
    """Encodes a text string to either PDFDocEncoding or UTF-16BE. Strings are encoded
    with PDFDoc first then UTF-16BE if ``text`` cannot be encoded with PDFDoc.

    If ``utf8`` is True, ``text`` will be encoded in UTF-8 as fallback instead of UTF-16BE.
    Note that UTF-8 text strings are a PDF 2.0 feature which may not be supported by all
    PDF processors.

    The fallback is also used when the PDFDoc-encoded bytes would begin with a
    byte-order mark (``FE FF`` or ``EF BB BF``), because a reader would take such
    a string for UTF-16BE or UTF-8 rather than PDFDocEncoding.

    Args:
        text: The text to encode.
        utf8: Whether the Unicode fallback is UTF-8 rather than UTF-16BE.

    Returns:
        The encoded bytes, prefixed with a byte-order mark when a Unicode
        encoding was used.
    """
    try:
        pdfdoc_bytes = text.encode("pdfdoc")
    except UnicodeEncodeError:
        pdfdoc_bytes = None

    # Only emit the PDFDoc form when it cannot be mistaken for Unicode text:
    # the encoding is detected purely from the leading bytes.
    if pdfdoc_bytes is not None and not pdfdoc_bytes.startswith((BOM_UTF16_BE, BOM_UTF8)):
        return pdfdoc_bytes

    if utf8:
        return BOM_UTF8 + text.encode("utf-8")

    return BOM_UTF16_BE + text.encode("utf-16be")
# Union of the Python types used to represent a PDF object. The container and
# stream types are written as strings because this module imports them only
# under TYPE_CHECKING.
PdfObject = Union[
bool,
int,
float,
bytes,
"PdfArray",
"PdfDictionary",
"PdfStream",
PdfHexString,
PdfName,
PdfReference,
PdfNull,
]
# Signature of a resolver: a callable that takes a reference and returns the
# object that reference resolves to.
ObjectGetter = Callable[[PdfReference], T]