Source code for pdfnaut.cos.objects.base

from __future__ import annotations

from binascii import hexlify, unhexlify
from codecs import BOM_UTF8, BOM_UTF16_BE
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Generic, Union, cast

from typing_extensions import Self, TypeVar

from ...exceptions import PdfResolutionError

if TYPE_CHECKING:
    from .containers import PdfArray, PdfDictionary
    from .stream import PdfStream

# Type variable used by the generic objects declared below; it resolves to ``bytes``
# when a generic class is left unparameterized.
T = TypeVar("T", default=bytes)



[docs]
class PdfNull:
    """A PDF 'null' object, distinct from all other PDF objects (see ISO 32000-2:2020
    § 7.3.9 "Null Object").

    The object carries no state: every instance is falsy, is rendered as ``null``,
    compares equal to any other :class:`PdfNull` and hashes to the same value.
    """

    def __repr__(self) -> str:
        return "null"

    def __bool__(self) -> bool:
        return False

    def __eq__(self, other: object) -> bool:
        # Without this, two nulls would only be equal when they are the same instance,
        # so otherwise identical containers holding nulls would compare unequal.
        if isinstance(other, PdfNull):
            return True
        return NotImplemented

    def __hash__(self) -> int:
        # Must agree with __eq__: all nulls are equal, so they all share one hash.
        return hash(PdfNull)




[docs]
@dataclass
class PdfComment:
    """A PDF comment, i.e. text following a percent sign (``%``) that appears outside
    a string or inside a content stream. A comment carries no syntactical meaning and
    shall be treated as whitespace (see ISO 32000-2:2020 § 7.2.4 "Comments")."""

    value: bytes
    """The bytes making up this comment."""




[docs]
@dataclass(order=True)
class PdfName(Generic[T]):
    """A name object: an atomic symbol uniquely defined by a sequence of 8-bit
    characters (see ISO 32000-2:2020 § 7.3.5 "Name Objects").

    Names are orderable and hashable, so they can be sorted and used as mapping keys.
    """

    value: T
    """The value held by this name."""

    def __hash__(self) -> int:
        # The class is part of the key so a name never hashes like its bare value.
        key = (self.__class__, self.value)
        return hash(key)




[docs]
@dataclass(order=True)
class PdfHexString:
    """A string of characters encoded in hexadecimal useful for including arbitrary
    binary data in a PDF (see ISO 32000-2:2020 § 7.3.4.3 "Hexadecimal Strings").

    Equality, ordering and hashing are all based on :attr:`raw`, not on the decoded
    :attr:`value`.
    """

    raw: bytes
    """The hex value of the string."""

    @classmethod
    def from_raw(cls, data: bytes) -> Self:
        """Creates a hexadecimal string from ``data``.

        ``data`` is the binary payload; it is hex-encoded and stored in :attr:`raw`.
        """
        return cls(hexlify(data))

    @property
    def value(self) -> bytes:
        """The decoded value of the hex string.

        As required by § 7.3.4.3, whitespace between digits is ignored and, if the
        number of digits is odd, the missing final digit is assumed to be ``0``.
        Raises :exc:`binascii.Error` if :attr:`raw` contains a non-hexadecimal digit.
        """
        # Drop the PDF whitespace characters (NUL, TAB, LF, FF, CR, SPACE).
        digits = self.raw.translate(None, b"\x00\t\n\x0c\r ")

        # unhexlify() rejects odd-length input, whereas the format defines it.
        if len(digits) % 2:
            digits += b"0"

        return unhexlify(digits)

    def __hash__(self) -> int:
        return hash((self.__class__, self.raw))



# Type of the object a reference resolves to. Unlike the earlier declaration of ``T``
# in this module, this one has no default.
T = TypeVar("T")


@dataclass
class PdfReference(Generic[T]):
    """A reference to a PDF indirect object (see ISO 32000-2:2020 § 7.3.10 "Indirect objects").

    Two references are equal (and hash alike) when their object number and generation
    match; the attached resolver does not take part in comparisons.
    """

    object_number: int
    """The object number of the object being referenced."""

    generation: int
    """The generation of the object being referenced."""

    def __post_init__(self) -> None:
        # Kept out of the dataclass fields so it is ignored by __init__, __eq__ and
        # __repr__. A reference starts out unresolvable until with_resolver() is called.
        self._resolver: ObjectGetter | None = None

    def with_resolver(self, resolver: ObjectGetter) -> Self:
        """Sets a resolution method ``resolver`` for this reference.

        ``resolver`` is called with this reference as its only argument whenever
        :meth:`get` is invoked. Returns the reference itself so calls can be chained.
        """
        self._resolver = resolver
        return self

    def get(self) -> T:
        """Returns the object this reference points to.

        Raises :exc:`.PdfResolutionError` if no resolver has been set.
        """
        # Compare against None explicitly: a resolver is any callable, including ones
        # that happen to evaluate as falsy.
        if self._resolver is not None:
            return self._resolver(self)

        raise PdfResolutionError("No resolution method available.")

    def __hash__(self) -> int:
        return hash((self.__class__, self.object_number, self.generation))

    def __str__(self) -> str:
        return f"{self.object_number} {self.generation} R"




[docs]
@dataclass
class PdfOperator:
    """An operator found in a content stream, paired with the operands supplied to it
    (see ISO 32000-2:2020 § 7.8.2 "Content streams")."""

    name: bytes
    """The keyword identifying this operator."""

    args: list[PdfObject] | list[PdfInlineImage]
    """The operands (arguments) given to this operator."""



# TODO: convert this into a PdfStream-like class

[docs]
@dataclass
class PdfInlineImage:
    """An image embedded directly within a content stream
    (see ISO 32000-2:2020 § 8.9.7 "Inline images")."""

    details: PdfDictionary
    """The dictionary describing the inline image."""

    raw: bytes = field(repr=False)
    """The raw image data. It is left out of the ``repr()`` of the object."""




[docs]
def parse_text_string(encoded: PdfHexString | bytes) -> str:
    """Parses a text string as described in ISO 32000-2:2020 § 7.9.2.2 "Text string type".

    Text strings may either be encoded in PDFDocEncoding, UTF-16BE, or (PDF 2.0) UTF-8.
    Each encoding is indicated by a byte-order mark at the beginning (``FE FF`` for
    UTF-16BE and ``EF BB BF`` for UTF-8). PDFDocEncoded strings have no such mark.

    ``encoded`` is either the raw bytes of the string or a hexadecimal string, in which
    case its decoded ``value`` is used. The byte-order mark only selects the encoding
    and is never part of the returned text.

    Raises :exc:`UnicodeDecodeError` if the bytes are not valid in the detected encoding.
    Decoding unmarked strings relies on a ``pdfdoc`` codec that is expected to be
    registered elsewhere in the package.
    """
    # Anything that is not already a byte string is taken to be a hex string.
    value = encoded if isinstance(encoded, (bytes, bytearray)) else encoded.value

    if value.startswith(BOM_UTF16_BE):
        # The "utf-16" codec consumes the BOM and uses it to pick the byte order.
        return value.decode("utf-16")

    if value.startswith(BOM_UTF8):
        # Plain "utf-8" would keep the BOM as a leading U+FEFF; "utf-8-sig" strips it.
        return value.decode("utf-8-sig")

    return value.decode("pdfdoc")




[docs]
def encode_text_string(text: str, *, utf8: bool = False) -> bytes:
    """Encodes a text string to either PDFDocEncoding or UTF-16BE. Strings are encoded
    with PDFDoc first then UTF-16BE if ``text`` cannot be encoded with PDFDoc.

    If ``utf8`` is True, ``text`` will be encoded in UTF-8 as fallback instead of UTF-16BE.
    Note that UTF-8 text strings are a PDF 2.0 feature which may not be supported by all
    PDF processors.

    The fallback is also used when the PDFDoc bytes would start with ``FE FF`` or
    ``EF BB BF`` (for instance, text beginning with ``þÿ``): a reader would otherwise
    take those bytes for a byte-order mark and decode the string with the wrong encoding.

    Returns the encoded bytes; output produced by the fallback starts with the
    byte-order mark of the encoding used. Encoding with PDFDoc relies on a ``pdfdoc``
    codec that is expected to be registered elsewhere in the package.
    """
    try:
        pdfdoc_bytes = text.encode("pdfdoc")
    except UnicodeEncodeError:
        pdfdoc_bytes = None

    # Only use the PDFDoc form when it cannot be mistaken for a BOM-prefixed string.
    if pdfdoc_bytes is not None and not pdfdoc_bytes.startswith((BOM_UTF16_BE, BOM_UTF8)):
        return pdfdoc_bytes

    if utf8:
        return BOM_UTF8 + text.encode("utf-8")

    return BOM_UTF16_BE + text.encode("utf-16be")



# Union of every type that can stand for a PDF object. The container and stream
# classes are given as string forward references because this module imports them
# only under TYPE_CHECKING.
PdfObject = Union[
    bool,
    int,
    float,
    bytes,
    "PdfArray",
    "PdfDictionary",
    "PdfStream",
    PdfHexString,
    PdfName,
    PdfReference,
    PdfNull,
]
# Signature of a resolver: a callable that takes a PdfReference and returns a ``T``.
ObjectGetter = Callable[[PdfReference], T]