Source code for pdfnaut.cos.objects.base

from __future__ import annotations

from binascii import hexlify, unhexlify
from codecs import BOM_UTF8, BOM_UTF16_BE
from collections.abc import Callable
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Generic, Union, cast

from typing_extensions import Self, TypeVar

from ...exceptions import PdfResolutionError

if TYPE_CHECKING:
    from .containers import PdfArray, PdfDictionary
    from .stream import PdfStream

# Type variable for the value carried by ``PdfName``; it falls back to
# ``bytes`` when the generic is left unparameterized.
# NOTE: the name ``T`` is rebound to a default-less TypeVar further down,
# just before ``PdfReference``.
T = TypeVar("T", default=bytes)


class PdfNull:
    """A PDF 'null' object, distinct from all other PDF objects
    (see ISO 32000-2:2020 § 7.3.9 "Null Object").

    Null carries no state, so every instance stands for the same value: all
    instances compare equal to one another, share a hash and are falsy. A null
    never compares equal to an object of another type (``None``, ``False``,
    ``0``, ...).
    """

    def __repr__(self) -> str:
        return "null"

    def __bool__(self) -> bool:
        return False

    def __eq__(self, other: object) -> bool:
        if isinstance(other, PdfNull):
            return True
        # Let Python try the reflected comparison and ultimately fall back
        # to "not equal" for foreign types.
        return NotImplemented

    def __hash__(self) -> int:
        # Defining ``__eq__`` drops the inherited hash; all nulls are equal,
        # so they must all hash alike.
        return hash(PdfNull)
@dataclass
class PdfComment:
    """A PDF comment.

    Comments are introduced by a percent sign (``%``) appearing outside a
    string or inside a content stream. They carry no syntactical meaning and
    shall be treated as whitespace (see ISO 32000-2:2020 § 7.2.4 "Comments").
    """

    value: bytes
    """The value of this comment."""
@dataclass(order=True)
class PdfName(Generic[T]):
    """A name object: an atomic symbol uniquely defined by a sequence of
    8-bit characters (see ISO 32000-2:2020 § 7.3.5 "Name Objects").
    """

    value: T
    """The value of this name."""

    def __hash__(self) -> int:
        # Spelled out by hand: a dataclass that generates ``__eq__`` is
        # otherwise left unhashable.
        key = (self.__class__, self.value)
        return hash(key)
@dataclass(order=True)
class PdfHexString:
    """A string of characters encoded in hexadecimal useful for including
    arbitrary binary data in a PDF (see ISO 32000-2:2020 § 7.3.4.3
    "Hexadecimal Strings").

    Equality, ordering and hashing are all based on :attr:`raw` exactly as
    given; no normalization is applied to it.
    """

    raw: bytes
    """The hex value of the string."""

    @classmethod
    def from_raw(cls, data: bytes) -> Self:
        """Creates a hexadecimal string from ``data``."""
        return cls(hexlify(data))

    @property
    def value(self) -> bytes:
        """The decoded value of the hex string.

        As § 7.3.4.3 requires, whitespace between the digits is ignored and,
        when the number of digits is odd, the missing final digit is assumed
        to be ``0``. Characters that are neither hex digits nor whitespace
        still raise :exc:`binascii.Error`.
        """
        # Drop the six PDF whitespace characters: NUL, HT, LF, FF, CR and SP.
        digits = self.raw.translate(None, b"\x00\t\n\x0c\r ")
        if len(digits) % 2:
            digits += b"0"
        return unhexlify(digits)

    def __hash__(self) -> int:
        return hash((self.__class__, self.raw))
T = TypeVar("T")


@dataclass
class PdfReference(Generic[T]):
    """A reference to a PDF indirect object (see ISO 32000-2:2020 § 7.3.10
    "Indirect objects").

    Two references are equal, and hash alike, when their object number and
    generation match; the attached resolver takes no part in the comparison.
    """

    object_number: int
    """The object number of the object being referenced."""

    generation: int
    """The generation of the object being referenced."""

    def __post_init__(self) -> None:
        # Stored as a plain attribute rather than a dataclass field so that it
        # stays out of ``__init__``, ``__eq__`` and ``__repr__``.
        self._resolver: ObjectGetter | None = None

    def with_resolver(self, resolver: ObjectGetter) -> Self:
        """Sets a resolution method ``resolver`` for this reference.

        Returns the reference itself so the call can be chained.
        """
        self._resolver = resolver
        return self

    def get(self) -> T:
        """Returns the object this reference points to.

        The resolver is called with this reference as its only argument.
        Raises :exc:`.PdfResolutionError` if no resolver has been set.
        """
        # Compare against None explicitly: a resolver object that happens to
        # be falsy (e.g. defines ``__bool__`` or ``__len__``) is still valid.
        if self._resolver is None:
            raise PdfResolutionError("No resolution method available.")
        return self._resolver(self)

    def __hash__(self) -> int:
        return hash((self.__class__, self.object_number, self.generation))

    def __str__(self) -> str:
        return f"{self.object_number} {self.generation} R"
@dataclass
class PdfOperator:
    """An operator found within a PDF content stream, together with its
    operands (see ISO 32000-2:2020 § 7.8.2 "Content streams").
    """

    name: bytes
    """The name of this operator."""

    args: list[PdfObject] | list[PdfInlineImage]
    """The arguments or operands provided to this operator."""
# TODO: convert this into a PdfStream-like class
@dataclass
class PdfInlineImage:
    """An inline image embedded in a PDF content stream
    (see ISO 32000-2:2020 § 8.9.7 "Inline images").
    """

    details: PdfDictionary
    """Details about the inline image."""

    # Left out of the generated ``repr``.
    raw: bytes = field(repr=False)
    """The raw contents of the inline image."""
def parse_text_string(encoded: PdfHexString | bytes) -> str:
    """Parses a text string as described in ISO 32000-2:2020 § 7.9.2.2
    "Text string type".

    Text strings may either be encoded in PDFDocEncoding, UTF-16BE, or
    (PDF 2.0) UTF-8. Each encoding is indicated by a byte-order mark at the
    beginning (``FE FF`` for UTF-16BE and ``EF BB BF`` for UTF-8).
    PDFDocEncoded strings have no such mark.

    ``encoded`` may be a byte string or a :class:`PdfHexString`, whose decoded
    value is used. The byte-order mark is never part of the returned text.
    """
    if isinstance(encoded, bytes):
        value = encoded
    elif isinstance(encoded, PdfHexString):
        value = encoded.value
    else:
        # Anything else is passed through untouched, as before.
        value = cast(bytes, encoded)

    if value.startswith(BOM_UTF16_BE):
        # The generic "utf-16" codec consumes the mark and takes the byte
        # order from it.
        return value.decode("utf-16")
    if value.startswith(BOM_UTF8):
        # "utf-8-sig" strips the mark; plain "utf-8" would leave it in the
        # result as a leading U+FEFF character.
        return value.decode("utf-8-sig")

    # NOTE(review): "pdfdoc" is not a built-in codec; presumably it is
    # registered elsewhere in the package.
    return value.decode("pdfdoc")
def encode_text_string(text: str, *, utf8: bool = False) -> bytes:
    """Encodes a text string to either PDFDocEncoding or UTF-16BE.

    Strings are encoded with PDFDoc first then UTF-16BE if ``text`` cannot be
    encoded with PDFDoc. If ``utf8`` is True, ``text`` will be encoded in
    UTF-8 as fallback instead of UTF-16BE. Note that UTF-8 text strings are
    a PDF 2.0 feature which may not be supported by all PDF processors.

    The fallback is also taken when the PDFDoc bytes would start with a
    byte-order mark (``FE FF`` or ``EF BB BF``). A reader would otherwise take
    such a string for UTF-16BE or UTF-8 and decode it incorrectly.
    """
    try:
        pdfdoc_bytes = text.encode("pdfdoc")
    except UnicodeEncodeError:
        pdfdoc_bytes = None

    if pdfdoc_bytes is not None and not pdfdoc_bytes.startswith((BOM_UTF16_BE, BOM_UTF8)):
        return pdfdoc_bytes

    # Unicode fallback; the leading mark tells readers which encoding follows.
    if utf8:
        return BOM_UTF8 + text.encode("utf-8")
    return BOM_UTF16_BE + text.encode("utf-16be")
# Union of the types that make up a PDF object. The container and stream
# types are written as strings because they are only imported under
# ``TYPE_CHECKING``.
PdfObject = Union[
    bool,
    int,
    float,
    bytes,
    "PdfArray",
    "PdfDictionary",
    "PdfStream",
    PdfHexString,
    PdfName,
    PdfReference,
    PdfNull,
]

# Signature of a reference resolver: it receives the ``PdfReference`` and
# returns the object it points to (see ``PdfReference.get``).
ObjectGetter = Callable[[PdfReference], T]