from __future__ import annotations
import pathlib
from collections.abc import Generator
from typing import cast
from pdfnaut.objects.actions import Action, action_into
from pdfnaut.objects.destinations import Destination, DestType, NamedDestination
from pdfnaut.objects.page_labels import PageLabelManager
from .common import metadata
from .common.metadata import MetadataCopyDirection
from .cos.helpers import ensure, is_null_like
from .cos.objects import (
PdfArray,
PdfDictionary,
PdfHexString,
PdfName,
PdfReference,
PdfStream,
)
from .cos.objects.base import PdfObject, encode_text_string, parse_text_string
from .cos.objects.xref import FreeXRefEntry, InUseXRefEntry, PdfXRefEntry
from .cos.parser import PdfParser, PermsAcquired
from .cos.serializer import PdfSerializer
from .objects.catalog import (
ExtensionMap,
MarkInfo,
PageLayout,
PageMode,
UserAccessPermissions,
ViewerPreferences,
)
from .objects.outlines import OutlineTree
from .objects.page import Page
from .objects.trailer import Info
from .objects.xmp import XmpMetadata
from .page_list import PageList, flatten_pages
[docs]
class PdfDocument(PdfParser):
"""A PDF document that can be read and written to.
In essence, it is a high-level wrapper around :class:`~.PdfParser` intended for
PDF users who want to work with a document via high-level interfaces.
"""
[docs]
@classmethod
def from_filename(cls, path: str | pathlib.Path, *, strict: bool = False) -> PdfDocument:
    """Loads a PDF document from a file ``path``.

    Arguments:
        path: Location of the PDF file. It is opened in binary mode and read
            in full before the document is constructed.
        strict: Forwarded unchanged to the constructor.

    Returns:
        A document built from the file's bytes. The instance is created through
        ``cls`` so that subclasses receive an instance of their own type.
    """
    with open(path, "rb") as fp:
        data = fp.read()

    # Use ``cls`` rather than a hard-coded class so alternate constructors
    # keep working for subclasses.
    return cls(data, strict=strict)
[docs]
@classmethod
def new(cls) -> PdfDocument:
    """Creates a blank PDF document.

    A minimal PDF 2.0 file is serialized in memory and then handed to the
    constructor. It contains two objects: the document catalog (object 1) and
    an empty page tree (object 2) that the catalog points to.

    Returns:
        The new document. The instance is created through ``cls`` so that
        subclasses receive an instance of their own type.
    """
    builder = PdfSerializer()
    builder.write_header("2.0")

    catalog = PdfDictionary({"Type": PdfName(b"Catalog"), "Pages": PdfReference(2, 0)})
    page_tree = PdfDictionary({"Type": PdfName(b"Pages"), "Kids": PdfArray(), "Count": 0})
    objects: dict[tuple[int, int], PdfObject] = {(1, 0): catalog, (2, 0): page_tree}

    # The table starts with a free entry for object 0 (generation 65535);
    # every object written below is recorded as in use at its byte offset.
    section: list[tuple[int, PdfXRefEntry]] = [(0, FreeXRefEntry(0, 65535))]
    for (obj_num, gen_num), item in objects.items():
        offset = builder.write_object((obj_num, gen_num), item)
        section.append((obj_num, InUseXRefEntry(offset, gen_num)))

    subsections = builder.generate_xref_section(section)
    startxref = builder.write_standard_xref_section(subsections)

    trailer = PdfDictionary({"Size": subsections[0].count, "Root": PdfReference(1, 0)})
    builder.write_trailer(trailer, startxref)
    builder.write_eof()

    # Use ``cls`` rather than a hard-coded class so subclasses get their own type.
    return cls(builder.content.getvalue())
[docs]
def __init__(self, data: bytes, *, strict: bool = False) -> None:
    """Parses ``data`` as a PDF document.

    Arguments:
        data: The raw bytes of the PDF file.
        strict: Forwarded unchanged to :class:`~.PdfParser`.
    """
    super().__init__(data, strict=strict)
    self.parse()

    # Page list wrapper, built on first access to ``pages``.
    self._page_list: PageList | None = None

    self.access_level = PermsAcquired.OWNER
    """The current access level of the document. It is one of the values in
    :class:`.PermsAcquired`:

    - Owner (2): Full access to the document. This is the default value when
      the document is not encrypted.
    - User (1): Access to the document under restrictions.
    - None (0): Document is currently encrypted.
    """

    # Documents that only restrict permissions usually have an empty password,
    # so try that one right away.
    if self.has_encryption:
        self.access_level = self.decrypt("")
@property
def has_encryption(self) -> bool:
    """Whether the trailer of this document carries a usable ``Encrypt`` entry."""
    encrypt_entry = self.trailer.get("Encrypt")
    return not is_null_like(encrypt_entry)
@property
def catalog(self) -> PdfDictionary:
    """The document catalog, read from the ``Root`` entry of the trailer.

    The catalog is the root of the document's object hierarchy and references
    the page tree, outlines, destinations, and other core elements.

    For details on its contents, see ISO 32000-2:2020 § 7.7.2
    "Document catalog dictionary".
    """
    root = self.trailer["Root"]
    return cast(PdfDictionary, root)
@catalog.setter
def catalog(self, value: PdfDictionary) -> None:
    # The raw (unresolved) trailer entry is read so that the object the
    # ``Root`` reference points to is replaced, not the reference itself.
    raw_root = self.trailer.data["Root"]
    object_number = cast(PdfReference, raw_root).object_number
    self.objects[object_number] = value
@property
def doc_info(self) -> Info | None:
    """The document information dictionary stored under ``Info`` in the trailer,
    or ``None`` if the trailer has no such entry.

    See ISO 32000-2:2020 § 14.3.3 "Document information dictionary" for details.

    Some documents specify a metadata stream rather than a DocInfo dictionary.
    That metadata is available through :attr:`.PdfDocument.xmp_info`.

    PDF 2.0 deprecated all keys of the DocInfo dictionary except for
    ``CreationDate`` and ``ModDate``.
    """
    raw_info = self.trailer.get("Info")
    if not is_null_like(raw_info):
        return Info.from_dict(cast(PdfDictionary, raw_info))

    return None
@doc_info.setter
def doc_info(self, value: Info | None) -> None:
    # Storing and removal (``None``) are both handled by the shared helper.
    self._set_dict_attribute(dest=self.trailer, key="Info", value=value)
@property
def pdf_version(self) -> str:
    """The version of the PDF standard implemented by this document.

    The ``/Version`` entry of the catalog is consulted first. When it is
    missing, the version from the file header is returned. When both exist,
    the greater of the two under lexicographical string comparison wins.
    """
    from_header = self.header_version
    version_name = cast("PdfName | None", self.catalog.get("Version"))

    if version_name:
        from_catalog = version_name.value.decode()
        return max(from_header, from_catalog)

    return from_header
@property
def xmp_info(self) -> XmpMetadata | None:
    """Document-level XMP metadata taken from the ``/Metadata`` entry of the
    document catalog, or ``None`` if the catalog has no such entry."""
    # Named ``stream`` to avoid shadowing the imported ``metadata`` module.
    stream = self.catalog.get("Metadata")
    if not is_null_like(stream):
        return XmpMetadata(cast(PdfStream, stream))

    return None
@xmp_info.setter
def xmp_info(self, xmp: XmpMetadata | None) -> None:
    # Work on the raw catalog entry so the reference itself is visible.
    current_ref = cast("PdfReference | None", self.catalog.data.get("Metadata"))

    if is_null_like(current_ref) and xmp is not None:
        # No metadata yet: register the stream as a new object.
        self.catalog["Metadata"] = self.objects.add(xmp.stream)
        return

    if not current_ref:
        # Nothing stored and nothing usable to store.
        return

    if isinstance(xmp, XmpMetadata):
        # Overwrite the object the existing reference points to.
        self.objects[current_ref.object_number] = xmp.stream
        return

    # Anything else drops the stored metadata object and its catalog entry.
    self.objects.delete(current_ref.object_number)
    self.catalog.pop("Metadata", None)
@property
def page_tree(self) -> PdfDictionary:
    """The root of the document's page tree, read from the ``Pages`` entry of
    the catalog. See ISO 32000-2:2020 § 7.7.3 "Page Tree".

    In typical usage, prefer :attr:`.PdfDocument.pages`.
    """
    tree_root = self.catalog["Pages"]
    return cast(PdfDictionary, tree_root)
@property
def outline_tree(self) -> PdfDictionary | None:
    """The raw outline dictionary (what is commonly called bookmarks), or ``None``
    if the catalog has no ``Outlines`` entry.

    See ISO 32000-2:2020 § 12.3.3 "Document outline" for details."""
    raw_outlines = self.catalog.get("Outlines")
    if not is_null_like(raw_outlines):
        return cast("PdfDictionary | None", raw_outlines)

    return None
@property
def outline(self) -> OutlineTree | None:
    """The hierarchy of outline items (bookmarks) used for document-level
    navigation, or ``None`` if the catalog has no ``Outlines`` entry."""
    if is_null_like(self.catalog.get("Outlines")):
        return None

    # The tree wrapper takes both the outline dictionary and the raw
    # reference stored in the catalog.
    outline_dict = cast(PdfDictionary, self.catalog["Outlines"])
    outline_ref = cast(PdfReference, self.catalog.data["Outlines"])
    return OutlineTree(self, outline_dict, outline_ref)
@outline.deleter
def outline(self) -> None:
    # Nothing to do for a document without an outline.
    if self.outline is not None:
        # Empty the tree first, then drop the catalog entry.
        self.outline.children.clear()
        del self.catalog["Outlines"]
[docs]
def new_outline(self) -> None:
    """Creates an empty outline tree.

    A dictionary holding only ``/Type /Outlines`` is added to the document's
    objects and the catalog's ``Outlines`` entry is set to the resulting
    reference.
    """
    empty_tree = PdfDictionary[str, PdfObject]({"Type": PdfName(b"Outlines")})
    self.catalog["Outlines"] = self.objects.add(empty_tree)
[docs]
def decrypt(self, password: str) -> PermsAcquired:
    """Decrypts the document with ``password`` through the parser and stores
    the resulting permissions in :attr:`access_level`.

    Returns:
        The access level now stored in :attr:`access_level`.
    """
    acquired = super().decrypt(password)
    self.access_level = acquired
    return self.access_level
@property
def flattened_pages(self) -> Generator[Page, None, None]:
    """A generator over the pages of the document, produced by flattening
    the page tree."""
    tree_root = self.page_tree
    return flatten_pages(tree_root, pdf=self)
@property
def page_layout(self) -> PageLayout:
    """The page layout to use when opening the document. One of:

    - SinglePage: Display one page at a time (default).
    - OneColumn: Display the pages in one column.
    - TwoColumnLeft: Display the pages in two columns, with odd-numbered pages
      on the left.
    - TwoColumnRight: Display the pages in two columns, with odd-numbered pages
      on the right.
    - TwoPageLeft: Display the pages two at a time, with odd-numbered
      pages on the left (PDF 1.5).
    - TwoPageRight: Display the pages two at a time, with odd-numbered
      pages on the right (PDF 1.5).
    """
    raw_layout = self.catalog.get("PageLayout")
    if not is_null_like(raw_layout):
        decoded = cast(PdfName, raw_layout).value.decode()
        return cast(PageLayout, decoded)

    # Fallback when the catalog does not specify a layout.
    return "SinglePage"
@page_layout.setter
def page_layout(self, layout: PageLayout) -> None:
    # The catalog stores the layout as a name object, so encode it to bytes.
    layout_name = PdfName(layout.encode())
    self.catalog["PageLayout"] = layout_name
@property
def page_mode(self) -> PageMode:
    """How the document shall be displayed when opened. One of:

    - UseNone: Neither document outline nor thumbnail images visible (default).
    - UseOutlines: Document outline visible.
    - UseThumbs: Thumbnail images visible.
    - FullScreen: Full-screen mode, with no menu bar, window controls, or any
      other window visible.
    - UseOC: Optional content group panel visible (PDF 1.5).
    - UseAttachments: Attachments panel visible (PDF 1.6).
    """
    raw_mode = self.catalog.get("PageMode")
    if not is_null_like(raw_mode):
        decoded = cast(PdfName, raw_mode).value.decode()
        return cast(PageMode, decoded)

    # Fallback when the catalog does not specify a mode.
    return "UseNone"
@page_mode.setter
def page_mode(self, mode: PageMode) -> None:
    # The catalog stores the mode as a name object, so encode it to bytes.
    mode_name = PdfName(mode.encode())
    self.catalog["PageMode"] = mode_name
@property
def language(self) -> str | None:
    """A language identifier specifying the natural language for all text in
    the document, except where overridden by language specifications for
    structure elements or marked content.

    Read from the ``Lang`` entry of the catalog; ``None`` is returned if the
    catalog has no such entry. If this entry is absent or invalid, the
    language shall be considered unknown.

    See ISO 32000-2:2020 § 14.9.2 "Natural language specification" for details.
    """
    raw_lang = self.catalog.get("Lang")
    if not is_null_like(raw_lang):
        return parse_text_string(cast("PdfHexString | bytes", raw_lang))

    return None
@language.setter
def language(self, text: str | None) -> None:
    # The getter reports a missing entry as ``None``, so accept ``None`` here
    # to remove the entry, as the other optional catalog setters do.
    if text is None:
        self.catalog.pop("Lang", None)
        return

    # ``Lang`` is stored as an encoded PDF text string.
    self.catalog["Lang"] = encode_text_string(text)
@property
def access_permissions(self) -> UserAccessPermissions | None:
    """User access permissions relating to the document, if any.

    ``None`` is returned when the document has no encryption or when the
    encryption dictionary has no ``P`` entry.

    See :class:`.UserAccessPermissions` for details.
    """
    if not self.has_encryption:
        return None

    encrypt_dict = cast(PdfDictionary, self.trailer["Encrypt"])
    perms = encrypt_dict.get("P")
    if is_null_like(perms):
        return None

    return UserAccessPermissions(perms)
@property
def pages(self) -> PageList:
    """The page list in the document.

    The list is created on first access and reused afterwards.

    Raises:
        PermissionError: If the current access level is falsy, i.e. the
            document is still encrypted.
    """
    if not self.access_level:
        raise PermissionError("cannot read pages of encrypted document.")

    if self._page_list is not None:
        return self._page_list

    # The list needs both the page tree and the raw reference to it.
    tree_ref = cast(PdfReference, self.catalog.data["Pages"])
    self._page_list = PageList(self, self.page_tree, tree_ref)
    return self._page_list
@property
def viewer_preferences(self) -> ViewerPreferences | None:
    """Settings controlling how a PDF reader shall display the document on
    screen, read from the ``ViewerPreferences`` entry of the catalog.

    ``None`` is returned if the entry is absent, in which case the PDF reader
    should choose its own default preferences.

    See :class:`.ViewerPreferences` for details.
    """
    raw_prefs = self.catalog.get("ViewerPreferences")
    if not is_null_like(raw_prefs):
        return ViewerPreferences.from_dict(cast(PdfDictionary, raw_prefs))

    return None
@viewer_preferences.setter
def viewer_preferences(self, value: ViewerPreferences | None) -> None:
    # Storing and removal (``None``) are both handled by the shared helper.
    self._set_dict_attribute(dest=self.catalog, key="ViewerPreferences", value=value)
@property
def extensions(self) -> ExtensionMap | None:
    """Developer-defined extensions to this document, read from the
    ``Extensions`` entry of the catalog, or ``None`` if there is no such entry.

    This feature was introduced in ISO 32000-1 (PDF 1.7).
    See :class:`.ExtensionMap` for details."""
    raw_extensions = self.catalog.get("Extensions")
    if not is_null_like(raw_extensions):
        return ExtensionMap.from_dict(cast(PdfDictionary, raw_extensions))

    return None
@property
def mark_info(self) -> MarkInfo | None:
    """Information pertaining to the document's conformance to tagged PDF
    conventions, read from the ``MarkInfo`` entry of the catalog, or ``None``
    if there is no such entry.

    See :class:`.MarkInfo` for details.
    """
    raw_mark_info = self.catalog.get("MarkInfo")
    if not is_null_like(raw_mark_info):
        return MarkInfo.from_dict(cast(PdfDictionary, raw_mark_info))

    return None
@property
def page_labels(self) -> PageLabelManager:
    """The page labels for this document, if any.

    A manager is always returned. When the catalog has no ``PageLabels``
    entry, the manager is given a fresh, empty dictionary.
    """
    entry = self.catalog.get("PageLabels")
    labels = PdfDictionary() if is_null_like(entry) else entry
    return PageLabelManager(ensure(labels, PdfDictionary), pdf=self)
@page_labels.deleter
def page_labels(self) -> None:
    # A default is passed to ``pop`` so a missing entry is not an error.
    catalog = self.catalog
    catalog.pop("PageLabels", None)
@property
def open_action(self) -> DestType | Action | None:
    """The destination that shall be displayed or the action that shall be
    performed when the document is opened.

    Read from the ``OpenAction`` entry of the catalog. An array is wrapped in
    a :class:`Destination`, a dictionary is converted with ``action_into``,
    and any other value is returned as is. ``None`` is returned if the
    catalog has no such entry."""
    target = self.catalog.get("OpenAction")
    if is_null_like(target):
        return None

    if isinstance(target, PdfArray):
        return Destination(target)

    if isinstance(target, PdfDictionary):
        return action_into(target)

    return cast(NamedDestination, target)
@open_action.setter
def open_action(self, action: DestType | Action | None) -> None:
    # Storing and removal (``None``) are both handled by the shared helper.
    self._set_dict_attribute(dest=self.catalog, key="OpenAction", value=action)
def _set_dict_attribute(
    self, dest: PdfDictionary, key: str, value: PdfObject | None, indirect: bool = True
) -> None:
    """Stores ``value`` under ``key`` in ``dest``, or removes the key.

    Arguments:
        dest: The dictionary to modify. Its raw ``data`` mapping is used.
        key: The entry to set or remove.
        value: The object to store. ``None`` removes the entry. A reference
            is stored as is.
        indirect: If true (the default), ``value`` is kept as a separate
            object of the document: an existing reference under ``key`` has
            its target overwritten, otherwise a new object is added and the
            entry is set to its reference. If false, ``value`` is stored
            directly in ``dest``.
    """
    raw = dest.data

    if value is None:
        raw.pop(key, None)
        return

    # References, and values requested to be direct, go straight into the dictionary.
    if isinstance(value, PdfReference) or not indirect:
        raw[key] = value
        return

    existing = raw.get(key)
    if isinstance(existing, PdfReference):
        # Reuse the object slot the entry already points to.
        self.objects[existing.object_number] = value
    else:
        raw[key] = self.objects.add(value)