Source code for pdfnaut.page_list

from __future__ import annotations

import sys
from collections.abc import Generator, Iterable, MutableSequence
from typing import Any, Iterator, cast, overload

from typing_extensions import Self

from .common.copying import clone_into_document, copy_object
from .cos.objects import PdfArray, PdfDictionary, PdfName, PdfReference
from .cos.parser import PdfParser
from .objects.page import Page


def flatten_pages(
    root: PdfDictionary, *, pdf: PdfParser | None = None
) -> Generator[Page, None, None]:
    """Walk the page tree rooted at ``root`` and yield every :class:`.Page` in it.

    The ``Kids`` array of ``root`` is visited in order. A kid whose ``Type`` is
    ``Pages`` is descended into recursively; a kid whose ``Type`` is ``Page`` is
    wrapped with :meth:`Page.from_dict` and yielded. Kids of any other type are
    skipped.

    Arguments:
        root: The page tree node to start from. It must have a ``Kids`` entry.
        pdf: Forwarded unchanged to :meth:`Page.from_dict` for every page.

    Yields:
        Page: Each page found, in the order it is encountered.
    """
    child_refs = cast(list[PdfReference], cast(PdfArray, root["Kids"]).data)

    for child_ref in child_refs:
        node = cast(PdfDictionary, child_ref.get())
        node_type = cast(PdfName, node["Type"]).value

        if node_type == b"Page":
            # leaf: keep the reference so the page knows where it lives.
            yield Page.from_dict(node, pdf=pdf, indirect_ref=child_ref)
            continue

        if node_type == b"Pages":
            # intermediate node: recurse into the subtree.
            yield from flatten_pages(node, pdf=pdf)
class PageList(MutableSequence[Page]):
    """A mutable sequence representing the pages in a document.

    The list is backed by the document's page tree. A flat cache of the pages is
    kept and reconciled with the tree whenever the hash of the root tree changes.

    .. warning::
        This class isn't designed to be constructed by a user. To access the page
        list of a PDF, use :attr:`.PdfDocument.pages`.
    """

    def __init__(
        self,
        pdf: PdfParser,
        root_tree: PdfDictionary,
        root_tree_ref: PdfReference,
    ) -> None:
        """Creates a page list over the page tree ``root_tree``.

        Arguments:
            pdf: The document whose object store holds the pages.
            root_tree: The root node of the page tree.
            root_tree_ref: The indirect reference pointing to ``root_tree``.
        """
        self._pdf = pdf
        self._root_tree = root_tree
        self._root_tree_ref = root_tree_ref

        self._indexed_page_cache = list(flatten_pages(self._root_tree, pdf=self._pdf))
        self._last_hash = hash(self._root_tree)

    def _update_on_hash(self) -> None:
        """Reconciles the page cache with the tree if the root tree hash changed."""
        # process: if the page tree has changed, only replace the pages
        # in the indexed page cache that have also changed.
        if self._last_hash == hash(self._root_tree):
            return

        cached = self._indexed_page_cache
        refreshed: list[Page] = []

        for idx, page in enumerate(flatten_pages(self._root_tree, pdf=self._pdf)):
            if idx < len(cached) and hash(cached[idx]) == hash(page):
                # same page at the same position, keep the object already handed out.
                refreshed.append(cached[idx])
            else:
                # page differs or is beyond the old cache, take the fresh one.
                refreshed.append(page)

        self._last_hash = hash(self._root_tree)
        self._indexed_page_cache = refreshed

    def _get_indexed_pages(self) -> list[Page]:
        """Returns the flat page cache after bringing it up to date."""
        self._update_on_hash()
        return self._indexed_page_cache

    def __repr__(self) -> str:
        return repr(self._indexed_page_cache)

    # * mutable sequence methods
    def __len__(self) -> int:
        return len(self._get_indexed_pages())

    def __contains__(self, value: object) -> bool:
        return value in self._get_indexed_pages()

    def __iter__(self) -> Iterator[Page]:
        return iter(self._get_indexed_pages())

    def __reversed__(self) -> Iterator[Page]:
        return reversed(self._get_indexed_pages())

    @overload
    def __getitem__(self, index: int) -> Page: ...

    @overload
    def __getitem__(self, index: slice) -> list[Page]: ...

    def __getitem__(self, index: int | slice) -> Page | list[Page]:
        return self._get_indexed_pages()[index]

    @overload
    def __setitem__(self, index: int, value: Page) -> None: ...

    @overload
    def __setitem__(self, index: slice, value: Iterable[Page]) -> None: ...

    def __setitem__(self, index: int | slice, value: Page | Iterable[Page]) -> None:
        """Replaces the page at ``index`` with ``value``.

        The replaced page is deleted from the object store and its reference is
        invalidated. Slice assignment is not supported.

        Raises:
            NotImplementedError: ``index`` is a slice.
            TypeError: ``value`` is not a :class:`.Page`.
            IndexError: ``index`` is out of range.
        """
        if isinstance(index, slice):
            raise NotImplementedError

        if not isinstance(value, Page):
            raise TypeError(f"expected a Page, got {type(value).__name__}")

        # len(self) also refreshes the cache before it is indexed below.
        page_count = len(self)
        if not -page_count <= index < page_count:
            raise IndexError("page tree assignment index out of range")

        # the tree lookup treats any index <= 0 as "first page", so negative
        # indices must be made positive before searching.
        if index < 0:
            index += page_count

        result, _ = self._get_tree_with_index(self._root_tree, self._root_tree_ref, index)
        if result is None:
            raise IndexError("page tree assignment index out of range")

        tree, tree_ref, tree_idx = result
        value = self._add_page_to_obj_store(value)

        # delete the page being replaced from the object store
        replacing_ref = tree["Kids"].data[tree_idx]
        self._pdf.objects.delete(replacing_ref.object_number)

        replaced = self._indexed_page_cache[index]
        replaced.indirect_ref = None
        replaced.pdf = None

        # set the page. The stored copy has no Parent (it is stripped when the
        # page is added to the object store), so link it to its new tree here.
        tree["Kids"][tree_idx] = value.indirect_ref
        value["Parent"] = tree_ref
        self._indexed_page_cache[index] = value

    def __delitem__(self, index: int | slice) -> None:
        if isinstance(index, slice):
            raise NotImplementedError

        self.pop(index)

    def __iadd__(self, values: Iterable[Page]) -> Self:
        self.extend(values)
        return self

    def index(self, value: Any, start: int = 0, stop: int = sys.maxsize) -> int:
        """Returns the index at which page ``value`` was first found in the range of
        ``start`` included to ``stop`` excluded.

        Raises:
            ValueError: ``value`` is not in the searched range.
        """
        return self._get_indexed_pages().index(value, start, stop)

    def count(self, value: Any) -> int:
        """Returns the amount of times page ``value`` appears in the page list.

        This method should in practice always return either 0 (the page is not present)
        or 1 (the page is present). This method is provided for compatibility with
        functions expecting mutable sequences.
        """
        return self._get_indexed_pages().count(value)

    def insert(self, index: int, value: Page) -> None:
        """Inserts a page ``value`` at ``index``.

        ``index`` is the index of the page before which to insert. An index at or
        past the end appends; an index below ``-len(self)`` inserts at the start.

        When inserting, the page object is copied into the page list. A page that
        has no indirect reference keeps its identity: the input object itself is
        stored in the list and receives the indirect reference of the inserted
        page. A page that already has an indirect reference is cloned into this
        document and the clone is stored; the input page is left untouched.

        .. note::
            When adding a page belonging to a different document, the page will
            likely refer to resources that are part of the document such as fonts,
            images, and annotations.

            Some of these resources cannot be reliably copied and so it is possible
            that they're not added to the document, in which case, the references of
            such resources are simply marked null.

            Annotations that point to destinations not within the page will be
            preserved but not in working order. Form objects will not be copied
            at all.
        """
        page_count = len(self)

        if index >= page_count:
            self._append_pages_to_tree([value])
            return

        index = 0 if index < -page_count else self._pos_idx_of(index)

        inserting_page = self._add_page_to_obj_store(value)

        if self._get_indexed_pages():
            # document has pages, traverse the tree and insert at location
            result, _ = self._get_tree_with_index(self._root_tree, self._root_tree_ref, index)
        else:
            # document has no pages, insert in root page tree
            result = (self._root_tree, self._root_tree_ref, index)

        # This should always be the case but, for good measure, we check it.
        assert result is not None, f"expected tree for index {index}."

        tree, tree_ref, tree_idx = result
        self._insert_page_into_tree(inserting_page, tree_idx, tree=tree, tree_ref=tree_ref)

        # cache the page that actually lives in this document (same object as
        # ``value`` unless ``value`` had to be cloned).
        self._indexed_page_cache.insert(index, inserting_page)

    def append(self, value: Page) -> None:
        """Appends a page ``value`` to the page list.

        If appending a page from a different document, please refer to the note in
        :meth:`PageList.insert` for additional considerations.
        """
        self.insert(len(self._get_indexed_pages()), value)

    def clear(self) -> None:
        """Not supported; always raises :class:`NotImplementedError`."""
        raise NotImplementedError

    def reverse(self) -> None:
        """Not supported; always raises :class:`NotImplementedError`."""
        raise NotImplementedError

    def extend(self, values: Iterable[Page]) -> None:
        """Appends a list of pages ``values`` into the page list.

        When extending, all pages will be copied and inserted into the last page
        tree within the page list.

        If any of the pages belong to a different document, please refer to the note
        in :meth:`PageList.insert` for additional considerations.
        """
        self._append_pages_to_tree(values)

    def pop(self, index: int = -1) -> Page:
        """Removes the page at ``index``.

        Only the page object is removed from the document and its reference is
        invalidated. The resources used by the page are not removed as they may be
        used later on in other pages.

        Raises:
            IndexError: The page list is empty or the index does not exist.

        Returns:
            Page: The page object that was popped.
        """
        # validate before touching the tree so a bad index cannot remove the
        # wrong page (the tree lookup treats any index <= 0 as the first page).
        page_count = len(self)
        if page_count == 0:
            raise IndexError("pop from empty page list")
        if not -page_count <= index < page_count:
            raise IndexError("pop index out of range")

        index = self._pos_idx_of(index)

        # traverse the tree to find the node holding the page to delete
        result, _ = self._get_tree_with_index(self._root_tree, self._root_tree_ref, index)
        if result is None:
            raise IndexError("pop index out of range")

        tree, _, tree_idx = result

        # delete the page from the tree
        self._delete_page_in_tree(tree_idx, tree)
        output = self._indexed_page_cache.pop(index)

        # delete the page from the object store
        if output.indirect_ref is not None:
            self._pdf.objects.delete(output.indirect_ref.object_number)

        output.indirect_ref = None
        output.pdf = None

        return output

    def remove(self, value: Page) -> None:
        """Removes the first occurrence of page ``value`` in the document.

        Raises:
            ValueError: The page is not in this document.
        """
        index = self.index(value)

        # pop first: it needs the page's reference to delete the object from the
        # store, and ``value`` may be the very object held in the cache.
        self.pop(index)

        value.indirect_ref = None
        value.pdf = None

    # * helper methods
    def _pos_idx_of(self, index: int) -> int:
        """Converts ``index`` into a non-negative index.

        Non-negative values are clamped to ``len(self)``; negative values are
        counted from the end and are not clamped.
        """
        # positive index is within 0 and len(self), both inclusive
        # if index < 0, index = len(self) - abs(index)
        if index >= 0:
            return min(index, len(self))

        return len(self) - abs(index)

    def _add_page_to_obj_store(self, page: Page) -> Page:
        """Copies ``page`` into the document's object store.

        Returns ``page`` itself (now bound to this document) when it had no
        indirect reference, otherwise a new :class:`.Page` wrapping the copy.
        The stored copy never carries a ``Parent`` entry.
        """
        if page.indirect_ref is not None:
            # page has an indirect ref, assume page comes from different
            # document and create copy.
            added_page = clone_into_document(self._pdf, page, ignore_keys=["Parent"])
        else:
            # no indirect reference, assume new page and create copy.
            added_page = copy_object(page)

        added_page = cast(PdfDictionary, added_page)
        added_page.pop("Parent", None)

        page_ref = self._pdf.objects.add(added_page)
        added_page = Page.from_dict(added_page, pdf=self._pdf, indirect_ref=page_ref)

        # only set the reference if the page has none.
        if page.indirect_ref is None:
            page.pdf = self._pdf
            page.indirect_ref = page_ref
            page.data = added_page.data
            return page

        return added_page

    def _insert_page_into_tree(
        self, page: Page, tree_index: int, *, tree: PdfDictionary, tree_ref: PdfReference
    ) -> None:
        """Links ``page`` into ``tree`` at ``tree_index`` and bumps ``Count`` on
        ``tree`` and every ancestor.

        Raises:
            ValueError: ``page`` has no indirect reference.
        """
        if page.indirect_ref is None:
            raise ValueError("Page has no indirect reference assigned to it.")

        tree["Kids"].insert(tree_index, page.indirect_ref)
        tree["Count"] += 1

        page["Parent"] = tree_ref

        parent = tree
        while (parent := parent.get("Parent")) is not None:
            parent["Count"] += 1

    def _append_pages_to_tree(self, values: Iterable[Page]) -> None:
        """Adds every page in ``values`` to the end of the last page tree."""
        # materialize first: ``values`` may be this very list (``pages += pages``),
        # which would otherwise keep growing while being iterated.
        pages_to_add = list(values)

        last_tree, last_tree_ref = self._get_last_tree(self._root_tree, self._root_tree_ref)

        for value in pages_to_add:
            inserting_page = self._add_page_to_obj_store(value)

            self._insert_page_into_tree(
                inserting_page, len(last_tree["Kids"]), tree=last_tree, tree_ref=last_tree_ref
            )
            self._indexed_page_cache.append(inserting_page)

    def _delete_page_in_tree(self, tree_index: int, tree: PdfDictionary) -> None:
        """Unlinks the kid at ``tree_index`` from ``tree`` and decrements ``Count``
        on ``tree`` and every ancestor."""
        tree.data["Kids"].pop(tree_index)
        tree["Count"] -= 1

        parent = tree
        while (parent := parent.get("Parent")) is not None:
            parent["Count"] -= 1

    def _get_last_tree(
        self, root: PdfDictionary, root_ref: PdfReference
    ) -> tuple[PdfDictionary, PdfReference]:
        """Returns the tree node (and its reference) at the end of the right-most
        path under ``root``, i.e. the node where an appended kid becomes the last
        page in document order."""
        kids = cast(PdfArray[PdfReference], root["Kids"]).data
        if not kids:
            return (root, root_ref)

        last_ref = kids[-1]
        last_node = last_ref.get()
        type_ = cast(PdfName, last_node["Type"])

        # only descend when the final kid is itself a tree; if it is a page (or
        # anything else) new pages must go after it, directly under ``root``.
        if type_.value == b"Pages":
            return self._get_last_tree(last_node, last_ref)

        return (root, root_ref)

    def _get_tree_with_index(
        self, root: PdfDictionary, root_ref: PdfReference, index: int
    ) -> tuple[tuple[PdfDictionary, PdfReference, int] | None, int]:
        """Finds the tree node holding the page at flat position ``index``.

        Returns a pair ``(result, remaining)``. ``result`` is
        ``(tree, tree_ref, index_in_kids)`` for the located page, or ``None`` if
        fewer pages than needed exist under ``root``; ``remaining`` is what is
        left of ``index`` after skipping the pages seen. Any ``index`` at or below
        zero matches the first page, so callers must pass a non-negative index.
        """
        kids = cast(PdfArray[PdfReference], root["Kids"].data)

        for tree_index, page_ref in enumerate(kids):
            page = page_ref.get()
            type_ = cast(PdfName, page["Type"])

            if type_.value == b"Pages":
                # intermediate node
                result, index = self._get_tree_with_index(page, page_ref, index)
                if result is not None:
                    return (result, index)
            elif type_.value == b"Page":
                # page node
                if index <= 0:
                    return (root, root_ref, tree_index), index

                index -= 1

        return (None, index)