Source code for pdfnaut.page_list

from __future__ import annotations

import sys
from collections.abc import Generator, Iterable, MutableSequence
from typing import Any, Iterator, cast, overload

from typing_extensions import Self

from .common.copying import clone_into_document, copy_object
from .cos.objects import PdfArray, PdfDictionary, PdfName, PdfReference
from .cos.parser import PdfParser
from .objects.page import Page



[docs]
def flatten_pages(
    root: PdfDictionary, *, pdf: PdfParser | None = None
) -> Generator[Page, None, None]:
    """Walks the page tree under ``root`` and yields every :class:`.Page` in it.

    Each entry of ``root["Kids"]`` is resolved through its reference. Nodes typed
    ``Pages`` are descended into recursively; nodes typed ``Page`` are wrapped with
    :meth:`.Page.from_dict` and yielded in the order they are encountered. Nodes
    of any other type are skipped.

    Args:
        root: A page tree node. It must have a ``Kids`` entry.
        pdf: Handed unchanged to :meth:`.Page.from_dict` for every yielded page.

    Yields:
        Page: The pages found under ``root``, in traversal order.
    """

    child_refs: list[PdfReference] = cast(PdfArray, root["Kids"]).data

    for child_ref in child_refs:
        node = cast(PdfDictionary, child_ref.get())
        node_type = cast(PdfName, node["Type"]).value

        if node_type == b"Page":
            # leaf: wrap it, keeping the reference it was reached through
            yield Page.from_dict(node, pdf=pdf, indirect_ref=child_ref)
        elif node_type == b"Pages":
            # intermediate node: recurse
            yield from flatten_pages(node, pdf=pdf)




[docs]
class PageList(MutableSequence[Page]):
    """A mutable sequence representing the pages in a document.

    .. warning::
        This class isn't designed to be constructed by a user. To access the page list
        of a PDF, use :attr:`.PdfDocument.pages`.
    """


[docs]
    def __init__(
        self,
        pdf: PdfParser,
        root_tree: PdfDictionary,
        root_tree_ref: PdfReference,
    ) -> None:
        """Binds the list to a root page tree and indexes its pages.

        Args:
            pdf: The parser whose object store the page list reads and modifies.
            root_tree: The root node of the page tree.
            root_tree_ref: The indirect reference pointing at ``root_tree``.
        """
        self._pdf, self._root_tree, self._root_tree_ref = pdf, root_tree, root_tree_ref

        # Flatten the tree once up front, then remember the tree's hash so later
        # accesses can tell whether the tree has changed since.
        self._indexed_page_cache = [*flatten_pages(root_tree, pdf=pdf)]
        self._last_hash = hash(root_tree)


    def _update_on_hash(self) -> None:
        # process: if the page tree has changed, only replace the pages
        # in the indexed page cache that have also changed.

        if self._last_hash == hash(self._root_tree):
            return

        page_list: list[Page] = []

        for idx, page in enumerate(flatten_pages(self._root_tree, pdf=self._pdf)):
            if 0 <= idx < len(self._indexed_page_cache):
                # page in list, check if it is different.
                prev_page = self._indexed_page_cache[idx]
                if hash(prev_page) != hash(page):
                    page_list.append(page)
                else:
                    page_list.append(prev_page)
            else:
                # page not in list, simply append.
                page_list.append(page)

        self._last_hash = hash(self._root_tree)
        self._indexed_page_cache = page_list

    def _get_indexed_pages(self) -> list[Page]:
        self._update_on_hash()
        return self._indexed_page_cache

    def __repr__(self) -> str:
        return repr(self._indexed_page_cache)

    # * mutable sequence methods
    def __len__(self) -> int:
        return len(self._get_indexed_pages())

    def __contains__(self, value: object) -> bool:
        return value in self._get_indexed_pages()

    def __iter__(self) -> Iterator[Page]:
        return iter(self._get_indexed_pages())

    def __reversed__(self) -> Iterator[Page]:
        return reversed(self._get_indexed_pages())

    @overload
    def __getitem__(self, index: int) -> Page: ...

    @overload
    def __getitem__(self, index: slice) -> list[Page]: ...

    def __getitem__(self, index: int | slice) -> Page | list[Page]:
        return self._get_indexed_pages()[index]

    @overload
    def __setitem__(self, index: int, value: Page) -> None: ...

    @overload
    def __setitem__(self, index: slice, value: Iterable[Page]) -> None: ...

    def __setitem__(self, index: int | slice, value: Page | Iterable[Page]) -> None:
        if isinstance(index, slice):
            raise NotImplementedError

        assert isinstance(value, Page)

        result, _ = self._get_tree_with_index(self._root_tree, self._root_tree_ref, index)
        if result is None:
            raise IndexError("page tree assignment index out of range")

        tree, _, tree_idx = result

        value = self._add_page_to_obj_store(value)

        # delete the page being replaced from the object store
        replacing_ref = tree["Kids"].data[tree_idx]
        self._pdf.objects.delete(replacing_ref.object_number)
        self._indexed_page_cache[index].indirect_ref = None
        self._indexed_page_cache[index].pdf = None

        # set the page
        tree["Kids"][tree_idx] = value.indirect_ref
        self._indexed_page_cache[index] = value

    def __delitem__(self, index: int | slice) -> None:
        if isinstance(index, slice):
            raise NotImplementedError

        self.pop(index)

    def __iadd__(self, values: Iterable[Page]) -> Self:
        """Implements ``pages += values`` by extending the list in place.

        Returns:
            Self: This same page list, as in-place operators require.
        """
        self.extend(values)

        return self


[docs]
    def index(self, value: Any, start: int = 0, stop: int = sys.maxsize) -> int:
        """Returns the index at which page ``value`` was first found in the
        range of ``start`` included to ``stop`` excluded."""
        return self._get_indexed_pages().index(value, start, stop)



[docs]
    def count(self, value: Any) -> int:
        """Returns the amount of times page ``value`` appears in the page list.

        This method should in practice always return either 0 (the page is not present)
        or 1 (the page is present). This method is provided for compatibility
        with functions expecting mutable sequences.
        """
        return self._get_indexed_pages().count(value)



[docs]
    def insert(self, index: int, value: Page) -> None:
        """Inserts page ``value`` before the page currently at ``index``.

        An ``index`` at or past the end appends the page; an ``index`` further
        back than the start of the list inserts at the front. Other negative
        indices count from the end.

        The page is copied into the document's object store before it is linked
        into the page tree. The page list itself keeps the ``value`` object that
        was passed in. If ``value`` had no indirect reference yet, it receives the
        reference of the stored copy.

        .. note::
            When adding a page belonging to a different document, the page will
            likely refer to resources that are part of that document such as
            fonts, images, and annotations.

            Some of these resources cannot be reliably copied, so they may not be
            added to this document, in which case the references to such
            resources are simply marked null.

            Annotations that point to destinations not within the page will be
            preserved but not in working order. Form objects will not be copied
            at all.
        """
        page_count = len(self)

        if index >= page_count:
            # at or past the end: same as appending
            self._append_pages_to_tree([value])
            return

        index = 0 if index < -page_count else self._pos_idx_of(index)

        stored_page = self._add_page_to_obj_store(value)

        if self._get_indexed_pages():
            # document has pages: find the tree node holding the page at ``index``
            location, _ = self._get_tree_with_index(
                self._root_tree, self._root_tree_ref, index
            )
        else:
            # document has no pages: insert straight into the root page tree
            location = (self._root_tree, self._root_tree_ref, index)

        # This should always be the case but, for good measure, we check it.
        assert location is not None, f"expected tree for index {index}."

        tree, tree_ref, tree_idx = location

        self._insert_page_into_tree(stored_page, tree_idx, tree=tree, tree_ref=tree_ref)
        self._indexed_page_cache.insert(index, value)



[docs]
    def append(self, value: Page) -> None:
        """Adds page ``value`` after the last page of the page list.

        This is :meth:`PageList.insert` called with the current page count as the
        index. If the page comes from a different document, the considerations
        noted in :meth:`PageList.insert` apply here as well.
        """
        end_of_list = len(self)
        self.insert(end_of_list, value)



[docs]
    def clear(self) -> None:
        raise NotImplementedError



[docs]
    def reverse(self) -> None:
        raise NotImplementedError



[docs]
    def extend(self, values: Iterable[Page]) -> None:
        """Appends a list of pages ``values`` into the page list.

        When extending, all pages will be copied and inserted into the last page
        tree within the page list.

        ``values`` is fully consumed before the first page is added, so it is safe
        to pass the page list itself (``pages.extend(pages)``) or any iterator
        derived from it.

        If any of the pages belong to a different document, please refer to the note in
        :meth:`PageList.insert` for additional considerations.
        """
        # Snapshot first. Iterating this page list hands out an iterator over the
        # very list that appended pages are added to, so extending the list with
        # itself would otherwise keep finding new items and never finish.
        self._append_pages_to_tree(list(values))



[docs]
    def pop(self, index: int = -1) -> Page:
        """Removes the page at ``index``.

        Only the page object is removed from the document and its reference is
        invalidated. The resources used by the page are not removed as they may
        be used later on in other pages.

        Raises:
            IndexError: The page list is empty or the index does not exist.

        Returns:
            Page: The page object that was popped.
        """
        index = self._pos_idx_of(index)

        if self._get_indexed_pages():
            # document has pages, traverse the tree and insert at location
            result, _ = self._get_tree_with_index(self._root_tree, self._root_tree_ref, index)
        else:
            result = None

        if result is not None:
            tree, _, tree_idx = result
        else:
            tree = self._root_tree
            tree_idx = index

        # delete the page from the tree
        self._delete_page_in_tree(tree_idx, tree)
        output = self._indexed_page_cache.pop(index)

        # delete the page from the object store
        if output.indirect_ref is not None:
            self._pdf.objects.delete(output.indirect_ref.object_number)
            output.indirect_ref = None
            output.pdf = None

        return output



[docs]
    def remove(self, value: Page) -> None:
        """Removes the first occurrence of page ``value`` in the document.

        Raises:
            IndexError: The page list is empty or the page is not in this document.
        """
        index = self.index(value)
        value.indirect_ref = None
        value.pdf = None

        self.pop(index)


    # * helper methods
    def _pos_idx_of(self, index: int) -> int:
        # positive index is within 0 and len(self), both inclusive
        # if index < 0, index = len(self) - abs(index)

        if index >= 0:
            return min(index, len(self))

        return len(self) - abs(index)

    def _add_page_to_obj_store(self, page: Page) -> Page:
        """Stores a copy of ``page`` in the document and returns the page to link.

        A page that already has an indirect reference is assumed to come from a
        different document and is cloned into this one; a page without one is
        assumed to be new and is copied. Either way the copy loses its ``Parent``
        entry before being added to the object store.

        Returns:
            Page: ``page`` itself, now bound to this document and sharing the
            stored copy's data, if it had no indirect reference; otherwise a new
            :class:`.Page` wrapping the stored copy.
        """
        if page.indirect_ref is None:
            # no indirect reference, assume new page and create copy.
            duplicate = copy_object(page)
        else:
            # page has an indirect ref, assume page comes from different
            # document and create copy.
            duplicate = clone_into_document(self._pdf, page, ignore_keys=["Parent"])

        page_dict = cast(PdfDictionary, duplicate)
        page_dict.pop("Parent", None)

        stored_ref = self._pdf.objects.add(page_dict)
        stored_page = Page.from_dict(page_dict, pdf=self._pdf, indirect_ref=stored_ref)

        if page.indirect_ref is not None:
            # the input keeps its existing reference; hand back the stored copy
            return stored_page

        # only set the reference if the page has none.
        page.pdf = self._pdf
        page.indirect_ref = stored_ref
        page.data = stored_page.data
        return page

    def _insert_page_into_tree(
        self, page: Page, tree_index: int, *, tree: PdfDictionary, tree_ref: PdfReference
    ) -> None:
        if page.indirect_ref is None:
            raise ValueError("Page has no indirect reference assigned to it.")

        tree["Kids"].insert(tree_index, page.indirect_ref)
        tree["Count"] += 1

        page["Parent"] = tree_ref

        parent = tree
        while (parent := parent.get("Parent")) is not None:
            parent["Count"] += 1

    def _append_pages_to_tree(self, values: Iterable[Page]) -> None:
        """Adds each page in ``values`` to the end of the last page tree node.

        The target node is looked up once through ``_get_last_tree``. Every page
        is stored in the document, linked in after the node's current last kid,
        and the page object that was passed in is appended to the page cache.
        """
        target_tree, target_ref = self._get_last_tree(self._root_tree, self._root_tree_ref)

        for value in values:
            stored_page = self._add_page_to_obj_store(value)
            end_of_kids = len(target_tree["Kids"])

            self._insert_page_into_tree(
                stored_page, end_of_kids, tree=target_tree, tree_ref=target_ref
            )
            self._indexed_page_cache.append(value)

    def _delete_page_in_tree(self, tree_index: int, tree: PdfDictionary) -> None:
        tree.data["Kids"].pop(tree_index)

        tree["Count"] -= 1

        parent = tree
        while (parent := parent.get("Parent")) is not None:
            parent["Count"] -= 1

    def _get_last_tree(
        self, root: PdfDictionary, root_ref: PdfReference
    ) -> tuple[PdfDictionary, PdfReference]:
        kids = cast(PdfArray[PdfReference], root["Kids"]).data
        result = (root, root_ref)

        for page_ref in kids:
            page = page_ref.get()
            type_ = cast(PdfName, page["Type"])

            if type_.value == b"Pages":
                result = self._get_last_tree(page, page_ref)

        return result

    def _get_tree_with_index(
        self, root: PdfDictionary, root_ref: PdfReference, index: int
    ) -> tuple[tuple[PdfDictionary, PdfReference, int] | None, int]:
        """Locates the tree node that directly holds the page at flattened ``index``.

        The kids of ``root`` are visited in order. Every ``Page`` kid passed
        lowers the remaining index by one; ``Pages`` kids are searched
        recursively with whatever index remains. The first ``Page`` kid reached
        with a remaining index of zero or less is the match, so a negative
        ``index`` matches the first page found.

        Returns:
            A pair ``(location, remaining)``. ``location`` is
            ``(tree, tree_ref, position)`` -- the node whose ``Kids`` holds the
            page, the reference to that node, and the page's position in those
            ``Kids`` -- or ``None`` if this subtree ran out of pages first.
            ``remaining`` is ``index`` minus the number of pages skipped.
        """
        remaining = index
        child_refs: list[PdfReference] = root["Kids"].data

        for position, child_ref in enumerate(child_refs):
            node = child_ref.get()
            node_type = cast(PdfName, node["Type"]).value

            if node_type == b"Page":  # page node
                if remaining <= 0:
                    return (root, root_ref, position), remaining

                remaining -= 1
            elif node_type == b"Pages":  # intermediate node
                found, remaining = self._get_tree_with_index(node, child_ref, remaining)
                if found is not None:
                    return (found, remaining)

        return (None, remaining)