from __future__ import annotations
import datetime
import xml.dom.minidom as minidom
from textwrap import dedent
from typing import Any
from xml.parsers import expat
import pdfnaut
from pdfnaut.common.dates import encode_iso8601, parse_iso8601
from pdfnaut.cos.objects import PdfDictionary, PdfName, PdfStream
from pdfnaut.exceptions import PdfParseError
# Conventional prefix -> namespace URI table for the XML namespaces this module
# works with. The prefix is used when a namespace has to be declared in a packet
# that does not declare it yet.
namespaces = dict(
    pdf="http://ns.adobe.com/pdf/1.3/",
    rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    dc="http://purl.org/dc/elements/1.1/",
    xmp="http://ns.adobe.com/xap/1.0/",
)
[docs]
def get_full_text(element: minidom.Element) -> str:
    """Concatenate, in document order, all text found anywhere below ``element``.

    Text nodes contribute their ``data``; any other child exposing ``childNodes``
    is descended into recursively. Children with neither contribute nothing.
    """

    def text_of(child: minidom.Node) -> str:
        if isinstance(child, minidom.Text):
            return child.data
        if hasattr(child, "childNodes"):
            return get_full_text(child)
        return ""

    return "".join(text_of(child) for child in element.childNodes)
[docs]
def lookup_prefix_for_ns(node: minidom.Node, namespace: str) -> tuple[str, minidom.Node] | None:
    """Locates a namespace prefix matching the ``namespace`` URI in ``node``.

    Returns either a tuple of two items containing, in order, the prefix of the
    namespace URI and the element where it was found, or None, if no prefix is
    registered for the namespace URI.

    This is an implementation of https://dom.spec.whatwg.org/#dom-node-lookupprefix
    and https://dom.spec.whatwg.org/#locate-a-namespace-prefix.

    Arguments:
        node: The node the lookup starts from.
        namespace: The namespace URI to find a prefix for.
    """
    # Step 1: pick the element the search starts at, depending on the node kind.
    if isinstance(node, minidom.Element):
        element = node
    elif isinstance(node, minidom.Document):
        # A document delegates to its document element. (``ownerDocument`` is
        # always None on a minidom Document, so it cannot be used here.)
        element = node.documentElement
    elif isinstance(node, (minidom.DocumentFragment, minidom.DocumentType)):
        return None
    elif isinstance(node, minidom.Attr):
        element = node.ownerElement
    else:
        element = node.parentNode

    # Step 2: walk up through parent *elements* only. Stopping at the first
    # non-element (the Document, a fragment, or None) ends the search.
    while isinstance(element, minidom.Element):
        if element.namespaceURI == namespace and element.prefix:
            return (element.prefix, element)

        for attrib in element.attributes.values():
            if attrib.prefix == "xmlns" and attrib.value == namespace:
                return (attrib.localName, element)

        element = element.parentNode

    return None
[docs]
class XMPProperty:
    """An XMP property included in an XMP packet.

    This is the shared base of the property descriptors in this module. It locates
    the XML element of the property below ``xmp.rdf_root``, caches it per metadata
    instance in ``xmp._xml_properties`` (keyed by the property object itself), and
    writes the re-serialized packet back through ``xmp.stream.modify`` after every
    change.
    """

    def __init__(self, namespace_uri: str, local_name: str, **extra: Any) -> None:
        self.namespace_uri = namespace_uri
        """The namespace URI of this property."""

        self.local_name = local_name
        """The local name of this property."""

        self.extra = extra
        """Any additional property-specific values."""

    def _get_xml_property(self, xmp: XmpMetadata) -> minidom.Element | None:
        """Gets the XML property element for this property on the given metadata instance."""
        return xmp._xml_properties.get(self)

    def _set_xml_property_cache(self, xmp: XmpMetadata, element: minidom.Element | None) -> None:
        """Sets the XML property element for this property on the given metadata instance.

        Passing None drops the cached element, if there is one.
        """
        if element is None:
            xmp._xml_properties.pop(self, None)
        else:
            xmp._xml_properties[self] = element

    def _fetch_xml_property(self, xmp: XmpMetadata) -> None:
        """Retrieves the current XML property element from the XMP packet.

        Does nothing if an element is already cached. Otherwise the first element
        below ``xmp.rdf_root`` matching this property's namespace URI and local name
        is cached, or the cache entry is cleared if there is no such element.
        """
        if self._get_xml_property(xmp) is not None:
            return

        candidates = xmp.rdf_root.getElementsByTagNameNS(self.namespace_uri, self.local_name)
        self._set_xml_property_cache(xmp, candidates[0] if candidates else None)

    @staticmethod
    def _write_packet(xmp: XmpMetadata) -> None:
        """Serializes the packet and stores the result in the metadata stream."""
        xmp.stream.modify(xmp.packet.toprettyxml().encode())

    @staticmethod
    def _replace_children(element: minidom.Element, nodes: list[minidom.Node]) -> None:
        """Replaces all children of ``element`` with ``nodes`` through the DOM API.

        ``removeChild``/``appendChild`` keep the parent and sibling links of both the
        removed and the inserted nodes consistent, which assigning to ``childNodes``
        directly does not.
        """
        replacement = list(nodes)
        for stale in list(element.childNodes):
            element.removeChild(stale)

        for node in replacement:
            element.appendChild(node)

    @staticmethod
    def _declare_namespace(element: minidom.Element, prefix: str, namespace_uri: str) -> None:
        """Adds an ``xmlns:prefix`` declaration for ``namespace_uri`` to ``element``.

        The attribute is created namespace-aware so that it carries the ``xmlns``
        prefix ``lookup_prefix_for_ns`` looks for. A declaration added with plain
        ``setAttribute`` serializes identically but is invisible to that lookup.
        """
        from xml.dom import XMLNS_NAMESPACE

        element.setAttributeNS(XMLNS_NAMESPACE, f"xmlns:{prefix}", namespace_uri)

    def _set_xml_property(self, xmp: XmpMetadata, nodes: list[minidom.Node]) -> None:
        """Sets the children of the XML property element to the items in ``nodes``.

        If the property is not in the packet yet, a new element is created for it and
        the namespace is declared on ``xmp.rdf_root`` if nothing declares it. The
        packet is written back to the stream afterwards.

        Raises:
            KeyError: If the namespace is not declared in the packet and has no
                entry in ``namespaces`` to take a prefix from.
        """
        self._fetch_xml_property(xmp)

        xml_property = self._get_xml_property(xmp)
        if xml_property is not None:
            # This property is present in the document.
            # Simply replace its children with the new nodes.
            self._replace_children(xml_property, nodes)
            self._write_packet(xmp)
            return

        # We will have to make a new property.
        # Let's first check the document to see if our namespace is already registered.
        prefix: str | None = None
        container: minidom.Node | None = None

        prefix_and_node = lookup_prefix_for_ns(xmp.rdf_root, self.namespace_uri)
        if prefix_and_node is not None:
            # The prefix is in scope at the RDF root. The declaration itself may sit
            # on an ancestor, but the property must stay below the root, which is
            # the only place _fetch_xml_property searches.
            prefix, container = prefix_and_node[0], xmp.rdf_root
        else:
            # Then check the children
            for child in xmp.rdf_root.childNodes:
                prefix_and_node = lookup_prefix_for_ns(child, self.namespace_uri)
                if prefix_and_node is not None:
                    prefix, container = prefix_and_node
                    break

        if prefix and container is not None:
            # An element with this namespace is registered, create the element.
            element = xmp.packet.createElementNS(self.namespace_uri, f"{prefix}:{self.local_name}")
        else:
            # No prefix for namespace in document, register it in parent.
            prefix_by_uri = {uri: known_prefix for known_prefix, uri in namespaces.items()}
            new_prefix = prefix_by_uri[self.namespace_uri]  # KeyError for unknown namespaces

            self._declare_namespace(xmp.rdf_root, new_prefix, self.namespace_uri)
            element = xmp.packet.createElementNS(
                self.namespace_uri, f"{new_prefix}:{self.local_name}"
            )
            container = xmp.rdf_root

        # Insert the new element
        self._replace_children(element, nodes)
        container.appendChild(element)

        self._set_xml_property_cache(xmp, element)
        self._write_packet(xmp)

    def _get_rdf_prefix(self, xmp: XmpMetadata) -> str:
        """Gets the prefix used for the RDF namespace by this XMP packet, creating
        one if no prefix exists."""
        prefix_and_node = lookup_prefix_for_ns(xmp.rdf_root, namespaces["rdf"])
        if prefix_and_node is not None:
            prefix, _ = prefix_and_node
            return prefix

        self._declare_namespace(xmp.rdf_root, "rdf", namespaces["rdf"])
        return "rdf"

    def _delete_xml_property(self, xmp: XmpMetadata) -> None:
        """Deletes the current XML property element from the packet.

        Does nothing if the property is not present. Otherwise the element is
        detached from its parent and unlinked, the cache entry is dropped, and the
        packet is written back to the stream.

        Raises:
            PdfParseError: If the element has no parent or no owner document.
        """
        self._fetch_xml_property(xmp)

        xml_property = self._get_xml_property(xmp)
        if xml_property is None:
            return

        parent = xml_property.parentNode
        if parent is None:
            raise PdfParseError("cannot remove XMP property because it has no parent")

        removed = parent.removeChild(xml_property)
        if (owner := removed.ownerDocument) is not None:
            xmp.packet = owner
            removed.unlink()
        else:
            raise PdfParseError("could not set property because XMP document is null")

        self._set_xml_property_cache(xmp, None)
        self._write_packet(xmp)
[docs]
class XMPTextProperty(XMPProperty):
    """An XMP Text property -- a possibly empty Unicode string.

    Reading yields the full text of the property element, or None when there is no
    usable element. Assigning None or deleting the attribute removes the property.
    """

    def __get__(self, xmp: XmpMetadata, objtype: Any | None = None) -> str | None:
        self._fetch_xml_property(xmp)

        element = self._get_xml_property(xmp)
        if not element:
            return None

        return get_full_text(element)

    def __set__(self, xmp: XmpMetadata, value: str | None) -> None:
        self._fetch_xml_property(xmp)

        if value is not None:
            # The whole value is stored as a single text node.
            replacement = [xmp.packet.createTextNode(value)]
            self._set_xml_property(xmp, replacement)
        else:
            self._delete_xml_property(xmp)

    def __delete__(self, xmp: XmpMetadata) -> None:
        self._delete_xml_property(xmp)
[docs]
class XMPLangAltProperty(XMPProperty):
    """An XMP Language Alternative property -- an alternative array of simple text items
    facilitating the selection of a text item based on a desired language.

    In this case, this array is represented as a mapping of language names to text items
    corresponding to each language. The language name should be a value as defined in RFC
    3066, composed of a primary language subtag and an optional series of subsequent subtags.

    The default value, if known, should be the first item in the dictionary. A default
    value may also be explicitly marked by setting its language to 'x-default'.

    See https://developer.adobe.com/xmp/docs/XMPNamespaces/XMPDataTypes/#language-alternative.
    """

    def __get__(self, xmp: XmpMetadata, objtype: Any | None = None) -> dict[str, str] | None:
        """Returns the alternatives keyed by their ``xml:lang`` value, in document order.

        Returns None if the property is absent or has no ``rdf:Alt`` container.
        """
        self._fetch_xml_property(xmp)

        xml_property = self._get_xml_property(xmp)
        if xml_property is None:
            return None

        alt = xml_property.getElementsByTagNameNS(namespaces["rdf"], "Alt")
        if not alt:
            return None

        return {
            item.attributes["xml:lang"].value: get_full_text(item)
            for item in alt[0].getElementsByTagNameNS(namespaces["rdf"], "li")
        }

    def __set__(self, xmp: XmpMetadata, value: dict[str, str] | None) -> None:
        """Replaces the property with an ``rdf:Alt`` array holding one ``rdf:li`` per
        language, or deletes the property if ``value`` is None."""
        self._fetch_xml_property(xmp)

        if value is None:
            self._delete_xml_property(xmp)
            return

        prefix = self._get_rdf_prefix(xmp)

        alt: minidom.Element = xmp.packet.createElementNS(namespaces["rdf"], f"{prefix}:Alt")
        for lang, val in value.items():
            list_item: minidom.Element = xmp.packet.createElementNS(
                namespaces["rdf"], f"{prefix}:li"
            )
            list_item.setAttribute("xml:lang", lang)
            # appendChild (rather than childNodes.append) also sets the parent link
            # of the text node.
            list_item.appendChild(xmp.packet.createTextNode(val))
            alt.appendChild(list_item)

        self._set_xml_property(xmp, [alt])

    def __delete__(self, xmp: XmpMetadata) -> None:
        self._delete_xml_property(xmp)
[docs]
class XMPListProperty(XMPProperty):  # list being either a sequence or bag
    """An array valued XMP property -- in this context, either an RDF sequence, used
    for ordered arrays, or an RDF bag, used for unordered arrays.

    The local name of the RDF container element is taken from the ``kind`` extra value
    given when the property is constructed.

    See § 7.7 "Array valued XMP properties" in Part 1 of the XMP specification.
    """

    def __get__(self, xmp: XmpMetadata, objtype: Any | None = None) -> list[str] | None:
        """Returns the text of each ``rdf:li`` item of the container, in document order.

        Returns None if the property is absent or has no container of the expected kind.
        """
        self._fetch_xml_property(xmp)

        xml_property = self._get_xml_property(xmp)
        if xml_property is None:
            return None

        containers = xml_property.getElementsByTagNameNS(namespaces["rdf"], self.extra["kind"])
        if not containers:
            return None

        return [
            get_full_text(item)
            for item in containers[0].getElementsByTagNameNS(namespaces["rdf"], "li")
        ]

    def __set__(self, xmp: XmpMetadata, value: list[str] | None) -> None:
        """Replaces the property with a container holding one ``rdf:li`` per item, or
        deletes the property if ``value`` is None."""
        self._fetch_xml_property(xmp)

        if value is None:
            self._delete_xml_property(xmp)
            return

        prefix = self._get_rdf_prefix(xmp)
        kind = self.extra["kind"]

        container: minidom.Element = xmp.packet.createElementNS(
            namespaces["rdf"], f"{prefix}:{kind}"
        )
        for item in value:
            list_item: minidom.Element = xmp.packet.createElementNS(
                namespaces["rdf"], f"{prefix}:li"
            )
            # appendChild (rather than childNodes.append) also sets the parent link
            # of the text node.
            list_item.appendChild(xmp.packet.createTextNode(item))
            container.appendChild(list_item)

        self._set_xml_property(xmp, [container])

    def __delete__(self, xmp: XmpMetadata) -> None:
        self._delete_xml_property(xmp)
[docs]
class XMPDateProperty(XMPProperty):
    """An XMP Date property -- an ISO 8601 date string, or specifically, the subset
    specified in https://www.w3.org/TR/NOTE-datetime.

    Reading passes the element text through ``parse_iso8601``; writing stores the
    output of ``encode_iso8601`` as a single text node. Assigning None or deleting
    the attribute removes the property.

    See https://developer.adobe.com/xmp/docs/XMPNamespaces/XMPDataTypes/#date.
    """

    def __get__(self, xmp: XmpMetadata, objtype: Any | None = None) -> datetime.datetime | None:
        self._fetch_xml_property(xmp)

        element = self._get_xml_property(xmp)
        return None if element is None else parse_iso8601(get_full_text(element))

    def __set__(self, xmp: XmpMetadata, value: datetime.datetime | None) -> None:
        self._fetch_xml_property(xmp)

        if value is not None:
            encoded = xmp.packet.createTextNode(encode_iso8601(value))
            self._set_xml_property(xmp, [encoded])
        else:
            self._delete_xml_property(xmp)

    def __delete__(self, xmp: XmpMetadata) -> None:
        self._delete_xml_property(xmp)