diff --git a/src/__init__.py b/src/__init__.py
index b06babea0..c3ecd9ae9 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -29,6 +29,7 @@
 import warnings
 import weakref
 import zipfile
+from operator import itemgetter
 
 from . import extra
 
@@ -2921,6 +2922,8 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
             self.is_encrypted = False
             self.is_encrypted = False
             self.metadata = None
+            self.has_duplicate_images = False
+            self.images_xrefs_by_page = None
             self.FontInfos = []
             self.Graftmaps = {}
             self.ShownPages = {}
@@ -3045,6 +3048,29 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
                     self.page_count2 = extra.page_count_pdf
                 else:
                     self.page_count2 = extra.page_count_fz
+
+            # page_count is an int, so it is compared directly (no len())
+            if self.page_count > 1:
+                has_duplicate_images = True
+                first_page_n_images = len(self.get_page_images(0))
+                for page in self.pages(start=1):
+                    # we need at least one page with a different number of images
+                    # to exclude full document duplication
+                    if len(page.get_images()) != first_page_n_images:
+                        has_duplicate_images = False
+                        break
+                self.has_duplicate_images = has_duplicate_images
+
+            if self.has_duplicate_images:
+                self.images_xrefs_by_page = []
+                for page in self.pages():
+                    # store only images referenced by page; one list per page,
+                    # in page order, so get_page_images() can index it by pno
+                    page_xrefs = list(map(
+                        itemgetter("xref"),
+                        page.get_image_info(xrefs=True)
+                    ))
+                    self.images_xrefs_by_page.append(page_xrefs)
         finally:
             JM_mupdf_show_errors = JM_mupdf_show_errors_old
 
@@ -5076,7 +5102,11 @@ def get_page_images(self, pno: int, full: bool =False) -> list:
             return ()
         val = self._getPageInfo(pno, 2)
         if not full:
-            return [v[:-1] for v in val]
+            val = [v[:-1] for v in val]
+        if self.has_duplicate_images:
+            # keep only images actually referenced by this page; v[0] is "xref"
+            page_xrefs = set(self.images_xrefs_by_page[pno])
+            return [v for v in val if v[0] in page_xrefs]
         return val
 
     def get_page_labels(self):