Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import warnings
import weakref
import zipfile
from operator import itemgetter

from . import extra

Expand Down Expand Up @@ -2921,6 +2922,8 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
self.is_encrypted = False
self.is_encrypted = False
self.metadata = None
self.has_duplicate_images = False
self.images_xrefs_by_page = None
self.FontInfos = []
self.Graftmaps = {}
self.ShownPages = {}
Expand Down Expand Up @@ -3045,6 +3048,27 @@ def __init__(self, filename=None, stream=None, filetype=None, rect=None, width=0
self.page_count2 = extra.page_count_pdf
else:
self.page_count2 = extra.page_count_fz

# Detect documents in which every page reports the same number of images.
# This code treats that as a sign of "full document duplication", i.e. the
# whole document's images being listed for each page. A single-page document
# has nothing to compare against, so it is never flagged.
# page_count is the number of pages (an int), so it is compared directly;
# calling len() on it would raise TypeError.
if self.page_count > 1:
    first_page_n_images = len(self.get_page_images(0))
    # A single page with a different image count is enough to rule out
    # full document duplication; all() stops at the first such page.
    # The flag is only assigned after the scan has finished, so it keeps
    # its initial value while the pages are being inspected.
    self.has_duplicate_images = all(
        len(page.get_images()) == first_page_n_images
        for page in self.pages(start=1)
    )

if self.has_duplicate_images:
    self.images_xrefs_by_page = []
    for page in self.pages():
        # Store only the xrefs of images referenced by this page.
        page_xrefs = list(map(
            itemgetter("xref"),
            page.get_image_info(xrefs=True)
        ))
        # Append (do not rebind) so the list holds one entry per page and
        # can be indexed by page number, as get_page_images() does.
        self.images_xrefs_by_page.append(page_xrefs)
finally:
JM_mupdf_show_errors = JM_mupdf_show_errors_old

Expand Down Expand Up @@ -5076,7 +5100,14 @@ def get_page_images(self, pno: int, full: bool =False) -> list:
return ()
val = self._getPageInfo(pno, 2)
if not full:
return [v[:-1] for v in val]
val = [v[:-1] for v in val]
if self.has_duplicate_images:
deduplicated_val = []
for v in val:
# v[0] is "xref"
if v[0] in self.images_xrefs_by_page[pno]:
deduplicated_val.append(v)
return deduplicated_val
return val

def get_page_labels(self):
Expand Down
Loading