#!/usr/bin/python """Rearrange pages of a PDF document. Each page of the output document can contain several pages of the input document, possibly rotated and scaled. With the --reformat option, the margins of the original pages are cut off, so that more space of the output pages can be used for content. """ import argparse import logging import math import re import string import sys import gi from gi.repository import Gio gi.require_version('Poppler', '0.18') from gi.repository import Poppler import cairo __author__ = "Emmanuel Beffara" __email__ = "manu@beffara.org" __version__ = "2020.03.21" # Pythonic tools {{{1 class Extender (object): """ Hack that enables to extend objects of native types when inheriting classes cannot be used, e.g. for objects returned by functions. """ def __init__ (self, base_object): self._base_object = base_object def __getattr__ (self, name): return getattr(self._base_object, name) def memoizing (function): """Cache the results of a method. This decorator caches the results of a method self.foo by storing computed values into an attribute self._foo_cache. The arguments of the function must be hashable. """ cache_name = '_%s_cache' % function.__name__ def method (self, *args): try: return getattr(self, cache_name)[args] except AttributeError: value = function(self, *args) setattr(self, cache_name, { args : value }) return value except KeyError: value = function(self, *args) getattr(self, cache_name)[args] = value return value return method # Documents and pages {{{1 class Document (Extender): """ A wrapper around Poppler.Document with a extra methods for bounding box computation. """ def __init__ (self, filename): uri = Gio.File.new_for_commandline_arg(filename).get_uri() Extender.__init__(self, Poppler.Document.new_from_file(uri, None)) self.page_cache = {} @memoizing def get_page (self, number): return Page(self, number) def pages (self): """Return an iterator over the pages of the document.""" for page_number in range(0, self.get_n_pages()): yield self.get_page(page_number) def max_crop_box (self): """Get a maximal crop box for the document.""" box = None for page in self.pages(): if box is None: box = page.get_crop_box() else: box.max_with(page.get_crop_box()) return box class Page (Extender): """ A wrapper around Poppler.Page with a extra methods for bounding box computation. """ def __init__ (self, document, number): self.number = number Extender.__init__(self, document._base_object.get_page(number)) @property @memoizing def crop_box (self): """The page size.""" rect = self._base_object.get_crop_box() return Rectangle(rect.x1, rect.y1, rect.x2, rect.y2) def get_image (self): """Get an image of the page, with 1 pixel for 1 point.""" r = self.crop_box image = A8Image(int(r.x2), int(r.y2)) cr = cairo.Context(image.get_surface()) self.render(cr) image.flush() return image @property @memoizing def bbox (self): """The bounding box of the contents of the page.""" box = self.get_image().get_bbox() logging.debug('bbox for page %d: %r' % (self.number, box)) return box class A8Image (Extender): """ A thin wrapper around Cairo's image surfaces that adds bounding box computation. Only 8-bit alpha format is available here. """ def __init__ (self, width, height): Extender.__init__(self, cairo.ImageSurface(cairo.FORMAT_A8, width, height)) def get_surface (self): """Get the Cairo surface associated to the image.""" return self._base_object def get_bbox_native (self): """Compute the bounding box of the contents of the surface. In this method, all computation is done in Python, which is not very efficient. """ buf = self.get_data() x1 = y1 = x2 = y2 = None pos = 0 stride = self.get_stride() for y in range(0, self.get_height()): for x in range(0, self.get_width()): if buf[pos + x] == '\0': continue if y1 is None: y1 = y y2 = y + 1 if x1 is None or x < x1: x1 = x if x2 is None or x > x2: x2 = x + 1 pos += stride if x1 is None: return None return Rectangle(x1,y1,x2,y2) def get_bbox_pil (self): """Compute the bounding box of the contents of the surface. This method uses the computation method from the Python Imaging Library, it is much more efficient that the pure Python one. """ import PIL.Image image = PIL.Image.frombuffer('L', (self.get_width(), self.get_height()), self.get_data(), 'raw', 'L', self.get_stride(), 1) bbox = image.getbbox() if bbox is None: return None return Rectangle(*bbox) # Choose the most efficient version for get_bbox. try: import PIL.Image get_bbox = get_bbox_pil except ImportError: get_bbox = get_bbox_native # Basic geometry {{{1 class Rectangle: """ A rectangle, defined by its extremal coordinates. """ def __init__ (self, x1, y1, x2, y2): self.x1 = x1 self.y1 = y1 self.x2 = x2 self.y2 = y2 def __repr__ (self): return '(%g,%g,%g,%g)' % (self.x1, self.y1, self.x2, self.y2) @property def width (self): return self.x2 - self.x1 @property def height (self): return self.y2 - self.y1 def max_with (self, other): """Compute the bounding box of this rectangle and another one. The object is modified in place, no value is returned. """ self.x1 = min(self.x1, other.x1) self.y1 = min(self.y1, other.y1) self.x2 = max(self.x2, other.x2) self.y2 = max(self.y2, other.y2) def scaled (self, scale): """Return a new rectangle obtained by scaling this one.""" return Rectangle( self.x1 * scale, self.y1 * scale, self.x2 * scale, self.y2 * scale) def shifted (self, dx, dy): """Return a new Rectangle obtained by shifting this one.""" return Rectangle(self.x1 + dx, self.y1 + dy, self.x2 + dx, self.y2 + dy) @staticmethod def bbox (points): """Make a rectangle as the bounding box of a set of points.""" x1 = y1 = x2 = y2 = None for x, y in points: if x1 is None: x1 = x2 = x y1 = y2 = y else: x1 = min(x1, x) y1 = min(y1, y) x2 = max(x2, x) y2 = max(y2, y) if x1 is None: return None return Rectangle(x1, y1, x2, y2) def rotate_bbox (self, angle): """Return the bounding box of this rectangle rotated. The angle is given in radians. """ return Rectangle.bbox([rotate((x, y), angle) for x in (self.x1, self.x2) for y in (self.y1, self.y2)]) def rotate (point, angle): x, y = point return ( math.cos(angle) * x - math.sin(angle) * y, math.sin(angle) * x + math.cos(angle) * y) # Layouts {{{1 class Layout: """ This represents a way of rearranging the pages of a document. """ def __init__ (self, document, opts): """Create a layout using options from the 'opts' object.""" self.document = document self.opts = opts @property @memoizing def chunk_list (self): """The list of page chunks used for layout. Each chunk is a list of input page numbers, in their order in the output document. Chunks may contain occurrences of None to indicate blank pages inserted in the layout. """ pages = ( [None] * self.opts.pad_before + list(self.opts.select.iterate(self.document.get_n_pages())) + [None] * self.opts.pad_after ) logging.debug('logical pages: %r' % pages) if self.opts.chunks is None: chunks = [pages] else: chunks = [] chunk = [] index = 0 for number in pages: chunk.append(number) index += 1 if index == self.opts.chunks: chunks.append(chunk) chunk = [] index = 0 if index > 0: if self.opts.pad_last: chunk.extend([None] * (self.opts.chunks - index)) chunks.append(chunk) if self.opts.folding: chunks = map(self.reorder_for_folding, chunks) logging.debug('page chunks: %r' % chunks) return chunks @property @memoizing def page_numbers (self): """The set of pages actually considered in the source document.""" parts = [] for chunk in self.chunk_list: parts.append(set([x for x in chunk if x is not None])) return set.union(*parts) @staticmethod def reorder_for_folding (iterable): items = list(iterable) length = len(items) # Pad to a multiple of 4 if length % 4 != 0: pad = 4 - (length % 4) items.extend([None]*pad) length += pad # Enumerate pages in order reordered = [] for group in range(0, length/4): reordered.extend([ items[length - 2*group - 1], items[2*group], items[2*group + 1], items[length - 2*group - 2] ]) return reordered @memoizing def source_crop (self, number): """The table of crop boxes (i.e. page sizes) of input pages.""" return self.document.get_page(number).crop_box @memoizing def source_bbox (self, number): """The table of actual bounding boxes of input pages.""" return self.document.get_page(number).bbox @memoizing def page_angle (self, number): """The actual rotation applied to a given page.""" angle = self.opts.angle for (PageList, extra_angle) in self.opts.rotate_pages: if number in PageList: angle += extra_angle return angle @property @memoizing def layout (self): """The complete layout, as a list of output page descriptions. Each page description is a list of tuples with the following items: - input page number (not None) - scaling - angle (in radians, counterclockwise relative to the origin) - horizontal shift (in points) - vertical shift (in points) """ s = self.opts page_width = s.size[0] page_height = s.size[1] cols = s.cols rows = s.rows margin = s.margin halign = s.align[0] valign = s.align[1] scale = s.scale # Compute the positions for output pages width = (page_width - margin) / cols - margin height = (page_height - margin) / rows - margin positions = tuple( (margin + x*(width + margin), margin + y*(height + margin)) for y in range(0, rows) for x in range(0, cols) ) modulo = len(positions) logging.debug('computed layout: %r', positions) # Select the useful part of each page if s.reformat: boxes = self.source_bbox else: boxes = self.source_crop # Compute the scaling factor if scale is None: scales = [] for number in self.page_numbers: box = boxes(number) if box is not None: box = box \ .shifted(-box.x1,-box.y1) \ .rotate_bbox(self.page_angle(number)) scales.append(min(width/box.width, height/box.height)) scale = min(scales) logging.info('scale: %g', scale) # Deduce the transformation for each page layout = [] for chunk in self.chunk_list: index = 0 current_page = [] for number in chunk: if number is not None and boxes(number) is not None: angle = self.page_angle(number) box = boxes(number).scaled(scale).rotate_bbox(angle) px, py = positions[index] px += halign * (width - box.width) - box.x1 py += valign * (height -box.height) - box.y1 current_page.append((number, scale, angle, px, py)) index = (index + 1) % modulo if index == 0: layout.append(current_page) current_page = [] if index != 0: layout.append(current_page) logging.debug('computed layout: %r' % layout) return layout def render (self, cr): """Render the layout on a Cairo context.""" for out_page in self.layout: for page, scale, angle, dx, dy in out_page: cr.identity_matrix() cr.translate(dx, dy) cr.scale(scale, scale) cr.rotate(angle) Page(self.document, page).render(cr) cr.show_page() # Command-line interface {{{1 # Constrained integers {{{2 def positive_int (text): """Convert a string to a strictly positive integer. Raise ArgumentTypeError for any invalid input. """ value = int(text) if value <= 0: raise argparse.ArgumentTypeError("strictly positive integer expected") return value def non_negative_int (text): """Convert a string to a non-negative integer. Raise ArgumentTypeError for any invalid input. """ value = int(text) if value < 0: raise argparse.ArgumentTypeError("non-negative integer expected") return value # Numbers with units {{{2 def parse_float_with_unit (text, units): """Turn a string into a float, using a unit in the string. The argument 'units' maps unit names to conversion factors. A default unit can be specified by including the empty string as a unit name. Unit names are case sensitive. """ text = text.strip() m = re.match(r'[+-]?(\d+[.]?\d*|\d*[.]\d+)([eE][+-]?\d+)?', text) if m is None: raise argparse.ArgumentTypeError("invalid format for float with unit") number = float(m.group(0)) try: number *= units[text[m.end(0):].strip()] except KeyError: raise argparse.ArgumentTypeError("unknown unit of measure") return number def parse_tuple (text, value_parser, separator=',', min_size=0, max_size=None, named_values={}): """Parse a tuple of values of a given type. The argument 'value_parser' is used to parse individual components. A minimum and maximum size for the tuple can be specified. A set of named values can be used, then the input text is first matched against these values, and if no value is found then parsing occurs. """ text = text.strip() if text in named_values: return named_values[text] parts = text.split(separator) if len(parts) < min_size or (max_size is not None and len(parts) > max_size): raise argparse.ArgumentTypeError("wrong number of components") return tuple(value_parser(part.strip()) for part in parts) def curry_first (name, function, *args, **kwargs): def curried (arg1): return function(arg1, *args, **kwargs) curried.__name__ = name return curried ratio = curry_first('ratio', parse_float_with_unit, { '': 1., '%': 0.01, }) angle = curry_first('angle', parse_float_with_unit, { '': math.pi / 180., 'deg': math.pi / 180., 'rad': 1., }) dimension = curry_first('dimension', parse_float_with_unit, { 'pt': 1., 'in': 72., 'cm': 72. / 2.54, 'mm': 72. / 25.4, }) pagesize = curry_first( 'pagesize', parse_tuple, dimension, min_size = 2, max_size = 2, named_values = { 'a0': (dimension( '841 mm'), dimension('1189 mm')), 'a1': (dimension( '594 mm'), dimension( '841 mm')), 'a2': (dimension( '420 mm'), dimension( '594 mm')), 'a3': (dimension( '297 mm'), dimension( '420 mm')), 'a4': (dimension( '210 mm'), dimension( '297 mm')), 'a5': (dimension( '148 mm'), dimension( '210 mm')), 'a6': (dimension( '105 mm'), dimension( '149 mm')), 'a7': (dimension( '74 mm'), dimension( '105 mm')), 'a8': (dimension( '52 mm'), dimension( '74 mm')), 'a9': (dimension( '37 mm'), dimension( '52 mm')), 'a10': (dimension( '26 mm'), dimension( '37 mm')), 'b0': (dimension('1000 mm'), dimension('1414 mm')), 'b1': (dimension( '707 mm'), dimension('1000 mm')), 'b2': (dimension( '500 mm'), dimension( '707 mm')), 'b3': (dimension( '353 mm'), dimension( '500 mm')), 'b4': (dimension( '250 mm'), dimension( '353 mm')), 'b5': (dimension( '176 mm'), dimension( '250 mm')), 'b6': (dimension( '125 mm'), dimension( '176 mm')), 'b7': (dimension( '88 mm'), dimension( '125 mm')), 'b8': (dimension( '62 mm'), dimension( '88 mm')), 'b9': (dimension( '44 mm'), dimension( '62 mm')), 'b10': (dimension( '31 mm'), dimension( '44 mm')), 'letter': (dimension( '8.5 in'), dimension('11 in')), 'note': (dimension( '8.5 in'), dimension('11 in')), 'legal': (dimension( '8.5 in'), dimension('14 in')), 'executive': (dimension( '7.25 in'), dimension('10.5 in')), 'halfletter': (dimension( '5.5 in'), dimension( '8.5 in')), 'halfexecutive': (dimension( '5.25 in'), dimension( '7.25 in')), '11x17': (dimension('11 in'), dimension('17 in')), 'statement': (dimension( '5.5 in'), dimension( '8.5 in')), 'folio': (dimension( '8.5 in'), dimension('13 in')), '10x14': (dimension('10 in'), dimension('14 in')), 'ledger': (dimension('17 in'), dimension('11 in')), 'tabloid': (dimension('11 in'), dimension('17 in')), }) alignment = curry_first( 'alignment', parse_tuple, ratio, min_size = 2, max_size = 2, named_values = { 'center' : (0.5, 0.5), 'top left': (0., 0.), 'top' : (0.5, 0.), 'top right' : (1., 0.), 'left': (0., 0.5), 'center' : (0.5, 0.5), 'right' : (1., 0.5), 'bottom left': (0., 1.), 'bottom' : (0.5, 1.), 'bottom right' : (1., 1.), }) int_pair = curry_first( 'int_pair', parse_tuple, int, min_size = 2, max_size = 2) # Page lists {{{2 class PageList: """ Represents list of ranges of natural numbers, used for lists of page numbers. Caveat: in the input syntax, pages are numbered starting at 1 because the human convention is to do that. In the iterators and membership tests, pages are numbered from 0, because it is the way the document APIs work. In other words, the actual ranges are shifted down by 1 with respect to the syntax parsed by the __init__ method. """ def __init__ (self, text): """Parse a list of ranges using the usual commas and dashes syntax. The precise syntax is as follows: list ::= item ( "," item )* item ::= [ number ] [ "-" [ number ] ] An item consisting of a single number represents a single number, an item consisting of two numbers a and b separated by a dash represents the interval from a to b inclusive, omitting any of the bound represents an interval ubounded on one side. In particular, "-" represents the whole set of natural numbers. The syntax uses page numbers (the first one has number 1) instead of page indices (the first page has index 0), according to human convention. """ number = re.compile(r'\d+') text = text.strip() self.ranges = [] while len(text) > 0: m = number.match(text) if m: first = int(m.group(0)) text = text[m.end():].lstrip() else: first = None if text[:1] == '-': text = text[1:].lstrip() m = number.match(text) if m: last = int(m.group(0)) text = text[m.end():].lstrip() else: last = None self.ranges.append((first, last)) elif first is not None: self.ranges.append((first, first)) if len(text) == 0: break if text[0] != ',': raise argparse.ArgumentTypeError("unexpected character in page list: %r" % text[0]) text = text[1:].lstrip() def iterate (self, number): """Iterate over page indices, assuming a given maximum number.""" for first, last in self.ranges: if first is None: first = 1 if last is None: last = number for i in range(first - 1, last): yield i def __contains__ (self, number): """Test if a page index is in the set.""" for first, last in self.ranges: if first is not None and number < first - 1: continue if last is not None and number >= last: continue return True return False # }}}2 def page_rotation (text): parts = text.split(':') if len(parts) != 2: raise argparse.ArgumentTypeError("wrong syntax") return (PageList(parts[0]), angle(parts[1])) command_line_parser = argparse.ArgumentParser( description=__doc__) command_line_parser.add_argument('--version', action='version', version='%(prog)s ' + __version__, help="show version information and exit") command_line_parser.add_argument('source', help="source PDF document") command_line_parser.add_argument('--output', '-o', default='out.pdf', help="output file name (default: out.pdf)") command_line_parser.add_argument('--log-level', metavar='LEVEL', choices=('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'), default='WARNING', help="set the logging level") layout = command_line_parser.add_argument_group(title="layout options") layout.add_argument('--size', type=pagesize, default=pagesize('a4'), help="page size for output (default: a4)") layout.add_argument('--landscape', action='store_true', help="use landscape orientation for output") layout.add_argument('--portrait', action='store_false', dest='landscape', help="use portrait orientation for output") layout.add_argument('--reformat', action='store_true', help="reformat the input document (default: no)") layout.add_argument('--scale', type=ratio, default=None, help="the scaling factor when reformatting (default: auto)") layout.add_argument('--angle', type=angle, default=0, help="the angle by which pages are rotated (default: 0)") layout.add_argument('--align', type=alignment, default=alignment('center'), help="page alignment when reformatting (default: center)") layout.add_argument('--margin', type=dimension, default=None, help="extra margin in the output document (default: 1cm when reformatting, 0 otherwise)") layout.add_argument('--rows', type=positive_int, default=1, metavar="N", help="put N rows of input pages per output page") layout.add_argument('--cols', type=positive_int, default=1, metavar="N", help="put N columns of input pages per output page") layout.add_argument('--rotate-pages', type=page_rotation, action='append', default=[], metavar="PAGES:ANGLE", help="apply an extra rotation of ANGLE to some PAGES") ordering = command_line_parser.add_argument_group(title="page selection and ordering") ordering.add_argument('--select', type=PageList, default=PageList('-'), metavar="PAGES", help="select a subset of pages in the document") ordering.add_argument('--pad-before', type=non_negative_int, default=0, metavar="N", help="insert N blank pages before the first one") ordering.add_argument('--pad-after', type=non_negative_int, default=0, metavar="N", help="insert N blank pages after the last one") ordering.add_argument('--folding', action='store_true', help="reorder pages for folding") ordering.add_argument('--chunks', type=positive_int, metavar="N", help="treat the input document by chunks of N pages") ordering.add_argument('--pad-last', action='store_true', help="pad the last chunk to the required number of pages") presets = command_line_parser.add_argument_group(title="predefined settings") class NupAction (argparse.Action): """ This command-line action sets default options for N-up printing. The number N of input pages per output page must be a power of two. """ def __call__ (self, parser, opts, number, option=None): if number <= 0: raise argparse.ArgumentError(self, "a positive integer is required") exp = 0 while number & 1 == 0: exp += 1 number >>= 1 if number != 1: raise argparse.ArgumentError(self, "only powers of 2 are supported") side = 1 << (exp >> 1) opts.cols = side if exp & 1 == 0: opts.rows = side else: opts.rows = side + 1 opts.angle = math.pi / 2 opts.landscape = True presets.add_argument('--nup', action=NupAction, type=int, metavar='N', help="print N input pages per output page (powers of 2 only)") class BookletAction (argparse.Action): """ This command-line action sets default options for booklet printing. This includes a default layout of two input pages per output page plus reordering of pages for folding. """ def __call__ (self, parser, opts, number, option=None): opts.cols = 1 opts.rows = 2 opts.angle = math.pi / 2 opts.landscape = True opts.folding = True presets.add_argument('--booklet', action=BookletAction, nargs=0, help="rearrange for duplex printing of a foldable booklet") def parse_options (*args, **kwargs): opts = command_line_parser.parse_args() logging.basicConfig(level=getattr(logging, opts.log_level)) if opts.margin is None: if opts.reformat: opts.margin = dimension('1cm') else: opts.margin = 0. if opts.landscape: opts.size = tuple(reversed(opts.size)) opts.cols, opts.rows = opts.rows, opts.cols opts.angle -= math.pi/2 return opts # }}}1 if __name__ == '__main__': opts = parse_options() logging.debug('options: %r' % opts) doc = Document(opts.source) layout = Layout(doc, opts) surface = cairo.PDFSurface(opts.output, opts.size[0], opts.size[1]) cr = cairo.Context(surface) layout.render(cr) surface.flush()