Skip to content
  • mirabilos's avatar
    strip PDF page groups from checked-in PDFs · a2bf5133
    mirabilos authored and committed
    see commit 636879066fa0e31f129ca3c1054953a07633041c in verein.git
    for the rationale; the method used was…
    
    $ git find -gitfiles \*.pdf -print0 | xargs -0r mksh ~/pdfstrippagegroup.ksh
    
    … with the following script:
    $ cat ~/pdfstrippagegroup.ksh
    #!/bin/mksh
    
    # Drop LANGUAGE and force LC_ALL=C for every command run below
    # (presumably so that tool behaviour and messages do not depend on
    # the locale of whoever runs the script).
    unset LANGUAGE
    export LC_ALL=C
    
    # Report a fatal error and abort the whole script.
    # Prints "E: <script basename>: <$dst>: <arguments>" to stderr and exits
    # with status 1. Reads the global $dst, i.e. the file currently being
    # processed by the loop below, so the message names the offending PDF.
    die() {
    	print -ru2 -- "E: ${0##*/}: $dst: $*"
    	exit 1
    }
    
    # Process every PDF named on the command line in place:
    # PDF -> QDF -> perl filter -> fix-qdf -> recompressed PDF -> replace.
    for dst in "$@"; do
    	# all temporary files share the prefix "$dst~.tmp" so that one glob
    	# can remove them at the end
    	tpf=$dst\~.tmp
    	tq1=$tpf.1.qdf
    	tq2=$tpf.2.qdf
    	tq3=$tpf.3.qdf
    	tmp=$tpf.pdf
    	# step 1: write a QDF version of the input with uncompressed streams
    	# and content normalisation switched off
    	qpdf --stream-data=uncompress --normalize-content=n --qdf \
    	    "$dst" "$tq1" || die 'qpdf error converting to QDF'
    	# remove PDF page groups (required)
    	# also remove Producer info (size improvement)
    	#
    	# step 2: line-based filter over the QDF text. State variable $s:
    	#   0  outside any dictionary that starts with a bare "<<" line
    	#   1  inside such a dictionary
    	#   2  collecting a multi-line "  /Group <<" value up to its "  >>"
    	#   3  a /Group entry (one-line or multi-line) is being held in $gt
    	#   4  inside the "trailer <<" dictionary
    	# A /Group entry is never printed where it was found; when the
    	# closing ">>" line is reached in state 3 the held text is printed
    	# just before it, unless the dictionary also contained the line
    	# "  /Type /Page" ($ispage), in which case it is dropped.
    	# In state 4, lines starting with "  /Info " are dropped.
    	# The order of the elsif arms matters: the "<<", ">>" and
    	# "  /Type /Page" tests run before the state-2 collector.
    	perl -e '
    		use strict;
    		use warnings;
    
    		my $s = "0";
    		my $ispage = 0;
    		my $gt = "<ERR>";
    		while (my $line = <>) {
    			my $skip = 0;
    			chomp(my $l = $line);
    			if ($l eq "<<") {
    				$ispage = 0;
    				$s = 1 if $s eq 0;
    			} elsif ($l eq ">>") {
    				if ($s eq 3) {
    					# only omit page groups
    					print $gt unless $ispage;
    				}
    				$s = 0;
    			} elsif ($l eq "  /Type /Page") {
    				if (($s == 1) || ($s == 3)) {
    					$ispage = 1;
    				}
    			} elsif (($s == 1) && ($l eq "  /Group <<")) {
    				$gt = $line;
    				$s = 2;
    				$skip = 1;
    			} elsif (($s == 1) && ($l =~ qr`^  /Group `)) {
    				$gt = $line;
    				$s = 3;
    				$skip = 1;
    			} elsif ($s == 2) {
    				$gt .= $line;
    				$s = 3 if $l eq "  >>";
    				$skip = 1;
    			} elsif ($l eq "trailer <<") {
    				$s = 4;
    			} elsif ($s == 4) {
    				# size optimisation hack
    				# remove CreationDate, Producer, etc.
    				$skip = 1 if $l =~ qr`^  /Info `;
    			}
    			print $line unless $skip;
    		}
    	' <"$tq1" >"$tq2" || die 'error during perl QDF filtering'
    	# step 3: post-process the edited QDF with fix-qdf before handing it
    	# back to qpdf
    	fix-qdf <"$tq2" >"$tq3" || die 'error during fix-qdf'
    	# bullseye first then buster which lacks options
    	# (presumably Debian release names, i.e. the qpdf shipped in each)
    	#
    	# step 4: recompress into "$tmp". The first attempt adds
    	# --recompress-flate and --compression-level=9; only if it fails is
    	# the second attempt without those two options made. Each attempt
    	# runs in a subshell with "set -x" so the traced command line lands
    	# in the captured stderr. That stderr goes to "$tq1", overwriting the
    	# step-1 QDF, which is not read again after the perl filter. The
    	# captured output is shown only when both attempts fail.
    	if ! (set -x; qpdf --stream-data=compress --recompress-flate \
    	    --compression-level=9 --normalize-content=n \
    	    --object-streams=disable --deterministic-id \
    	    "$tq3" "$tmp") 2>"$tq1" && ! (set -x; qpdf \
    	    --stream-data=compress --normalize-content=n \
    	    --object-streams=disable --deterministic-id \
    	    "$tq3" "$tmp") 2>>"$tq1"; then
    		cat "$tq1" >&2
    		die 'qpdf error converting from QDF'
    	fi
    	# step 5: replace the original with the result, then remove every
    	# remaining file whose name starts with the temporary prefix
    	mv "$tmp" "$dst" || die 'could not create target file'
    	rm "$tpf"* || die 'error cleaning up tmp QDF files'
    	print -ru2 -- "I: fixed up $dst"
    done
    # reached only if no die() fired for any file
    print -ru2 -- "I: done"
    a2bf5133