splith.py (6139B)
#!/usr/bin/python3 -O
"""Split a grayscale PNG image into horizontal strips.

The image is cut along (nearly) white rows.  The pipeline is:

1. find_cut_lines      -- mark the whitest rows as "cut lines"
2. merge_small_content -- absorb content regions that are too short
3. group_contiguous    -- group the cut lines into gaps
4. select_final_cuts   -- keep the gaps that are used as separators
5. main                -- write one PNG per strip (or a debug image)
"""

import collections
import sys
import numpy
import argparse
import os.path
from math import ceil, floor


def parse_args(argv=None):
    """Parse the command line (``sys.argv[1:]`` when *argv* is None)."""
    parser = argparse.ArgumentParser(
        description="Split a grayscale PNG image into horizontal strips")
    parser.add_argument("filename",
                        help="input PNG file name", type=str)
    parser.add_argument("output_folder",
                        help="folder to write output files", type=str)
    parser.add_argument("--maxheight",
                        help="maximum height of a split",
                        type=int, default=10000)
    parser.add_argument("--minheight",
                        help="minimum height of a split",
                        type=int, default=50)
    # The original help text was a copy of the --minheight one.
    parser.add_argument("--mincontentheight",
                        help="minimum height of a content region; smaller "
                             "regions are merged into the surrounding cuts",
                        type=int, default=5)
    parser.add_argument("--distthreshold",
                        help="maximum height difference across splits",
                        type=int, default=300)
    parser.add_argument("--whitethreshold",
                        help="threshold to detect white space",
                        type=float, default=1)
    parser.add_argument("--debug",
                        help="write debug image",
                        action='store_true')
    return parser.parse_args(argv)


def to_grayscale(img):
    """Return *img* as a 2-D array, converting it if pixels have channels."""
    # https://stackoverflow.com/a/38549260
    if hasattr(type(img[0][0]), '__iter__'):
        print("converting input image to grayscale")
        # https://stackoverflow.com/a/51571053
        img = numpy.dot(img[..., :3], [0.299, 0.587, 0.114])
    return img


def find_cut_lines(img, maxheight, whitethreshold):
    """Step 1: mark the rows that may be used as cut lines.

    Rows are scored by ``255 * width - sum(row)`` (0 for a pure white
    row) and taken from the lowest score up.  A row becomes a cut line
    while its score is within ``whitethreshold * width`` of the best
    score, or while some row is still farther than about
    ``maxheight / 2`` from every cut line chosen so far.

    Returns a list of ``len(img)`` booleans; the first and the last row
    are always cut lines.
    """
    in_h = len(img)
    in_w = len(img[0])
    reach_up = floor(maxheight / 2)
    reach_down = ceil(maxheight / 2)

    # sort potential cut lines by score
    candidates = sorted(
        (255 * in_w - numpy.sum(img[row]), row) for row in range(in_h))
    mn_score = candidates[0][0]

    # rows already within reach of a cut line (to respect maxheight)
    covered = [False] * in_h
    ncovered = 0
    cut = [False] * in_h

    for score, pos in candidates:
        if score > mn_score + whitethreshold * in_w and ncovered == in_h:
            break  # over threshold, and enough cuts to respect maxheight
        cut[pos] = True
        for j in range(max(0, pos - reach_up), min(in_h, pos + reach_down)):
            if not covered[j]:
                ncovered += 1
                covered[j] = True

    # the first and last must be cuts
    cut[-1] = True
    cut[0] = True
    return cut


def merge_small_content(cut, mincontentheight):
    """Step 2: turn content regions that are too short into cut lines.

    Whenever two consecutive cut lines of *cut* are less than
    *mincontentheight* rows apart, every row between them is marked as
    a cut line as well.  Returns a new list; *cut* is not modified.
    """
    cut2 = list(cut)
    last = 0
    for r, is_cut in enumerate(cut):
        if not is_cut:
            continue
        if r - last < mincontentheight:
            # forget about the previous non-cut region, it is too small
            cut2[last:r + 1] = [True] * (r + 1 - last)
        last = r
    return cut2


def group_contiguous(cut2):
    """Step 3: group contiguous cut lines.

    Returns a list of ``(height, pos)`` tuples.  ``pos`` is the last
    non-cut row before the group (0 for the group at the top of the
    image) and ``pos + height`` is the first non-cut row after it
    (``len(cut2) - 1`` for the group at the bottom).
    """
    in_h = len(cut2)
    contiguous = []
    last = 0
    for r in range(in_h):
        if not cut2[r]:
            if last < r - 1:
                contiguous.append((r - last, last))
            last = r
    if last < in_h - 1:
        contiguous.append((in_h - 1 - last, last))
    return contiguous


def select_final_cuts(contiguous, in_h, maxheight, distthreshold):
    """Step 4: choose the groups of cut lines used as strip separators.

    Groups are visited from the tallest down.  A group is kept while it
    is at most *distthreshold* rows shorter than the tallest one, or
    while some row has not been marked as covered yet.

    Returns the kept groups as ``(pos, height)`` tuples sorted by
    ``pos``; an empty list when *contiguous* is empty.
    """
    if not contiguous:
        # e.g. a one-row image: nothing to split
        return []

    by_height = sorted(contiguous, reverse=True)
    best = by_height[0][0]
    reach_up = floor(maxheight / 2)
    reach_down = ceil(maxheight / 2)

    covered = [False] * in_h
    ncovered = 0
    final_cuts = []

    for height, pos in by_height:
        if height < best - distthreshold and ncovered == in_h:
            break  # we are under threshold and have enough cuts
        final_cuts.append((pos, height))
        # mark the covered regions, stopping at rows already covered
        for j in range(pos, max(0, pos - reach_up), -1):
            if covered[j]:
                break
            ncovered += 1
            covered[j] = True
        # NOTE(review): when pos >= 1 the loop above has just marked row
        # `pos`, so this loop stops on its first iteration and rows below
        # the group are never marked.  Full coverage is therefore rarely
        # reached and, in practice, every group is kept.  Behaviour is
        # deliberately left unchanged: marking those rows would let small
        # top/bottom margins be dropped, and with them the first or last
        # strip.  Confirm the intended semantics before changing it.
        for j in range(pos, min(in_h, pos + height + reach_down)):
            if covered[j]:
                break
            ncovered += 1
            covered[j] = True

    return sorted(final_cuts)


def strip_bounds(final_cuts):
    """Yield ``(start, end)`` row bounds of the strip between two cuts.

    ``start`` is the first row after a group of cut lines and ``end``
    (exclusive) is the ``pos`` of the next group.

    NOTE(review): ``pos`` is the last non-cut row before the next group
    (see group_contiguous), so that row is not part of the strip.
    Confirm whether this is intended.
    """
    for (pos, height), (next_pos, _) in zip(final_cuts, final_cuts[1:]):
        yield pos + height, next_pos


def trim_columns(img, start, end, whitethreshold):
    """Find the column range of rows ``start:end`` that is not white.

    A column is white when ``255 * height - sum(column)`` is below
    ``height * whitethreshold``.  Returns ``(l_end, r_end)`` such that
    ``img[start:end, l_end:r_end]`` has no white column on either side;
    ``l_end == r_end`` when every column is white.
    """
    height = end - start
    # Sum in float64: adding numpy.uint8 scalars with the builtin sum()
    # wraps around at 256 under NumPy >= 2.
    column_sum = numpy.asarray(img)[start:end].sum(axis=0,
                                                  dtype=numpy.float64)
    white = 255 * height - column_sum < height * whitethreshold

    l_end = 0
    r_end = white.shape[0]
    # "<" (not "<="): stop once the bounds meet, so that an all-white
    # strip gives l_end == r_end instead of indexing past the last column.
    while l_end < r_end and white[l_end]:
        l_end += 1
    while l_end < r_end and white[r_end - 1]:
        r_end -= 1
    return l_end, r_end


def render_debug_image(img, cut, cut2, final_cuts, minheight):
    """Build an RGB picture of the decisions taken on *img*.

    Rows in *cut* get green=255/blue=0, rows only in *cut2* get
    green=0/blue=255.  The first row of each strip is drawn red when
    the strip is at least *minheight* rows high, green otherwise.
    """
    in_h = len(img)
    in_w = len(img[0])
    matrix = numpy.full((in_h, in_w, 3), 255, dtype=numpy.uint8)
    matrix[:, :, :] = numpy.asarray(img)[:, :, numpy.newaxis]

    cut_rows = numpy.array(cut, dtype=bool)
    merged_rows = numpy.array(cut2, dtype=bool) & ~cut_rows
    matrix[cut_rows, :, 1:3] = (255, 0)
    matrix[merged_rows, :, 1:3] = (0, 255)

    for start, end in strip_bounds(final_cuts):
        if end - start >= minheight:
            matrix[start, :, :] = (255, 0, 0)
        else:
            matrix[start, :, :] = (0, 255, 0)
    return matrix


def main(argv=None):
    """Run the splitter on the command line *argv*."""
    # Imported here so that the helpers above can be imported without
    # the imaging backend being installed.
    import imageio

    args = parse_args(argv)
    img = to_grayscale(imageio.imread(args.filename))
    in_h = len(img)

    cut = find_cut_lines(img, args.maxheight, args.whitethreshold)
    cut2 = merge_small_content(cut, args.mincontentheight)
    contiguous = group_contiguous(cut2)
    final_cuts = select_final_cuts(contiguous, in_h,
                                   args.maxheight, args.distthreshold)

    if args.debug:
        matrix = render_debug_image(img, cut, cut2, final_cuts,
                                    args.minheight)
        outfname = os.path.join(args.output_folder, "debug.png")
        imageio.imwrite(outfname, matrix)
        sys.exit(0)

    # Step 5: produce the output files
    # just discards splits smaller than minheight
    # also trims white space from left and right
    stem = os.path.basename(args.filename).split('.')[0]
    num = 0
    for start, end in strip_bounds(final_cuts):
        if end - start < args.minheight:
            continue

        l_end, r_end = trim_columns(img, start, end, args.whitethreshold)
        if l_end == r_end:
            continue  # nothing but white space in this strip

        matrix = numpy.full((end - start, r_end - l_end), 255,
                            dtype=numpy.uint8)
        matrix[:, :] = img[start:end, l_end:r_end]

        outfname = os.path.join(args.output_folder,
                                "{}_{:04d}.png".format(stem, num))
        imageio.imwrite(outfname, matrix)
        print("wrote %s" % outfname)
        num += 1


if __name__ == "__main__":
    main()