nlsplit.c (10375B)
1 /* nlsplit by a3nm (2011) */ 2 3 #include <stdio.h> 4 #include <stdlib.h> 5 #include <assert.h> 6 7 const char help[] = 8 "Split natural language text from stdin in pieces less than SIZE bytes.\n" 9 "If MINCONFIDENCE is set, always cut above this confidence threshold.\n"; 10 11 /* above this confidence value, we always split */ 12 float min_confidence = -1; 13 14 /* maximal size of pieces */ 15 int size; 16 17 /* for tab-space equivalence in indentation counts */ 18 #define TAB_LEN 2 19 20 /* score of the last split */ 21 #define EOF_SCORE 999999 22 /* score per \n, for multiple \n's */ 23 #define MULTIPLE_LINE_SCORE 1000 24 /* if file contains lines longer than this, then one \n is a safer split */ 25 #define LONG_LINES_THRESHOLD 100 26 /* if first word of line fits on previous line with this margin, safer split */ 27 #define DELTA_THRESHOLD 10 28 /* if indentation changes, safer split */ 29 #define INDENT_CHANGE_SCORE 18 30 /* punctuations where split is safe */ 31 #define FINAL_PUNCT_SCORE 20 32 /* punctuations where split is less safe */ 33 #define SEMIFINAL_PUNCT_SCORE 18 34 /* punctuations where split isn't really safe */ 35 #define OTHER_PUNCT_SCORE 10 36 /* whitespace and case can help punctuation splits up to this limit */ 37 #define MAX_PUNCT_SITUATION_SCORE 3 38 /* one \n with next line starting by non-lowercase is a safer split */ 39 #define NEWLINE_WITH_NON_LOWERCASE_SCORE 3 40 /* \n is a last-resort split */ 41 #define NEWLINE_SCORE 2 42 /* whitespace is a last-resort split */ 43 #define WHITESPACE_SCORE 1 44 /* non-alpha is a last-resort split */ 45 #define NON_ALPHA_SCORE 0.5 46 47 #define E_SYNTAX 1 48 #define E_MEMORY 2 49 50 #define MAX(a, b) (((a) > (b)) ? (a) : (b)) 51 #define MIN(a, b) (((a) < (b)) ? (a) : (b)) 52 #define MID(l, r, s) ((((l) + ((r) + ((l) > (r) ? (s) : 0)))/2) % (s)) 53 54 /* a possible split point */ 55 typedef struct point { 56 float confidence; 57 long position; 58 } point; 59 60 void usage(char** argv) { 61 /* show usage and exit */ 62 fprintf(stderr, "Usage: %s SIZE [MINCONFIDENCE]\n", argv[0]); 63 fprintf(stderr, help); 64 exit(E_SYNTAX); 65 } 66 67 float punct_score(char c) { 68 /* get score of a punctuation */ 69 if (c == '?' || c == '!' || c == '.') return FINAL_PUNCT_SCORE; 70 if (c == ',' || c == ';' || c == ':') return SEMIFINAL_PUNCT_SCORE; 71 return OTHER_PUNCT_SCORE; 72 } 73 74 float get(point *points, int hd, int tl, long position) { 75 /* binary search of position in circular buffer (points, hd, tl) */ 76 /* return the position of the best candidate */ 77 78 if (hd == tl) return hd; 79 int m = MID(hd, tl, size); 80 if (points[m].position == position) 81 return m; 82 if (points[m].position > position) 83 return get(points, hd, m, position); 84 else 85 return get(points, (m+1) % size, tl, position); 86 } 87 88 int push2(point *points, int hd, int tl, float confidence, long position) { 89 /* insert (confidence, position) in circular buffer (points, hd, tl) */ 90 /* buffer is kept sorted, existing elements below confidence are erased) */ 91 /* insertion position is found by binary search on confidence */ 92 /* return new value for tl */ 93 94 assert(hd < size); 95 assert(tl < size); 96 if (hd == tl) { 97 /* insert at tl */ 98 points[tl].position = position; 99 points[tl].confidence = confidence; 100 /* increment otl */ 101 return (tl + 1) % size; 102 } 103 int m = MID(hd, tl, size); 104 /* in case of tie, drop elements */ 105 if (points[m].confidence <= confidence) 106 return push2(points, hd, m, confidence, position); 107 else 108 return push2(points, (m+1) % size, tl, confidence, position); 109 } 110 111 int push(point *points, int hd, int *tl, float confidence, long position, 112 long offset) { 113 /* insert (confidence, position) in circular buffer (points, hd, tl) */ 114 /* if position is already in buffer, insert sum of confidence values */ 115 /* perform update of tl, return new value of tl */ 116 /* buffer is sorted by decreasing confidence, and increasing position */ 117 /* except in rare cases where we insert at an old position */ 118 /* which could make us underestimate slightly confidence for some splits */ 119 120 assert(hd < size); 121 assert(*tl < size); 122 123 /* refuse insert of positions before offset */ 124 if (position <= offset) return *tl; 125 int old_pos = get(points, hd, *tl, position); 126 float old = points[old_pos].position == position ? 127 points[old_pos].confidence : 0; 128 return (*tl = push2(points, hd, *tl, confidence + old, position)); 129 } 130 131 int split() { 132 /* perform the split */ 133 134 /* counter of pieces */ 135 int npiece = 0; 136 137 /* circular buffer for the current piece */ 138 char *piece; 139 /* offset is the absolute offset relative to BOF */ 140 /* pos is the offset within the piece */ 141 int offset = 0, pos = 0; 142 143 /* circular sorted buffer for the current split points */ 144 /* this is kept sorted by decreasing confidence and increasing position */ 145 point *points; 146 /* head and tail of the buffer */ 147 int hd = 0, tl = 0; 148 149 /* current and last character */ 150 char c = 0, last = 0; 151 /* for eof detection */ 152 int c_int; 153 154 /* confidence for split at current position */ 155 float current = 0; 156 157 /* number of successive newlines, indentation of current and last line */ 158 int n_newlines = 0, indent = 0, last_indent = 0; 159 160 /* length of current and last line, maximal line length */ 161 int l_line = 0, l_last_line = 0, max_l_line = 0; 162 163 /* length of first word, number of words on line, uppercase in current word */ 164 int l_first_word = 0, n_words = 0, word_has_uppercase = 0; 165 166 /* last punctuation seen, number of whitespace characters after punctuation */ 167 int last_punct = 0, whitespace_after_punct = 0; 168 169 int i; 170 171 /* allocate memory */ 172 piece = malloc(size * sizeof(char)); 173 points = malloc(size * sizeof(point)); 174 if (!piece || !points) { 175 perror("malloc"); 176 return E_MEMORY; 177 } 178 179 /* read file, break after splitting at EOF */ 180 /* do not break when reading EOF, because we must output last chunk */ 181 while (1) { 182 /* read char */ 183 last = c; 184 c_int = getchar(); 185 c = c_int; 186 187 /* cut if we have to */ 188 assert(pos <= size); 189 if (c_int == EOF || pos == size || 190 (hd != tl && min_confidence > 0 && 191 points[hd].confidence >= min_confidence)) { 192 /* pop old entries */ 193 while (hd != tl && points[hd].position <= offset) 194 hd = (hd + 1) % size; 195 if (hd == tl) { 196 /* we have no points, we must create one */ 197 push(points, hd, &tl, 0, offset + pos, offset); 198 assert(points[hd].position > offset); 199 } 200 assert(points[hd].position > offset); 201 if (c_int == EOF) { 202 /* special EOF cut */ 203 points[hd].confidence = EOF_SCORE; 204 points[hd].position = offset + pos; 205 } 206 printf("-- chunk %d length %ld confidence %f\n", 207 npiece, points[hd].position - offset, points[hd].confidence); 208 /* output the data */ 209 for (i = offset; i < points[hd].position ; i++) 210 putchar(piece[i % size]); 211 putchar('\n'); 212 /* update offset and pos */ 213 pos = (offset + pos) - points[hd].position; 214 assert(pos < size); 215 offset = points[hd].position; 216 /* pop the point */ 217 hd = (hd + 1) % size; 218 /* increment piece counter */ 219 npiece++; 220 } 221 222 /* break if we must */ 223 if (c_int == EOF) break; 224 225 /* add char */ 226 piece[(offset + (pos++)) % size] = c; 227 228 /* produce split points */ 229 current = 0; 230 if (c == '\n' || c == ' ' || c == '\t') 231 if (last_punct) whitespace_after_punct++; 232 if (c == '\n') { 233 n_words = 0; 234 l_first_word = 0; 235 last_indent = indent; 236 indent = 0; 237 l_last_line = l_line; 238 l_line = 0; 239 max_l_line = MAX(max_l_line, l_line); 240 n_newlines++; 241 if (n_newlines > 1) { 242 current += MULTIPLE_LINE_SCORE * n_newlines; 243 } else { 244 if (max_l_line > LONG_LINES_THRESHOLD) 245 current += (max_l_line - LONG_LINES_THRESHOLD); 246 current += NEWLINE_SCORE; 247 } 248 } else { 249 n_newlines = 0; 250 l_line++; 251 if (c == ' ' || c == '\t') { 252 current += WHITESPACE_SCORE; 253 if (l_first_word == 0) { 254 /* we are still in indentation */ 255 indent += c == '\t' ? 2 : 1; 256 } else { 257 if (!n_words) { 258 /* we have just read the first word */ 259 int delta = max_l_line - l_last_line - l_first_word - 1; 260 if (delta > DELTA_THRESHOLD) { 261 /* first word of current line would fit on previous line */ 262 push(points, hd, &tl, delta, offset + pos - l_line - 1, offset); 263 } 264 } 265 n_words += 1; 266 } 267 } else { 268 if (last_punct) { 269 /* punctuation followed by much whitespace and an uppercase letter */ 270 /* and having a preceding word without uppercase is safer */ 271 int situation_score = MIN(MAX_PUNCT_SITUATION_SCORE, 272 whitespace_after_punct + 273 ((c >= 'A' && c <= 'Z') ? 1 : 0) + (word_has_uppercase ? 0 : 1)); 274 push(points, hd, &tl, situation_score + punct_score(last_punct), 275 offset + pos - whitespace_after_punct, offset); 276 whitespace_after_punct = 0; 277 last_punct = 0; 278 } 279 280 if (indent != last_indent) 281 push(points, hd, &tl, INDENT_CHANGE_SCORE + abs(indent - 282 last_indent), offset + pos - l_line, offset); 283 if (!n_words && !l_first_word) 284 /* first char of the line */ 285 if (!(c >= 'a' && c <= 'z')) 286 push(points, hd, &tl, NEWLINE_WITH_NON_LOWERCASE_SCORE - 287 NEWLINE_SCORE, offset + pos - l_line, offset); 288 if (n_words == 0) 289 l_first_word++; 290 if (c >= 'a' && c <= 'z' && 291 (last == ' ' || last == '\t' || last == '\n')) 292 word_has_uppercase = 0; 293 if (c >= 'A' && c <= 'Z') 294 word_has_uppercase = 1; 295 if (!(c >= 'A' && c <= 'Z') && !(c >= 'a' && c <= 'z')) 296 current += NON_ALPHA_SCORE; 297 // TODO more punct 298 if (c == '!' || c == '.' || c == ',' || c == ';' || c == ':' || c == '?' 299 || c == '(' || c == ')') 300 last_punct = c; 301 } 302 } 303 304 /* push point if we have one */ 305 if (current > 0) { 306 push(points, hd, &tl, current, offset + pos, offset); 307 } 308 } 309 310 free(piece); 311 free(points); 312 313 return 0; 314 } 315 316 int main(int argc, char **argv) { 317 318 if (argc < 2 || argc > 3) usage(argv); 319 size = atoi(argv[1]); 320 if (size <= 0) usage(argv); 321 if (argc == 3) min_confidence = atoi(argv[2]); 322 if (!min_confidence) usage(argv); 323 324 return split(size); 325 } 326