nlsplit

split natural language text in chunks at reasonable language boundaries
git clone https://a3nm.net/git/nlsplit/
Log | Files | Refs | README

commit a17b7c76a1711b656324f9839b757d9c5db030d8
parent 42f3053e824f20b198a2bd4e0485f347b4f60949
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed,  7 Sep 2011 23:47:58 +0200

cleanup, comment

Diffstat:
nlsplit.c | 303++++++++++++++++++++++++++++++++++++++++---------------------------------------
1 file changed, 153 insertions(+), 150 deletions(-)

diff --git a/nlsplit.c b/nlsplit.c @@ -1,135 +1,188 @@ +/* nlsplit by a3nm (2011) */ + #include <stdio.h> #include <stdlib.h> #include <assert.h> -/* Split latin alphabet natural language text in fragments below a - * maximal size, with simple sensible heuristics. */ +const char help[] = + "Split natural language text from stdin in pieces less than SIZE bytes.\n" + "If MINCONFIDENCE is set, always cut above this confidence threshold.\n"; -/* Above this confidence value, we always split */ +/* above this confidence value, we always split */ float min_confidence = -1; +/* maximal size of pieces */ +int size; + +/* for tab-space equivalence in indentation counts */ #define TAB_LEN 2 -#define EOF_SCORE 999999. -#define MULTIPLE_LINE_SCORE 1000. +/* score of the last split */ +#define EOF_SCORE 999999 +/* score per \n, for multiple \n's */ +#define MULTIPLE_LINE_SCORE 1000 +/* if file contains lines longer than this, then one \n is a safer split */ #define LONG_LINES_THRESHOLD 100 +/* if first word of line fits on previous line with this margin, safer split */ #define DELTA_THRESHOLD 10 +/* if indentation changes, safer split */ #define INDENT_CHANGE_SCORE 18 -#define MAX_PUNCT_SITUATION_SCORE 3 +/* punctuations where split is safe */ #define FINAL_PUNCT_SCORE 20 +/* punctuations where split is less safe */ #define SEMIFINAL_PUNCT_SCORE 18 -#define OTHER_PUNCT_SCORE 15 +/* punctuations where split isn't really safe */ +#define OTHER_PUNCT_SCORE 10 +/* whitespace and case configurations can help punctuation splits up to this limit */ +#define MAX_PUNCT_SITUATION_SCORE 3 +/* one \n with next line starting by non-lowercase is a safer split */ #define NEWLINE_WITH_NON_LOWERCASE_SCORE 3 +/* \n is a last-resort split */ #define NEWLINE_SCORE 2 +/* whitespace is a last-resort split */ #define WHITESPACE_SCORE 1 #define MAX(a, b) (((a) > (b)) ? (a) : (b)) #define MIN(a, b) (((a) < (b)) ? (a) : (b)) +#define MID(l, r, s) ((((l) + ((r) + ((l) > (r) ? (s) : 0)))/2) % (s)) +/* a possible split point */ typedef struct point { float confidence; long position; } point; void usage(char** argv) { - fprintf(stderr, "Usage: %s BYTES\n", argv[0]); + /* show usage and exit */ + fprintf(stderr, "Usage: %s SIZE [MINCONFIDENCE]\n", argv[0]); + fprintf(stderr, help); exit(1); } -// TODO: keep last insertion to insert new entry with combined score -int push2(point *points, int hd, int tl, int *otl, int size, float confidence, long position) { - //printf(". %d %d\n", hd, tl); - assert(hd < size); - assert(tl < size); - if (hd == tl) { - points[tl].position = position; - if (tl == *otl) { - points[tl].confidence = confidence; - //tl = tl + 1 % size; - } else { - points[tl].confidence = confidence; - } - *otl = (tl + 1) % size; - assert(*otl < size); - return tl; - } else { - int m = ((hd + (tl + (hd > tl ? size : 0)))/2) % size; - if (points[m].confidence <= confidence) - return push2(points, hd, m, otl, size, confidence, position); - else - return push2(points, (m+1) % size, tl, otl, size, confidence, position); - } +float punct_score(char c) { + /* get score of a punctuation */ + if (c == '?' || c == '!' || c == '.') return FINAL_PUNCT_SCORE; + if (c == ',' || c == ';' || c == ':') return SEMIFINAL_PUNCT_SCORE; + return OTHER_PUNCT_SCORE; } -float get(point *points, int hd, int tl, int size, long position) { +float get(point *points, int hd, int tl, long position) { + /* binary search of position in circular buffer (points, hd, tl) */ + /* return the confidence, or 0 if none is found */ + if (hd == tl) return 0; - int m = ((hd + tl + (hd > tl ? size : 0))/2) % size; + int m = MID(hd, tl, size); if (points[m].position == position) return points[m].confidence; if (points[m].position > position) - return get(points, hd, m, size, position); + return get(points, hd, m, position); else - return get(points, (m+1) % size, tl, size, position); + return get(points, (m+1) % size, tl, position); } +int push2(point *points, int hd, int tl, float confidence, long position) { + /* insert (confidence, position) in circular buffer (points, hd, tl) */ + /* buffer is kept sorted, existing elements below confidence are erased) */ + /* insertion position is found by binary search on confidence */ + /* return new value for tl */ -int push(point *points, int hd, int *tl, int size, float confidence, long position) { assert(hd < size); - assert(*tl < size); - //printf("push %f %ld (%d %d)!\n", confidence, position, hd, *tl); - float old = get(points, hd, *tl, size, position); - return push2(points, hd, *tl, tl, size, confidence + old, position); + assert(tl < size); + if (hd == tl) { + /* insert at tl */ + points[tl].position = position; + points[tl].confidence = confidence; + /* increment otl */ + return (tl + 1) % size; + } + int m = MID(hd, tl, size); + /* in case of tie, drop elements */ + if (points[m].confidence <= confidence) + return push2(points, hd, m, confidence, position); + else + return push2(points, (m+1) % size, tl, confidence, position); } -float punct_score(char c) { - /* final punctuation */ - if (c == '?' || c == '!' || c == '.') return FINAL_PUNCT_SCORE; - if (c == ',' || c == ';' || c == ':') return SEMIFINAL_PUNCT_SCORE; - return OTHER_PUNCT_SCORE; +int push(point *points, int hd, int *tl, float confidence, long position, long offset) { + /* insert (confidence, position) in circular buffer (points, hd, tl) */ + /* if position is already in buffer, insert sum of confidence values */ + /* perform update of tl, return new value of tl */ + + //printf("push %f %ld (%d %d)!\n", confidence, position, hd, *tl); + assert(hd < size); + assert(*tl < size); + // TODO check order + /* refuse insert of positions before offset */ + if (position <= offset) return *tl; + float old = get(points, hd, *tl, position); + return (*tl = push2(points, hd, *tl, confidence + old, position)); } -int split(int size) { - char *chunk; +int split() { + /* perform the split */ + + /* counter of pieces */ + int npiece = 0; + + /* circular buffer for the current piece */ + char *piece; + /* offset is the absolute offset relative to BOF */ + /* pos is the offset within the piece */ + int offset = 0, pos = 0; + + /* circular sorted buffer for the current split points */ + /* this is kept sorted by decreasing confidence and increasing position */ point *points; - int hd = 0, tl = 0; // position of head and tail in the circular list points + /* head and tail of the buffer */ + int hd = 0, tl = 0; + + /* current and last character */ char c, last = 0; - int pos = 0, offset = 0; + + /* confidence for split at current position */ float current = 0; - int nchunk = 0; - int indent = 0, last_indent = 0, n_newlines = 0, l_line = 0, l_last_line = 0, - max_l_line = 0, l_first_word = 0; - int n_words = 0; // will be 0 or 1 - int word_has_uppercase; + + /* number of successive newlines, indentation of current and last line */ + int n_newlines = 0, indent = 0, last_indent = 0; + + /* length of current and last line, maximal line length */ + int l_line = 0, l_last_line = 0, max_l_line = 0; + + /* length of first word, number of words on line, uppercase in current word */ + int l_first_word = 0, n_words = 0, word_has_uppercase = 0; + + /* last punctuation seen, number of whitespace characters after punctuation */ int last_punct = 0, whitespace_after_punct = 0; + int i; - chunk = malloc(size * sizeof(char)); + /* allocate memory */ + piece = malloc(size * sizeof(char)); points = malloc(size * sizeof(point)); - if (!chunk || !points) { + if (!piece || !points) { perror("malloc"); return 1; } + /* read file, break after splitting at EOF */ while (1) { - c = getchar(); - chunk[(offset + (pos++)) % size] = c; + piece[(offset + (pos++)) % size] = (c = getchar()); + current = 0; + if (c == '\n' || c == ' ' || c == '\t') + if (last_punct) whitespace_after_punct++; + if (c == '\n') { n_words = 0; l_first_word = 0; last_indent = indent; indent = 0; - max_l_line = MAX(max_l_line, l_line); l_last_line = l_line; l_line = 0; - if (last == '\n') - n_newlines++; - else - n_newlines = 1; - // TODO more value to single lines when max_line_len is >> 80 - //current += LINE_VALUE * n_newlines; + max_l_line = MAX(max_l_line, l_line); + n_newlines++; if (n_newlines > 1) { current += MULTIPLE_LINE_SCORE * n_newlines; } else { @@ -137,55 +190,54 @@ int split(int size) { current += (max_l_line - LONG_LINES_THRESHOLD); current += NEWLINE_SCORE; } - if (last_punct) whitespace_after_punct++; } else { + n_newlines = 0; if (c == ' ' || c == '\t') { current += WHITESPACE_SCORE; if (l_first_word == 0) { + /* we are still in indentation */ indent += c == '\t' ? 2 : 1; } else { if (!n_words) { + /* we have just read the first word */ int delta = max_l_line - l_last_line - l_first_word - 1; //printf("maxlline %d llastline %d lfirstword %d lline %d offset %d pos %d delta %d\n", // max_l_line, l_last_line, l_first_word, l_line, offset, pos, delta); if (delta > DELTA_THRESHOLD) { - // first word of current line would fit on previous line - if (offset + pos - l_line - 1 > offset) - push(points, hd, &tl, size, delta, offset + pos - l_line - 1); + /* first word of current line would fit on previous line */ + push(points, hd, &tl, delta, offset + pos - l_line - 1, offset); } } - n_words = 1; + n_words += 1; } - if (last_punct) whitespace_after_punct++; } else { if (last_punct) { - // the best situation for a punctuation is to be followed by - // much whitespace and then an uppercase letter, and have a - // preceding word without uppercase ("Mr. Brown") - int situation_score = MIN(whitespace_after_punct + ((c >= 'A' && c <= 'Z') ? 1 : 0) + (word_has_uppercase ? 0 : 1), MAX_PUNCT_SITUATION_SCORE); - if (offset + pos - whitespace_after_punct > offset) { - push(points, hd, &tl, size, situation_score + punct_score(last_punct), offset + pos - whitespace_after_punct); - } + /* punctuation followed by much whitespace and an uppercase letter */ + /* and having a preceding word without uppercase is safer */ + int situation_score = MIN(MAX_PUNCT_SITUATION_SCORE, + whitespace_after_punct + + ((c >= 'A' && c <= 'Z') ? 1 : 0) + (word_has_uppercase ? 0 : 1)); + push(points, hd, &tl, situation_score + punct_score(last_punct), + offset + pos - whitespace_after_punct, offset); whitespace_after_punct = 0; last_punct = 0; } - // TODO if first word fits if (indent != last_indent) - if (offset + pos - l_line > offset) - push(points, hd, &tl, size, INDENT_CHANGE_SCORE + abs(indent - last_indent), offset + pos - l_line); - n_newlines = 0; + push(points, hd, &tl, INDENT_CHANGE_SCORE + abs(indent - + last_indent), offset + pos - l_line, offset); if (!n_words && !l_first_word) - // first char of the line + /* first char of the line */ if (!(c >= 'a' && c <= 'z')) - if (offset + pos - l_line > offset) - push(points, hd, &tl, size, NEWLINE_WITH_NON_LOWERCASE_SCORE - NEWLINE_SCORE , offset + pos - l_line); + push(points, hd, &tl, NEWLINE_WITH_NON_LOWERCASE_SCORE - + NEWLINE_SCORE , offset + pos - l_line, offset); if (n_words == 0) l_first_word++; if (c >= 'a' && c <= 'z' && (last == ' ' || last == '\t' || last == '\n')) word_has_uppercase = 0; if (c >= 'A' && c <= 'Z') word_has_uppercase = 1; + // TODO more punct if (c == '!' || c == '.' || c == ',' || c == ';' || c == '?' || c == '(' || c == ')') last_punct = c; } @@ -194,108 +246,59 @@ int split(int size) { /* push point if we have one */ if (current > 0) { - push(points, hd, &tl, size, current, offset + pos); + push(points, hd, &tl, current, offset + pos, offset); } assert(pos <= size); /* cut if we have to */ if (c == EOF || pos == size || - (hd != tl && min_confidence > 0 && points[hd].confidence > min_confidence)) { - // pop old entries + (hd != tl && min_confidence > 0 && points[hd].confidence >= min_confidence)) { + /* pop old entries */ while (hd != tl && points[hd].position <= offset) hd = (hd + 1) % size; if (hd == tl) { - // we have no possibility to cut, we must add one - push(points, hd, &tl, size, 0, offset + pos); + /* we have no points, we must create one */ + push(points, hd, &tl, 0, offset + pos, offset); assert(points[hd].position > offset); } assert(points[hd].position > offset); if (c == EOF) { - // special EOF cut + /* special EOF cut */ points[hd].confidence = EOF_SCORE; points[hd].position = offset + pos - 1; } //printf("== %d %d\n", offset, pos); - printf("-- %d %ld %f\n", - nchunk, points[hd].position - offset, points[hd].confidence); + printf("-- piece %d length %ld confidence %f\n", + npiece, points[hd].position - offset, points[hd].confidence); + /* output the data */ for (i = offset; i < points[hd].position ; i++) - putchar(chunk[i % size]); + putchar(piece[i % size]); putchar('\n'); + /* update offset and pos */ pos = (offset + pos) - points[hd].position; assert(pos < size); offset = points[hd].position; //printf("== %d %d\n", offset, pos); - nchunk++; + /* pop the point */ hd = (hd + 1) % size; + /* increment piece counter */ + npiece++; } last = c; if (c == EOF) break; } - //printf("==done %d %d\n", offset, pos); - - return 0; -} - -void print_points(point *points, int hd, int tl, int size) { - int i; - printf("size %d hd %d tl %d\n", size, hd, tl); - for (i=0; i<(tl-hd+size)%size; i++) - printf("> item %d pos %d; %f %ld\n", i, (hd+i)%size, points[(hd+i)%size].confidence, points[(hd+i)%size].position); -} -int tests() { - float f; - const int size = 3; - int pos; - point p[size]; - int hd = 0, tl = 0; - f = get(p, hd, tl, size, 2); - print_points(p, hd, tl, size); - push2(p, hd, tl, &tl, size, 1, 1); - print_points(p, hd, tl, size); - f = get(p, hd, tl, size, 1); - printf("%f\n", f); - push(p, hd, &tl, size, 1, 1); - print_points(p, hd, tl, size); - push(p, hd, &tl, size, 1, 1); - print_points(p, hd, tl, size); - f = get(p, hd, tl, size, 2); - printf("%f\n", f); - push(p, hd, &tl, size, 2, 4); - print_points(p, hd, tl, size); - push(p, hd, &tl, size, 3, 2); - print_points(p, hd, tl, size); - push(p, hd, &tl, size, 4, 1); - print_points(p, hd, tl, size); - printf("pop!\n"); - hd = (hd + 1) % size; - push(p, hd, &tl, size, 4, 2); - print_points(p, hd, tl, size); - pos = push(p, hd, &tl, size, 1, 4242); - printf("inserted at %d\n", pos); - print_points(p, hd, tl, size); - printf("pop!\n"); - hd = (hd + 1) % size; - push(p, hd, &tl, size, 4, 4); - print_points(p, hd, tl, size); - printf("pop!\n"); - hd = (hd + 1) % size; return 0; } int main(int argc, char **argv) { - int size; if (argc < 2 || argc > 3) usage(argv); size = atoi(argv[1]); - if (argc == 3) { - min_confidence = atoi(argv[2]); - if (!min_confidence) usage(argv); - } - if (size <= 0) usage(argv); + if (argc == 3) min_confidence = atoi(argv[2]); + if (!min_confidence) usage(argv); - //return tests(); return split(size); }