nlsplit

split natural language text in chunks at reasonable language boundaries
git clone https://a3nm.net/git/nlsplit/
Log | Files | Refs | README

commit 337d2eec0e332936ebd4e8317dd8495f8fb175d4
parent 6e691b2400272e5c72d403124118a6c32f923f12
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed,  7 Sep 2011 20:32:12 +0200

first version

Diffstat:
nlsplit.c | 202++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 169 insertions(+), 33 deletions(-)

diff --git a/nlsplit.c b/nlsplit.c @@ -1,15 +1,25 @@ #include <stdio.h> +#include <stdlib.h> +#include <assert.h> /* Split latin alphabet natural language text in fragments below a * maximal size, with simple sensible heuristics. */ /* Above this confidence value, we always split */ /* TODO make it a CLI arg */ -#define MIN_CONFIDENCE 2 +#define MIN_CONFIDENCE 99999 #define TAB_LEN 2 +#define EOF_CONFIDENCE 999 +#define MAX_PUNCT_SITUATION_SCORE 3 +#define LINE_VALUE 100 +#define WHITESPACE_SCORE 1 +#define FINAL_PUNCT_SCORE 20 +#define SEMIFINAL_PUNCT_SCORE 18 +#define OTHER_PUNCT_SCORE 15 #define MAX(a, b) (((a) > (b)) ? (a) : (b)) +#define MIN(a, b) (((a) < (b)) ? (a) : (b)) typedef struct point { float confidence; @@ -21,56 +31,72 @@ void usage(char** argv) { exit(1); } - // TODO: keep last insertion to insert new entry with combined score int push2(point *points, int hd, int tl, int *otl, int size, float confidence, long position) { + //printf(". %d %d\n", hd, tl); + assert(hd < size); + assert(tl < size); if (hd == tl) { points[tl].position = position; - if (tl == *otl && points[tl].confidence > confidence) { + if (tl == *otl) { points[tl].confidence = confidence; - tl = tl + 1 % size; + //tl = tl + 1 % size; } else { points[tl].confidence = confidence; } - *otl = tl; + *otl = (tl + 1) % size; + assert(*otl < size); return tl; } else { - if (hd > tl) tl += size; - int m = (hd + tl + (hd > tl ? size : 0))/2 % size; + int m = ((hd + (tl + (hd > tl ? size : 0)))/2) % size; if (points[m].confidence <= confidence) return push2(points, hd, m, otl, size, confidence, position); else - return push2(points, m, tl, otl, size, confidence, position); + return push2(points, (m+1) % size, tl, otl, size, confidence, position); } } -int push(point *points, int hd, int *tl, int size, float confidence, long position) { - return push2(points, hd, *tl, tl, size, confidence + get(points, hd, (tl, size, position)), position); -} - float get(point *points, int hd, int tl, int size, long position) { if (hd == tl) return 0; - int m = (hd + tl + (hd > tl ? size : 0))/2 % size; + int m = ((hd + tl + (hd > tl ? size : 0))/2) % size; if (points[m].position == position) return points[m].confidence; if (points[m].position > position) - return get(points, hd, m, otl, size, confidence, position); + return get(points, hd, m, size, position); else - return get(points, m, tl, otl, size, confidence, position); + return get(points, (m+1) % size, tl, size, position); +} + + +int push(point *points, int hd, int *tl, int size, float confidence, long position) { + assert(hd < size); + assert(*tl < size); + //printf("push %f %ld (%d %d)!\n", confidence, position, hd, *tl); + float old = get(points, hd, *tl, size, position); + return push2(points, hd, *tl, tl, size, confidence + old, position); +} + +float punct_score(char c) { + /* final punctuation */ + if (c == '?' || c == '!' || c == '.') return FINAL_PUNCT_SCORE; + if (c == ',' || c == ';' || c == ':') return SEMIFINAL_PUNCT_SCORE; + return OTHER_PUNCT_SCORE; } + int split(int size) { char *chunk; point *points; - int hd, tl; // position of head and tail in the circular list points + int hd = 0, tl = 0; // position of head and tail in the circular list points char c, last = 0; - state s; int pos = 0, offset = 0; - int current = 0; + float current = 0; int nchunk = 0; - int indent = 0; last_indent = 0, n_newlines = 0, l_line = 0, - max_line = 0, l_first_word = 0; + int indent = 0, last_indent = 0, n_newlines = 0, l_line = 0, l_last_line = 0, + max_l_line = 0, l_first_word = 0; int n_words = 0; // will be 0 or 1 + int word_has_uppercase; + int last_punct = 0, whitespace_after_punct = 0; int i; chunk = malloc(size * sizeof(char)); @@ -80,7 +106,9 @@ int split(int size) { return 1; } - while (chunk[offset + (pos++) % size] = (c = getchar())) { + while (1) { + c = getchar(); + chunk[(offset + (pos++)) % size] = c; current = 0; if (c == '\n') { @@ -88,26 +116,65 @@ int split(int size) { l_first_word = 0; last_indent = indent; indent = 0; - max_line = MAX(max_line, line); + max_l_line = MAX(max_l_line, l_line); + l_last_line = l_line; l_line = 0; if (last == '\n') n_newlines++; else n_newlines = 1; - current += n_newlines; + // TODO more value to single lines when max_line_len is >> 80 + //current += LINE_VALUE * n_newlines; + if (n_newlines > 1) + current += LINE_VALUE * n_newlines; + if (last_punct) whitespace_after_punct++; } else { - l_line++; if (c == ' ' || c == '\t') { + current += WHITESPACE_SCORE; if (l_first_word == 0) { indent += c == '\t' ? 2 : 1; } else { + if (!n_words) { + int delta = max_l_line - l_last_line - l_first_word - 1; + //printf("maxlline %d llastline %d lfirstword %d lline %d offset %d pos %d delta %d\n", + // max_l_line, l_last_line, l_first_word, l_line, offset, pos, delta); + if (delta > 0) { + // first word of current line would fit on previous line + if (offset + pos - l_line - 1 > offset) + push(points, hd, &tl, size, delta, offset + pos - l_line - 1); + } + } n_words = 1; } + if (last_punct) whitespace_after_punct++; } else { + if (last_punct) { + // the best situation for a punctuation is to be followed by + // much whitespace and then an uppercase letter, and have a + // preceding word without uppercase ("Mr. Brown") + int situation_score = MIN(whitespace_after_punct + ((c >= 'A' && c <= 'Z') ? 1 : 0) + (word_has_uppercase ? 0 : 1), MAX_PUNCT_SITUATION_SCORE); + if (offset + pos - whitespace_after_punct > offset) { + push(points, hd, &tl, size, situation_score + punct_score(last_punct), offset + pos - whitespace_after_punct); + } + whitespace_after_punct = 0; + last_punct = 0; + } + + // TODO if first word fits + if (indent != last_indent) + if (offset + pos - l_line > offset) + push(points, hd, &tl, size, abs(indent - last_indent), offset + pos - l_line); n_newlines = 0; if (n_words == 0) l_first_word++; + if (c >= 'a' && c <= 'z' && (last == ' ' || last == '\t' || last == '\n')) + word_has_uppercase = 0; + if (c >= 'A' && c <= 'Z') + word_has_uppercase = 1; + if (c == '!' || c == '.' || c == ',' || c == ';' || c == '?' || c == '(' || c == ')') + last_punct = c; } + l_line++; } /* push point if we have one */ @@ -115,30 +182,99 @@ int split(int size) { push(points, hd, &tl, size, current, offset + pos); } + assert(pos <= size); /* cut if we have to */ - if (pos == size || points[hd].confidence > MIN_CONFIDENCE) { - assert(hd != tl); - printf("-- %d %d %f\n", - nchunk, size + best - pos % size, points[hd].confidence); - for (i = offset; i < offset + best ; i++) - putchar(chunk[offset + i]); + if (c == EOF || pos == size || (hd != tl && points[hd].confidence > MIN_CONFIDENCE)) { + // pop old entries + while (hd != tl && points[hd].position <= offset) + hd = (hd + 1) % size; + if (hd == tl) { + // we have no possibility to cut, we must add one + push(points, hd, &tl, size, 0, offset + pos); + assert(points[hd].position > offset); + } + assert(points[hd].position > offset); + if (c == EOF) { + // special EOF cut + push(points, hd, &tl, size, EOF_CONFIDENCE, offset + pos - 1); + } + // TODO wtf '%' ? + //printf("== %d %d\n", offset, pos); + printf("-- %d %ld %f\n", + nchunk, points[hd].position - offset, points[hd].confidence); + for (i = offset; i < points[hd].position ; i++) + putchar(chunk[i % size]); + putchar('\n'); + pos = (offset + pos) - points[hd].position; + assert(pos < size); offset = points[hd].position; + //printf("== %d %d\n", offset, pos); nchunk++; - hd = hd + 1 % size; + hd = (hd + 1) % size; } last = c; + if (c == EOF) break; } + //printf("==done %d %d\n", offset, pos); + return 0; +} +void print_points(point *points, int hd, int tl, int size) { + int i; + printf("size %d hd %d tl %d\n", size, hd, tl); + for (i=0; i<(tl-hd+size)%size; i++) + printf("> item %d pos %d; %f %ld\n", i, (hd+i)%size, points[(hd+i)%size].confidence, points[(hd+i)%size].position); +} - - +int tests() { + float f; + const int size = 3; + int pos; + point p[size]; + int hd = 0, tl = 0; + f = get(p, hd, tl, size, 2); + print_points(p, hd, tl, size); + push2(p, hd, tl, &tl, size, 1, 1); + print_points(p, hd, tl, size); + f = get(p, hd, tl, size, 1); + printf("%f\n", f); + push(p, hd, &tl, size, 1, 1); + print_points(p, hd, tl, size); + push(p, hd, &tl, size, 1, 1); + print_points(p, hd, tl, size); + f = get(p, hd, tl, size, 2); + printf("%f\n", f); + push(p, hd, &tl, size, 2, 4); + print_points(p, hd, tl, size); + push(p, hd, &tl, size, 3, 2); + print_points(p, hd, tl, size); + push(p, hd, &tl, size, 4, 1); + print_points(p, hd, tl, size); + printf("pop!\n"); + hd = (hd + 1) % size; + push(p, hd, &tl, size, 4, 2); + print_points(p, hd, tl, size); + pos = push(p, hd, &tl, size, 1, 4242); + printf("inserted at %d\n", pos); + print_points(p, hd, tl, size); + printf("pop!\n"); + hd = (hd + 1) % size; + push(p, hd, &tl, size, 4, 4); + print_points(p, hd, tl, size); + printf("pop!\n"); + hd = (hd + 1) % size; + return 0; +} int main(int argc, char **argv) { + int size; + if (argc != 2) usage(argv); size = atoi(argv[1]); if (size <= 0) usage(argv); + //return tests(); return split(size); }