nlsplit

split natural language text into chunks at reasonable language boundaries
git clone https://a3nm.net/git/nlsplit/
Log | Files | Refs | README

commit cef2279a0a5a7b4a5226dcb1ebd2b9b951f3650a
parent 9446c742fa5d1c71c844922673fd1f61c879a040
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sun,  9 Oct 2011 21:50:12 +0200

fix order of operations

Diffstat:
nlsplit.c | 84++++++++++++++++++++++++++++++++++++++++---------------------------------------
1 file changed, 43 insertions(+), 41 deletions(-)

diff --git a/nlsplit.c b/nlsplit.c @@ -180,7 +180,49 @@ int split() { /* read file, break after splitting at EOF */ /* do not break when reading EOF, because we must output last chunk */ while (1) { - piece[(offset + (pos++)) % size] = (c = (c_int = getchar())); + last = c; + c_int = getchar(); + + assert(pos <= size); + /* cut if we have to */ + if (c_int == EOF || pos == size || + (hd != tl && min_confidence > 0 && + points[hd].confidence >= min_confidence)) { + /* pop old entries */ + while (hd != tl && points[hd].position <= offset) + hd = (hd + 1) % size; + if (hd == tl) { + /* we have no points, we must create one */ + push(points, hd, &tl, 0, offset + pos, offset); + assert(points[hd].position > offset); + } + assert(points[hd].position > offset); + if (c_int == EOF) { + /* special EOF cut */ + points[hd].confidence = EOF_SCORE; + points[hd].position = offset + pos; + } + //printf("== %d %d\n", offset, pos); + printf("-- piece %d length %ld confidence %f\n", + npiece, points[hd].position - offset, points[hd].confidence); + /* output the data */ + for (i = offset; i < points[hd].position ; i++) + putchar(piece[i % size]); + putchar('\n'); + /* update offset and pos */ + pos = (offset + pos) - points[hd].position; + assert(pos < size); + offset = points[hd].position; + //printf("== %d %d\n", offset, pos); + /* pop the point */ + hd = (hd + 1) % size; + /* increment piece counter */ + npiece++; + } + + if (c_int == EOF) break; + + piece[(offset + (pos++)) % size] = (c = c_int); current = 0; @@ -265,46 +307,6 @@ int split() { if (current > 0) { push(points, hd, &tl, current, offset + pos, offset); } - - assert(pos <= size); - /* cut if we have to */ - if ((c_int == EOF && pos != 1) || (c_int != EOF && pos == size) || - (hd != tl && min_confidence > 0 && - points[hd].confidence >= min_confidence)) { - /* pop old entries */ - while (hd != tl && points[hd].position <= offset) - hd = (hd + 1) % size; - if (hd == tl) { - /* we have no points, we must create one */ - 
push(points, hd, &tl, 0, offset + pos, offset); - assert(points[hd].position > offset); - } - assert(points[hd].position > offset); - if (c_int == EOF) { - /* special EOF cut */ - points[hd].confidence = EOF_SCORE; - points[hd].position = offset + pos - 1; - } - //printf("== %d %d\n", offset, pos); - printf("-- piece %d length %ld confidence %f\n", - npiece, points[hd].position - offset, points[hd].confidence); - /* output the data */ - for (i = offset; i < points[hd].position ; i++) - putchar(piece[i % size]); - putchar('\n'); - /* update offset and pos */ - pos = (offset + pos) - points[hd].position; - assert(pos < size); - offset = points[hd].position; - //printf("== %d %d\n", offset, pos); - /* pop the point */ - hd = (hd + 1) % size; - /* increment piece counter */ - npiece++; - } - last = c; - - if (c_int == EOF) break; } return 0;