nlsplit

split natural language text into chunks at reasonable language boundaries
git clone https://a3nm.net/git/nlsplit/
Log | Files | Refs | README

commit 5e26236a13c2721026a2f8b142ce017de8be56a8
parent cef2279a0a5a7b4a5226dcb1ebd2b9b951f3650a
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sun,  9 Oct 2011 21:52:33 +0200

comment

Diffstat:
nlsplit.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/nlsplit.c b/nlsplit.c @@ -180,11 +180,15 @@ int split() { /* read file, break after splitting at EOF */ /* do not break when reading EOF, because we must output last chunk */ while (1) { + /* read char */ + last = c; c_int = getchar(); + c = c_int; - assert(pos <= size); /* cut if we have to */ + + assert(pos <= size); if (c_int == EOF || pos == size || (hd != tl && min_confidence > 0 && points[hd].confidence >= min_confidence)) { @@ -220,9 +224,15 @@ int split() { npiece++; } + /* break if we must */ + if (c_int == EOF) break; - piece[(offset + (pos++)) % size] = (c = c_int); + /* add char */ + + piece[(offset + (pos++)) % size] = c; + + /* produce split points */ current = 0; @@ -304,6 +314,7 @@ int split() { } /* push point if we have one */ + if (current > 0) { push(points, hd, &tl, current, offset + pos, offset); }