nlsplit

Split natural language text into chunks at reasonable language boundaries
git clone https://a3nm.net/git/nlsplit/
Log | Files | Refs | README

commit 9446c742fa5d1c71c844922673fd1f61c879a040
parent 0f21ebaa834f97af49ad835df3093af9d843731a
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Sun,  9 Oct 2011 21:41:17 +0200

fix EOF detection

Diffstat:
nlsplit.c | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/nlsplit.c b/nlsplit.c @@ -149,6 +149,8 @@ int split() { /* current and last character */ char c, last = 0; + /* for eof detection */ + int c_int; /* confidence for split at current position */ float current = 0; @@ -176,8 +178,9 @@ int split() { } /* read file, break after splitting at EOF */ + /* do not break when reading EOF, because we must output last chunk */ while (1) { - piece[(offset + (pos++)) % size] = (c = getchar()); + piece[(offset + (pos++)) % size] = (c = (c_int = getchar())); current = 0; @@ -265,9 +268,9 @@ int split() { assert(pos <= size); /* cut if we have to */ - if (c == EOF || pos == size || - (hd != tl && min_confidence > 0 - && points[hd].confidence >= min_confidence)) { + if ((c_int == EOF && pos != 1) || (c_int != EOF && pos == size) || + (hd != tl && min_confidence > 0 && + points[hd].confidence >= min_confidence)) { /* pop old entries */ while (hd != tl && points[hd].position <= offset) hd = (hd + 1) % size; @@ -277,7 +280,7 @@ int split() { assert(points[hd].position > offset); } assert(points[hd].position > offset); - if (c == EOF) { + if (c_int == EOF) { /* special EOF cut */ points[hd].confidence = EOF_SCORE; points[hd].position = offset + pos - 1; @@ -300,7 +303,8 @@ int split() { npiece++; } last = c; - if (c == EOF) break; + + if (c_int == EOF) break; } return 0;