nlsplit

split natural language text into chunks at reasonable language boundaries
git clone https://a3nm.net/git/nlsplit/
Log | Files | Refs | README

commit 8490c583570a2120232c986804c9fdb1bbf39f61
parent bc6dbd4791c68e9460556665759b210a944c9401
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Fri,  7 Oct 2011 19:51:52 +0200

continue

Diffstat:
nlsplit.c | 24+++++++++++++++---------
1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/nlsplit.c b/nlsplit.c @@ -33,7 +33,7 @@ int size; #define SEMIFINAL_PUNCT_SCORE 18 /* punctuations where split isn't really safe */ #define OTHER_PUNCT_SCORE 10 -/* whitespace and case configurations can help punctuation splits up to this limit */ +/* whitespace and case can help punctuation splits up to this limit */ #define MAX_PUNCT_SITUATION_SCORE 3 /* one \n with next line starting by non-lowercase is a safer split */ #define NEWLINE_WITH_NON_LOWERCASE_SCORE 3 @@ -70,12 +70,12 @@ float punct_score(char c) { float get(point *points, int hd, int tl, long position) { /* binary search of position in circular buffer (points, hd, tl) */ - /* return the confidence, or 0 if none is found */ + /* return the position of the best candidate */ - if (hd == tl) return 0; + if (hd == tl) return hd; int m = MID(hd, tl, size); if (points[m].position == position) - return points[m].confidence; + return m; if (points[m].position > position) return get(points, hd, m, position); else @@ -105,7 +105,8 @@ int push2(point *points, int hd, int tl, float confidence, long position) { return push2(points, (m+1) % size, tl, confidence, position); } -int push(point *points, int hd, int *tl, float confidence, long position, long offset) { +int push(point *points, int hd, int *tl, float confidence, long position, + long offset) { /* insert (confidence, position) in circular buffer (points, hd, tl) */ /* if position is already in buffer, insert sum of confidence values */ /* perform update of tl, return new value of tl */ @@ -116,7 +117,9 @@ int push(point *points, int hd, int *tl, float confidence, long position, long o // TODO check order /* refuse insert of positions before offset */ if (position <= offset) return *tl; - float old = get(points, hd, *tl, position); + int old_pos = get(points, hd, *tl, position); + float old = points[old_pos].position == position ? 
+ points[old_pos].confidence : 0; return (*tl = push2(points, hd, *tl, confidence + old, position)); } @@ -234,14 +237,16 @@ int split() { NEWLINE_SCORE , offset + pos - l_line, offset); if (n_words == 0) l_first_word++; - if (c >= 'a' && c <= 'z' && (last == ' ' || last == '\t' || last == '\n')) + if (c >= 'a' && c <= 'z' && + (last == ' ' || last == '\t' || last == '\n')) word_has_uppercase = 0; if (c >= 'A' && c <= 'Z') word_has_uppercase = 1; if (!(c >= 'A' && c <= 'Z') && !(c >= 'a' && c <= 'z')) current += NON_ALPHA_SCORE; // TODO more punct - if (c == '!' || c == '.' || c == ',' || c == ';' || c == '?' || c == '(' || c == ')') + if (c == '!' || c == '.' || c == ',' || c == ';' || c == ':' || c == '?' + || c == '(' || c == ')') last_punct = c; } l_line++; @@ -255,7 +260,8 @@ int split() { assert(pos <= size); /* cut if we have to */ if (c == EOF || pos == size || - (hd != tl && min_confidence > 0 && points[hd].confidence >= min_confidence)) { + (hd != tl && min_confidence > 0 + && points[hd].confidence >= min_confidence)) { /* pop old entries */ while (hd != tl && points[hd].position <= offset) hd = (hd + 1) % size;