commit 8490c583570a2120232c986804c9fdb1bbf39f61
parent bc6dbd4791c68e9460556665759b210a944c9401
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Fri, 7 Oct 2011 19:51:52 +0200
continue
Diffstat:
1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/nlsplit.c b/nlsplit.c
@@ -33,7 +33,7 @@ int size;
#define SEMIFINAL_PUNCT_SCORE 18
/* punctuations where split isn't really safe */
#define OTHER_PUNCT_SCORE 10
-/* whitespace and case configurations can help punctuation splits up to this limit */
+/* whitespace and case can help punctuation splits up to this limit */
#define MAX_PUNCT_SITUATION_SCORE 3
/* one \n with next line starting by non-lowercase is a safer split */
#define NEWLINE_WITH_NON_LOWERCASE_SCORE 3
@@ -70,12 +70,12 @@ float punct_score(char c) {
float get(point *points, int hd, int tl, long position) {
/* binary search of position in circular buffer (points, hd, tl) */
- /* return the confidence, or 0 if none is found */
+ /* return the buffer index of the entry at position, or the index where the search ended if there is none */
- if (hd == tl) return 0;
+ if (hd == tl) return hd;
int m = MID(hd, tl, size);
if (points[m].position == position)
- return points[m].confidence;
+ return m;
if (points[m].position > position)
return get(points, hd, m, position);
else
@@ -105,7 +105,8 @@ int push2(point *points, int hd, int tl, float confidence, long position) {
return push2(points, (m+1) % size, tl, confidence, position);
}
-int push(point *points, int hd, int *tl, float confidence, long position, long offset) {
+int push(point *points, int hd, int *tl, float confidence, long position,
+ long offset) {
/* insert (confidence, position) in circular buffer (points, hd, tl) */
/* if position is already in buffer, insert sum of confidence values */
/* perform update of tl, return new value of tl */
@@ -116,7 +117,9 @@ int push(point *points, int hd, int *tl, float confidence, long position, long o
// TODO check order
/* refuse insert of positions before offset */
if (position <= offset) return *tl;
- float old = get(points, hd, *tl, position);
+ int old_pos = get(points, hd, *tl, position);
+ float old = points[old_pos].position == position ?
+ points[old_pos].confidence : 0;
return (*tl = push2(points, hd, *tl, confidence + old, position));
}
@@ -234,14 +237,16 @@ int split() {
NEWLINE_SCORE , offset + pos - l_line, offset);
if (n_words == 0)
l_first_word++;
- if (c >= 'a' && c <= 'z' && (last == ' ' || last == '\t' || last == '\n'))
+ if (c >= 'a' && c <= 'z' &&
+ (last == ' ' || last == '\t' || last == '\n'))
word_has_uppercase = 0;
if (c >= 'A' && c <= 'Z')
word_has_uppercase = 1;
if (!(c >= 'A' && c <= 'Z') && !(c >= 'a' && c <= 'z'))
current += NON_ALPHA_SCORE;
// TODO more punct
- if (c == '!' || c == '.' || c == ',' || c == ';' || c == '?' || c == '(' || c == ')')
+ if (c == '!' || c == '.' || c == ',' || c == ';' || c == ':' || c == '?'
+ || c == '(' || c == ')')
last_punct = c;
}
l_line++;
@@ -255,7 +260,8 @@ int split() {
assert(pos <= size);
/* cut if we have to */
if (c == EOF || pos == size ||
- (hd != tl && min_confidence > 0 && points[hd].confidence >= min_confidence)) {
+ (hd != tl && min_confidence > 0
+ && points[hd].confidence >= min_confidence)) {
/* pop old entries */
while (hd != tl && points[hd].position <= offset)
hd = (hd + 1) % size;