continue - nlsplit - split natural language text in chunks at reasonable language boundaries

commit 8490c583570a2120232c986804c9fdb1bbf39f61
parent bc6dbd4791c68e9460556665759b210a944c9401
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Fri,  7 Oct 2011 19:51:52 +0200

continue

Diffstat:
nlsplit.c  | 24 +++++++++++++++---------

1 file changed, 15 insertions(+), 9 deletions(-)
diff --git a/nlsplit.c b/nlsplit.c
@@ -33,7 +33,7 @@ int size;
 #define SEMIFINAL_PUNCT_SCORE 18
 /* punctuations where split isn't really safe */
 #define OTHER_PUNCT_SCORE 10
-/* whitespace and case configurations can help punctuation splits up to this limit */
+/* whitespace and case can help punctuation splits up to this limit */
 #define MAX_PUNCT_SITUATION_SCORE 3
 /* one \n with next line starting by non-lowercase is a safer split */
 #define NEWLINE_WITH_NON_LOWERCASE_SCORE 3
@@ -70,12 +70,12 @@ float punct_score(char c) {
 
 float get(point *points, int hd, int tl, long position) {
   /* binary search of position in circular buffer (points, hd, tl) */
-  /* return the confidence, or 0 if none is found */
+  /* return the position of the best candidate */
 
-  if (hd == tl) return 0;
+  if (hd == tl) return hd;
   int m = MID(hd, tl, size);
   if (points[m].position == position)
-    return points[m].confidence;
+    return m;
   if (points[m].position > position)
     return get(points, hd, m, position);
   else
@@ -105,7 +105,8 @@ int push2(point *points, int hd, int tl, float confidence, long position) {
     return push2(points, (m+1) % size, tl, confidence, position);
 }
 
-int push(point *points, int hd, int *tl, float confidence, long position, long offset) {
+int push(point *points, int hd, int *tl, float confidence, long position,
+    long offset) {
   /* insert (confidence, position) in circular buffer (points, hd, tl) */
   /* if position is already in buffer, insert sum of confidence values */
   /* perform update of tl, return new value of tl */
@@ -116,7 +117,9 @@ int push(point *points, int hd, int *tl, float confidence, long position, long o
   // TODO check order
   /* refuse insert of positions before offset */
   if (position <= offset) return *tl;
-  float old = get(points, hd, *tl, position);
+  int old_pos = get(points, hd, *tl, position);
+  float old = points[old_pos].position == position ?
+      points[old_pos].confidence : 0;
   return (*tl = push2(points, hd, *tl, confidence + old, position));
 }
 
@@ -234,14 +237,16 @@ int split() {
                   NEWLINE_SCORE , offset + pos - l_line, offset);
         if (n_words == 0)
           l_first_word++;
-        if (c >= 'a' && c <= 'z' && (last == ' ' || last == '\t' || last == '\n'))
+        if (c >= 'a' && c <= 'z' &&
+            (last == ' ' || last == '\t' || last == '\n'))
           word_has_uppercase = 0;
         if (c >= 'A' && c <= 'Z')
           word_has_uppercase = 1;
         if (!(c >= 'A' && c <= 'Z') && !(c >= 'a' && c <= 'z'))
           current += NON_ALPHA_SCORE;
         // TODO more punct
-        if (c == '!' || c == '.' || c == ',' || c == ';' || c == '?' || c == '(' || c == ')')
+        if (c == '!' || c == '.' || c == ',' || c == ';' || c == ':' || c == '?'
+            || c == '(' || c == ')')
           last_punct = c;
       }
       l_line++;
@@ -255,7 +260,8 @@ int split() {
     assert(pos <= size);
     /* cut if we have to */
     if (c == EOF || pos == size ||
-        (hd != tl && min_confidence > 0 && points[hd].confidence >= min_confidence)) {
+        (hd != tl && min_confidence > 0
+         && points[hd].confidence >= min_confidence)) {
       /* pop old entries */
       while (hd != tl && points[hd].position <= offset)
         hd = (hd + 1) % size;

	nlsplit - split natural language text in chunks at reasonable language boundaries
	git clone https://a3nm.net/git/nlsplit/
	Log | Files | Refs | README