refactored, seems to work - nlsplit - split natural language text in chunks at reasonable language boundaries

commit 42f3053e824f20b198a2bd4e0485f347b4f60949
parent 337d2eec0e332936ebd4e8317dd8495f8fb175d4
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed,  7 Sep 2011 22:37:29 +0200

refactored, seems to work

Diffstat:
nlsplit.c  | 47 ++++++++++++++++++++++++++++++++++-------------

1 file changed, 34 insertions(+), 13 deletions(-)
diff --git a/nlsplit.c b/nlsplit.c
@@ -6,17 +6,22 @@
  * maximal size, with simple sensible heuristics. */
 
 /* Above this confidence value, we always split */
-/* TODO make it a CLI arg */
-#define MIN_CONFIDENCE 99999
+float min_confidence = -1;
 
 #define TAB_LEN 2
-#define EOF_CONFIDENCE 999
+
+#define EOF_SCORE 999999.
+#define MULTIPLE_LINE_SCORE 1000.
+#define LONG_LINES_THRESHOLD 100
+#define DELTA_THRESHOLD 10
+#define INDENT_CHANGE_SCORE 18
 #define MAX_PUNCT_SITUATION_SCORE 3
-#define LINE_VALUE 100
-#define WHITESPACE_SCORE 1
 #define FINAL_PUNCT_SCORE 20
 #define SEMIFINAL_PUNCT_SCORE 18
 #define OTHER_PUNCT_SCORE 15
+#define NEWLINE_WITH_NON_LOWERCASE_SCORE 3
+#define NEWLINE_SCORE 2
+#define WHITESPACE_SCORE 1
 
 #define MAX(a, b) (((a) > (b)) ? (a) : (b))
 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
@@ -125,8 +130,13 @@ int split(int size) {
         n_newlines = 1;
       // TODO more value to single lines when max_line_len is >> 80
       //current += LINE_VALUE * n_newlines;
-      if (n_newlines > 1)
-        current += LINE_VALUE * n_newlines;
+      if (n_newlines > 1) {
+        current += MULTIPLE_LINE_SCORE * n_newlines;
+      } else {
+        if (max_l_line > LONG_LINES_THRESHOLD)
+          current += (max_l_line - LONG_LINES_THRESHOLD);
+        current += NEWLINE_SCORE;
+      }
       if (last_punct) whitespace_after_punct++;
     } else {
       if (c == ' ' || c == '\t') {
@@ -138,7 +148,7 @@ int split(int size) {
             int delta = max_l_line - l_last_line - l_first_word - 1;
             //printf("maxlline %d llastline %d lfirstword %d lline %d offset %d pos %d delta %d\n",
             //    max_l_line, l_last_line, l_first_word, l_line, offset, pos, delta);
-            if (delta > 0) {
+            if (delta > DELTA_THRESHOLD) {
               // first word of current line would fit on previous line
               if (offset + pos - l_line - 1 > offset)
                 push(points, hd, &tl, size, delta, offset + pos - l_line - 1);
@@ -163,8 +173,13 @@ int split(int size) {
         // TODO if first word fits
         if (indent != last_indent)
           if (offset + pos - l_line > offset)
-            push(points, hd, &tl, size, abs(indent - last_indent), offset + pos - l_line);
+            push(points, hd, &tl, size, INDENT_CHANGE_SCORE + abs(indent - last_indent), offset + pos - l_line);
         n_newlines = 0;
+        if (!n_words && !l_first_word)
+          // first char of the line
+          if (!(c >= 'a' && c <= 'z'))
+            if (offset + pos - l_line > offset)
+              push(points, hd, &tl, size, NEWLINE_WITH_NON_LOWERCASE_SCORE - NEWLINE_SCORE , offset + pos - l_line);
         if (n_words == 0)
           l_first_word++;
         if (c >= 'a' && c <= 'z' && (last == ' ' || last == '\t' || last == '\n'))
@@ -184,7 +199,8 @@ int split(int size) {
 
     assert(pos <= size);
     /* cut if we have to */
-    if (c == EOF || pos == size || (hd != tl && points[hd].confidence > MIN_CONFIDENCE)) {
+    if (c == EOF || pos == size ||
+        (hd != tl && min_confidence > 0 && points[hd].confidence > min_confidence)) {
       // pop old entries
       while (hd != tl && points[hd].position <= offset)
         hd = (hd + 1) % size;
@@ -196,9 +212,9 @@ int split(int size) {
       assert(points[hd].position > offset);
       if (c == EOF) {
         // special EOF cut
-        push(points, hd, &tl, size, EOF_CONFIDENCE, offset + pos - 1);
+        points[hd].confidence = EOF_SCORE;
+        points[hd].position = offset + pos - 1;
       }
-      // TODO wtf '%' ?
       //printf("== %d %d\n", offset, pos);
       printf("-- %d %ld %f\n",
           nchunk, points[hd].position - offset, points[hd].confidence);
@@ -270,8 +286,13 @@ int tests() {
 int main(int argc, char **argv) {
   int size;
 
-  if (argc != 2) usage(argv);
+  if (argc < 2 || argc > 3) usage(argv);
   size = atoi(argv[1]);
+  if (argc == 3) {
+    min_confidence = atoi(argv[2]);
+    if (!min_confidence) usage(argv);
+  }
+
   if (size <= 0) usage(argv);
 
   //return tests();

	nlsplit - split natural language text in chunks at reasonable language boundaries
	git clone https://a3nm.net/git/nlsplit/
	Log | Files | Refs | README