initial commit - nlsplit - split natural language text in chunks at reasonable language boundaries

commit 20444efabe6dfe60dd0e961c9d84c271e3ff6446
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed,  7 Sep 2011 00:17:50 +0200

initial commit

Diffstat:
nlsplit.c  | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

1 file changed, 144 insertions(+), 0 deletions(-)
diff --git a/nlsplit.c b/nlsplit.c
@@ -0,0 +1,144 @@
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Split Latin-alphabet natural-language text into fragments no larger
+ * than a maximum size, using simple heuristics to pick the cut points. */
+
+/* Above this confidence value, we always split: split() cuts as soon as
+ * the confidence of the candidate at the head of the list exceeds it. */
+/* TODO make it a CLI arg */
+#define MIN_CONFIDENCE 2
+
+/* NOTE(review): presumably the indentation width credited to a tab;
+ * split() currently hard-codes 2 instead of using this macro -- confirm
+ * and unify. */
+#define TAB_LEN 2
+
+/* Larger of a and b. Each argument appears twice in the expansion, so do
+ * not pass expressions with side effects. */
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+/* A candidate split point, stored in the circular list used by split(). */
+typedef struct point {
+  float confidence; /* score of the candidate; compared to MIN_CONFIDENCE */
+  long position;    /* where the cut would fall; split() passes offset + pos */
+} point;
+
+/* Print a one-line usage message on stderr, using argv[0] as the program
+ * name, then terminate the process with exit status 1. Never returns. */
+void usage(char **argv) {
+  const char *program = argv[0];
+
+  fprintf(stderr, "Usage: %s BYTES\n", program);
+  exit(1);
+}
+
+
+// TODO: keep last insertion to insert new entry with combined score
+/* Recursive helper for push(): binary-search the circular range hd..tl of
+ * `points` for the first slot whose confidence is <= `confidence`, store
+ * the new candidate there, and publish the resulting tail through `otl`.
+ *
+ *   points      circular array holding `size` candidates
+ *   hd, tl      bounds of the range still being searched (indices in points)
+ *   otl         in: the list's current tail index; out: the new tail index
+ *   size        capacity of `points`, used as the wrap-around modulus
+ *   confidence  score of the new candidate
+ *   position    position of the new candidate
+ *
+ * Returns the new tail index (the same value that is stored in *otl).
+ */
+int push2(point *points, int hd, int tl, int *otl, int size, float confidence, long position) {
+  if (hd == tl) {
+    points[tl].position = position;
+    /* NOTE(review): when tl is the old tail, this test reads a slot that is
+     * not part of the list, and the else branch leaves the tail ON the slot
+     * just written rather than after it -- intended semantics to confirm. */
+    if (tl == *otl && points[tl].confidence > confidence) {
+      points[tl].confidence = confidence;
+      /* Parenthesised: `tl + 1 % size` parses as `tl + (1 % size)` and
+       * never wraps around. */
+      tl = (tl + 1) % size;
+    } else {
+      points[tl].confidence = confidence;
+    }
+    *otl = tl;
+    return tl;
+  }
+
+  /* Midpoint of the circular range; hd > tl means the range wraps past the
+   * end of the array. tl is left untouched so that it remains a valid index
+   * when handed to the recursive call. */
+  int m = (hd + tl + (hd > tl ? size : 0)) / 2 % size;
+  if (points[m].confidence <= confidence)
+    return push2(points, hd, m, otl, size, confidence, position);
+  /* Slot m is ruled out, so skip it: recursing on (m, tl) would repeat the
+   * same call forever once m == hd. */
+  return push2(points, (m + 1) % size, tl, otl, size, confidence, position);
+}
+
+/* Defined further down in this file; declared here so push() can call it
+ * with the correct (float) return type. */
+float get(point *points, int hd, int tl, int size, long position);
+
+/* Insert a candidate split point into the circular list `points`.
+ *
+ *   points      circular array holding `size` candidates
+ *   hd          index of the head of the list
+ *   tl          in/out: index of the tail; updated to the new tail
+ *   size        capacity of `points`
+ *   confidence  score to add for this candidate
+ *   position    position of the candidate
+ *
+ * Any confidence already recorded for `position` (as reported by get(),
+ * 0 if none) is added to `confidence` before insertion.
+ * Returns the new tail index, as returned by push2().
+ */
+int push(point *points, int hd, int *tl, int size, float confidence, long position) {
+  float combined = confidence + get(points, hd, *tl, size, position);
+
+  return push2(points, hd, *tl, tl, size, combined, position);
+}
+
+/* Look up the confidence recorded for `position` in the circular range
+ * hd..tl (tl excluded) of `points`, by binary search.
+ *
+ *   points    circular array holding `size` candidates
+ *   hd, tl    head and tail indices of the range; hd == tl means empty
+ *   size      capacity of `points`, used as the wrap-around modulus
+ *   position  position to look for
+ *
+ * Relies on the entries of the range being ordered by increasing position.
+ * Returns the stored confidence, or 0 when no entry has that position.
+ */
+float get(point *points, int hd, int tl, int size, long position) {
+  while (hd != tl) {
+    /* Midpoint of the circular range; hd > tl means it wraps past the end
+     * of the array. */
+    int m = (hd + tl + (hd > tl ? size : 0)) / 2 % size;
+
+    if (points[m].position == position)
+      return points[m].confidence;
+    if (points[m].position > position)
+      tl = m;
+    else
+      hd = (m + 1) % size; /* skip m, otherwise the range never shrinks */
+  }
+  return 0;
+}
+
+/* Read standard input one character at a time and print it back in
+ * chunks, cutting at candidate split points that are scored from runs of
+ * newlines. `size` is the number of elements allocated for both the chunk
+ * buffer and the circular list of candidate points.
+ * Returns 1 when an allocation fails.
+ *
+ * NOTE(review): this function does not compile as written; the individual
+ * problems are marked with NOTE(review) comments below and need the
+ * author's intent before they can be fixed.
+ */
+int split(int size) {
+  char *chunk;
+  point *points;
+  /* NOTE(review): hd and tl are never initialized before they are used. */
+  int hd, tl; // position of head and tail in the circular list points
+  /* NOTE(review): getchar() returns an int; stored in a char, EOF cannot be
+   * told apart from data, and the loop below only stops on a zero byte. */
+  char c, last = 0;
+  /* NOTE(review): no type `state` is declared in the visible file, and s is
+   * never used. */
+  state s; 
+  int pos = 0, offset = 0;
+  /* score of the character just read; only newlines add to it */
+  int current = 0;
+  int nchunk = 0;
+  /* NOTE(review): the `;` after `indent = 0` ends the declaration, leaving
+   * last_indent, n_newlines, l_line, max_line and l_first_word undeclared. */
+  int indent = 0; last_indent = 0, n_newlines = 0, l_line = 0,
+    max_line = 0, l_first_word = 0;
+  int n_words = 0; // will be 0 or 1
+  int i;
+
+  chunk = malloc(size * sizeof(char));
+  points = malloc(size * sizeof(point));
+  if (!chunk || !points) {
+    perror("malloc");
+    return 1;
+  }
+
+  /* NOTE(review): `%` binds tighter than `+`, so the index is
+   * offset + (pos % size) and can run past the end of chunk once offset is
+   * non-zero. */
+  while (chunk[offset + (pos++) % size] = (c = getchar())) {
+    current = 0;
+
+    if (c == '\n') {
+      /* end of line: reset the per-line counters */
+      n_words = 0;
+      l_first_word = 0;
+      last_indent = indent;
+      indent = 0;
+      /* NOTE(review): `line` is not declared; presumably l_line -- confirm. */
+      max_line = MAX(max_line, line);
+      l_line = 0;
+      /* n_newlines counts the current run of consecutive newlines */
+      if (last == '\n')
+        n_newlines++;
+      else
+        n_newlines = 1;
+      current += n_newlines;
+    } else {
+      l_line++;
+      if (c == ' ' || c == '\t') {
+        if (l_first_word == 0) {
+          /* whitespace before the first word of the line counts as indent;
+           * NOTE(review): 2 is hard-coded although TAB_LEN exists. */
+          indent += c == '\t' ? 2 : 1;
+        } else {
+          /* whitespace after the first word: the first word is complete */
+          n_words = 1;
+        }
+      } else {
+        n_newlines = 0;
+        /* still inside the first word of the line: measure its length */
+        if (n_words == 0)
+          l_first_word++;
+      }
+    }
+
+    /* push point if we have one */
+    if (current > 0) {
+      push(points, hd, &tl, size, current, offset + pos);
+    }
+
+    /* cut if we have to */
+    /* NOTE(review): points[hd] is read here before checking that the list
+     * is non-empty (the assert comes only afterwards). */
+    if (pos == size || points[hd].confidence > MIN_CONFIDENCE) {
+      assert(hd != tl);
+      /* NOTE(review): `best` is not declared anywhere in this function. */
+      printf("-- %d %d %f\n",
+          nchunk, size + best - pos % size, points[hd].confidence);
+      /* NOTE(review): i already starts at offset, so chunk[offset + i] adds
+       * offset twice -- confirm the intended index. */
+      for (i = offset; i < offset + best ; i++)
+        putchar(chunk[offset + i]);
+      offset = points[hd].position;
+      nchunk++;
+      /* NOTE(review): parses as hd + (1 % size), so hd never wraps around. */
+      hd = hd + 1 % size;
+    }
+    last = c;
+  }
+  /* NOTE(review): the function's final return and closing brace are
+   * missing, and chunk and points are never freed. */
+
+
+
+
+
+
+/* Entry point. Expects exactly one argument: the chunk size in bytes, as a
+ * positive decimal integer that fits in an int.
+ * On a missing or invalid argument, prints the usage message and exits
+ * with status 1 (via usage()); otherwise returns the result of split().
+ */
+int main(int argc, char **argv) {
+  char *end;
+  long value;
+
+  if (argc != 2) usage(argv);
+
+  /* strtol instead of atoi: rejects trailing garbage and reports overflow. */
+  errno = 0;
+  value = strtol(argv[1], &end, 10);
+  if (end == argv[1] || *end != '\0' || errno == ERANGE ||
+      value <= 0 || value > INT_MAX)
+    usage(argv);
+
+  return split((int)value);
+}
+

	nlsplit split natural language text in chunks at reasonable language boundaries
	git clone https://a3nm.net/git/nlsplit/
	Log \| Files \| Refs \| README