commit 20444efabe6dfe60dd0e961c9d84c271e3ff6446
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Wed, 7 Sep 2011 00:17:50 +0200
initial commit
Diffstat:
nlsplit.c | | | 144 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
1 file changed, 144 insertions(+), 0 deletions(-)
diff --git a/nlsplit.c b/nlsplit.c
@@ -0,0 +1,144 @@
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Split Latin-alphabet natural-language text into fragments below a
+ * maximal size, using simple, sensible heuristics. */
+
+/* Above this confidence value, we always split: split() cuts as soon as
+ * the confidence of the point at the head of the list exceeds it. */
+/* TODO make it a CLI arg */
+#define MIN_CONFIDENCE 2
+
+/* NOTE(review): not referenced by the code in this file; split() adds a
+ * literal 2 to the indentation for a tab, which is presumably what this
+ * constant was meant for -- confirm. */
+#define TAB_LEN 2
+
+/* Larger of a and b. Each argument may be evaluated twice, so do not pass
+ * expressions with side effects. */
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
+/* A candidate split point, stored in the circular list handled by
+ * push(), push2() and get(). */
+typedef struct point {
+ float confidence; /* score of the candidate; split() compares it with MIN_CONFIDENCE */
+ long position; /* where the candidate lies in the input, as passed to push() */
+} point;
+
+/* Report the expected command line on stderr, naming the program after
+ * argv[0], then terminate the process with exit status 1. Never returns. */
+void usage(char** argv) {
+  const char *program = argv[0];
+
+  fprintf(stderr, "Usage: %s BYTES\n", program);
+  exit(1);
+}
+
+
+// TODO: keep last insertion to insert new entry with combined score
+/* Insert a candidate split point into the circular list `points`.
+ *
+ * The live entries occupy the slots hd, hd+1, ..., tl-1 (indices taken
+ * modulo `size`; hd == tl means the list is empty) and are kept sorted by
+ * strictly decreasing confidence. The new entry overwrites the first live
+ * entry whose confidence is not greater than `confidence` (or is appended
+ * when there is none), and every entry after it is dropped: those entries
+ * are no better than the new one, so they can never become the head.
+ *
+ * points      storage of the circular list, `size` slots
+ * hd, tl      head (inclusive) and tail (exclusive), both in [0, size)
+ * otl         receives the new tail
+ * size        number of slots, must be > 0
+ * confidence  score of the new entry
+ * position    position of the new entry; callers are expected to push
+ *             positions in non-decreasing order
+ *
+ * Returns the new tail, which is also stored in *otl. The caller must
+ * keep at least one slot free: a list holding `size` entries cannot be
+ * told apart from an empty one.
+ */
+int push2(point *points, int hd, int tl, int *otl, int size, float confidence, long position) {
+  /* Work on logical offsets from hd so that wrap-around does not
+   * complicate the bisection. */
+  int len = (tl >= hd) ? tl - hd : size - hd + tl;
+  int lo = 0, hi = len;
+  int slot;
+
+  /* Find the first entry whose confidence is <= the new confidence. */
+  while (lo < hi) {
+    int mid = lo + (hi - lo) / 2;
+    int idx = (int)(((long long)hd + mid) % size);
+
+    if (points[idx].confidence <= confidence)
+      hi = mid;
+    else
+      lo = mid + 1;
+  }
+
+  slot = (int)(((long long)hd + lo) % size);
+  points[slot].confidence = confidence;
+  points[slot].position = position;
+
+  /* The new entry becomes the last one of the list. */
+  tl = (int)(((long long)slot + 1) % size);
+  *otl = tl;
+  return tl;
+}
+
+/* Defined further down in this file. */
+float get(point *points, int hd, int tl, int size, long position);
+
+/* Record a candidate split point at `position`.
+ *
+ * If the list already holds an entry for `position`, its confidence is
+ * added to `confidence` before the insertion, so several heuristics firing
+ * on the same position reinforce each other.
+ *
+ * points      storage of the circular list, `size` slots
+ * hd          head of the list (inclusive)
+ * tl          in: tail of the list (exclusive); out: the new tail
+ * size        number of slots of the circular list
+ * confidence  score contributed by the caller
+ * position    position of the candidate
+ *
+ * Returns the new tail (the value stored in *tl).
+ */
+int push(point *points, int hd, int *tl, int size, float confidence, long position) {
+  float combined = confidence + get(points, hd, *tl, size, position);
+
+  return push2(points, hd, *tl, tl, size, combined, position);
+}
+
+/* Look up the confidence recorded for `position` in the circular list.
+ *
+ * The live entries occupy the slots hd, hd+1, ..., tl-1 (indices taken
+ * modulo `size`; hd == tl means the list is empty) and must be sorted by
+ * increasing position, since the lookup is a binary search.
+ *
+ * Returns the confidence of the entry at `position`, or 0 when there is
+ * no such entry.
+ */
+float get(point *points, int hd, int tl, int size, long position) {
+  /* Bisect on logical offsets from hd so wrap-around is handled in a
+   * single place. */
+  int len = (tl >= hd) ? tl - hd : size - hd + tl;
+  int lo = 0, hi = len;
+
+  while (lo < hi) {
+    int mid = lo + (hi - lo) / 2;
+    const point *p = &points[(int)(((long long)hd + mid) % size)];
+
+    if (p->position == position)
+      return p->confidence;
+    if (p->position > position)
+      hi = mid;
+    else
+      lo = mid + 1; /* skip mid itself, otherwise the search may never end */
+  }
+  return 0;
+}
+
+/* Write output chunk number `nchunk`: a header line made of the chunk
+ * number, its length in bytes and the confidence of the cut, followed by
+ * the bytes [start, stop) of the input, which are read from the circular
+ * buffer `chunk` of `size` bytes. */
+static void emit_chunk(const char *chunk, int size, long start, long stop,
+                       int nchunk, float confidence) {
+  long i;
+
+  printf("-- %d %d %f\n", nchunk, (int)(stop - start), confidence);
+  for (i = start; i < stop; i++)
+    putchar(chunk[i % size]);
+}
+
+/* Read standard input and write it back to standard output as a sequence
+ * of chunks of at most `size` bytes each, cutting at the best candidate
+ * split point seen so far.
+ *
+ * A cut happens as soon as the best candidate scores above MIN_CONFIDENCE,
+ * or when `size` bytes are pending. In the latter case the best candidate
+ * is used, or, when there is none (e.g. a line longer than `size`), the
+ * chunk is cut after exactly `size` bytes. Whatever is still pending at
+ * end of input is written as a last chunk.
+ *
+ * For now the only heuristic is the newline: a candidate is recorded after
+ * each newline, scored by the number of consecutive newlines ending there.
+ *
+ * Returns 0 on success and 1 on failure (invalid size, allocation failure
+ * or I/O error).
+ */
+int split(int size) {
+  char *chunk;   /* circular buffer holding the pending bytes */
+  point *points; /* circular list of candidates, best one at hd */
+  int cap;       /* number of slots in points */
+  int hd = 0, tl = 0; // position of head and tail in the circular list points
+  long start = 0; /* input offset of the first pending byte */
+  long end = 0;   /* input offset just past the last byte read */
+  int c, last = 0;
+  int current;
+  int nchunk = 0;
+  int indent = 0, last_indent = 0, n_newlines = 0, l_line = 0,
+      max_line = 0, l_first_word = 0;
+  int n_words = 0; // will be 0 or 1
+  int rc = 0;
+
+  if (size <= 0 || size == INT_MAX) {
+    fprintf(stderr, "split: invalid size %d\n", size);
+    return 1;
+  }
+
+  /* One slot more than the window size: up to `size` candidates can be
+   * pending, and the list needs one free slot to tell full from empty. */
+  cap = size + 1;
+
+  chunk = malloc((size_t)size);
+  points = calloc((size_t)cap, sizeof *points);
+  if (!chunk || !points) {
+    perror("malloc");
+    free(chunk);
+    free(points);
+    return 1;
+  }
+
+  while ((c = getchar()) != EOF) {
+    /* At most size - 1 bytes are pending here, so this slot is free. */
+    chunk[end % size] = (char)c;
+    end++;
+    current = 0;
+
+    if (c == '\n') {
+      n_words = 0;
+      l_first_word = 0;
+      last_indent = indent;
+      indent = 0;
+      max_line = MAX(max_line, l_line);
+      l_line = 0;
+      if (last == '\n')
+        n_newlines++;
+      else
+        n_newlines = 1;
+      current += n_newlines;
+    } else {
+      l_line++;
+      if (c == ' ' || c == '\t') {
+        if (l_first_word == 0) {
+          indent += c == '\t' ? TAB_LEN : 1;
+        } else {
+          n_words = 1;
+        }
+      } else {
+        n_newlines = 0;
+        if (n_words == 0)
+          l_first_word++;
+      }
+    }
+
+    /* push point if we have one */
+    if (current > 0) {
+      push(points, hd, &tl, cap, (float)current, end);
+    }
+
+    /* cut if we have to */
+    if (end - start == size
+        || (hd != tl && points[hd].confidence > MIN_CONFIDENCE)) {
+      long stop = end;
+      float confidence = 0;
+
+      if (hd != tl) {
+        /* Cut at the best candidate and drop it from the list; the
+         * remaining candidates all lie further in the input. */
+        stop = points[hd].position;
+        confidence = points[hd].confidence;
+        hd = (hd + 1) % cap;
+      }
+      emit_chunk(chunk, size, start, stop, nchunk, confidence);
+      start = stop;
+      nchunk++;
+    }
+    last = c;
+  }
+
+  if (ferror(stdin)) {
+    perror("getchar");
+    rc = 1;
+  }
+
+  /* Flush what is left; it is shorter than `size` by construction. */
+  if (end > start) {
+    float confidence = 0;
+
+    if (hd != tl) {
+      /* The most recent candidate is the last entry of the list; report
+       * its score when the input ends exactly on it. */
+      int newest = (tl == 0) ? cap - 1 : tl - 1;
+
+      if (points[newest].position == end)
+        confidence = points[newest].confidence;
+    }
+    emit_chunk(chunk, size, start, end, nchunk, confidence);
+  }
+
+  if (fflush(stdout) == EOF || ferror(stdout)) {
+    perror("stdout");
+    rc = 1;
+  }
+
+  /* Gathered for heuristics that are not wired in yet. */
+  (void)last_indent;
+  (void)max_line;
+
+  free(points);
+  free(chunk);
+  return rc;
+}
+
+
+
+
+
+
+/* Entry point: expects exactly one argument, the maximal chunk size in
+ * bytes, which must be a positive decimal integer that fits in an int.
+ * Prints the usage message and exits with status 1 on a bad command line;
+ * otherwise returns the status of split(). */
+int main(int argc, char **argv) {
+  char *end;
+  long value;
+
+  if (argc != 2) usage(argv);
+
+  /* strtol rather than atoi: it reports overflow and lets us reject
+   * empty input and trailing garbage such as "12abc". */
+  errno = 0;
+  value = strtol(argv[1], &end, 10);
+  if (errno != 0 || end == argv[1] || *end != '\0'
+      || value <= 0 || value > INT_MAX)
+    usage(argv);
+
+  return split((int)value);
+}
+