commit 337d2eec0e332936ebd4e8317dd8495f8fb175d4
parent 6e691b2400272e5c72d403124118a6c32f923f12
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Wed, 7 Sep 2011 20:32:12 +0200
first version
Diffstat:
nlsplit.c | | | 202 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------- |
1 file changed, 169 insertions(+), 33 deletions(-)
diff --git a/nlsplit.c b/nlsplit.c
@@ -1,15 +1,25 @@
#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
/* Split latin alphabet natural language text in fragments below a
* maximal size, with simple sensible heuristics. */
/* Above this confidence value, we always split */
/* TODO make it a CLI arg */
-#define MIN_CONFIDENCE 2
+#define MIN_CONFIDENCE 99999
#define TAB_LEN 2
+#define EOF_CONFIDENCE 999
+#define MAX_PUNCT_SITUATION_SCORE 3
+#define LINE_VALUE 100
+#define WHITESPACE_SCORE 1
+#define FINAL_PUNCT_SCORE 20
+#define SEMIFINAL_PUNCT_SCORE 18
+#define OTHER_PUNCT_SCORE 15
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
typedef struct point {
float confidence;
@@ -21,56 +31,72 @@ void usage(char** argv) {
exit(1);
}
-
// TODO: keep last insertion to insert new entry with combined score
int push2(point *points, int hd, int tl, int *otl, int size, float confidence, long position) {
+ //printf(". %d %d\n", hd, tl);
+ assert(hd < size);
+ assert(tl < size);
if (hd == tl) {
points[tl].position = position;
- if (tl == *otl && points[tl].confidence > confidence) {
+ if (tl == *otl) {
points[tl].confidence = confidence;
- tl = tl + 1 % size;
+ //tl = tl + 1 % size;
} else {
points[tl].confidence = confidence;
}
- *otl = tl;
+ *otl = (tl + 1) % size;
+ assert(*otl < size);
return tl;
} else {
- if (hd > tl) tl += size;
- int m = (hd + tl + (hd > tl ? size : 0))/2 % size;
+ int m = ((hd + (tl + (hd > tl ? size : 0)))/2) % size;
if (points[m].confidence <= confidence)
return push2(points, hd, m, otl, size, confidence, position);
else
- return push2(points, m, tl, otl, size, confidence, position);
+ return push2(points, (m+1) % size, tl, otl, size, confidence, position);
}
}
-int push(point *points, int hd, int *tl, int size, float confidence, long position) {
- return push2(points, hd, *tl, tl, size, confidence + get(points, hd, (tl, size, position)), position);
-}
-
float get(point *points, int hd, int tl, int size, long position) {
if (hd == tl) return 0;
- int m = (hd + tl + (hd > tl ? size : 0))/2 % size;
+ int m = ((hd + tl + (hd > tl ? size : 0))/2) % size;
if (points[m].position == position)
return points[m].confidence;
if (points[m].position > position)
- return get(points, hd, m, otl, size, confidence, position);
+ return get(points, hd, m, size, position);
else
- return get(points, m, tl, otl, size, confidence, position);
+ return get(points, (m+1) % size, tl, size, position);
+}
+
+
+int push(point *points, int hd, int *tl, int size, float confidence, long position) {
+ assert(hd < size);
+ assert(*tl < size);
+ //printf("push %f %ld (%d %d)!\n", confidence, position, hd, *tl);
+ float old = get(points, hd, *tl, size, position);
+ return push2(points, hd, *tl, tl, size, confidence + old, position);
+}
+
+float punct_score(char c) {
+ /* final punctuation */
+ if (c == '?' || c == '!' || c == '.') return FINAL_PUNCT_SCORE;
+ if (c == ',' || c == ';' || c == ':') return SEMIFINAL_PUNCT_SCORE;
+ return OTHER_PUNCT_SCORE;
}
+
int split(int size) {
char *chunk;
point *points;
- int hd, tl; // position of head and tail in the circular list points
+ int hd = 0, tl = 0; // position of head and tail in the circular list points
char c, last = 0;
- state s;
int pos = 0, offset = 0;
- int current = 0;
+ float current = 0;
int nchunk = 0;
- int indent = 0; last_indent = 0, n_newlines = 0, l_line = 0,
- max_line = 0, l_first_word = 0;
+ int indent = 0, last_indent = 0, n_newlines = 0, l_line = 0, l_last_line = 0,
+ max_l_line = 0, l_first_word = 0;
int n_words = 0; // will be 0 or 1
+ int word_has_uppercase;
+ int last_punct = 0, whitespace_after_punct = 0;
int i;
chunk = malloc(size * sizeof(char));
@@ -80,7 +106,9 @@ int split(int size) {
return 1;
}
- while (chunk[offset + (pos++) % size] = (c = getchar())) {
+ while (1) {
+ c = getchar();
+ chunk[(offset + (pos++)) % size] = c;
current = 0;
if (c == '\n') {
@@ -88,26 +116,65 @@ int split(int size) {
l_first_word = 0;
last_indent = indent;
indent = 0;
- max_line = MAX(max_line, line);
+ max_l_line = MAX(max_l_line, l_line);
+ l_last_line = l_line;
l_line = 0;
if (last == '\n')
n_newlines++;
else
n_newlines = 1;
- current += n_newlines;
+ // TODO more value to single lines when max_line_len is >> 80
+ //current += LINE_VALUE * n_newlines;
+ if (n_newlines > 1)
+ current += LINE_VALUE * n_newlines;
+ if (last_punct) whitespace_after_punct++;
} else {
- l_line++;
if (c == ' ' || c == '\t') {
+ current += WHITESPACE_SCORE;
if (l_first_word == 0) {
indent += c == '\t' ? 2 : 1;
} else {
+ if (!n_words) {
+ int delta = max_l_line - l_last_line - l_first_word - 1;
+ //printf("maxlline %d llastline %d lfirstword %d lline %d offset %d pos %d delta %d\n",
+ // max_l_line, l_last_line, l_first_word, l_line, offset, pos, delta);
+ if (delta > 0) {
+ // first word of current line would fit on previous line
+ if (offset + pos - l_line - 1 > offset)
+ push(points, hd, &tl, size, delta, offset + pos - l_line - 1);
+ }
+ }
n_words = 1;
}
+ if (last_punct) whitespace_after_punct++;
} else {
+ if (last_punct) {
+ // the best situation for a punctuation is to be followed by
+ // much whitespace and then an uppercase letter, and have a
+ // preceding word without uppercase ("Mr. Brown")
+ int situation_score = MIN(whitespace_after_punct + ((c >= 'A' && c <= 'Z') ? 1 : 0) + (word_has_uppercase ? 0 : 1), MAX_PUNCT_SITUATION_SCORE);
+ if (offset + pos - whitespace_after_punct > offset) {
+ push(points, hd, &tl, size, situation_score + punct_score(last_punct), offset + pos - whitespace_after_punct);
+ }
+ whitespace_after_punct = 0;
+ last_punct = 0;
+ }
+
+ // TODO if first word fits
+ if (indent != last_indent)
+ if (offset + pos - l_line > offset)
+ push(points, hd, &tl, size, abs(indent - last_indent), offset + pos - l_line);
n_newlines = 0;
if (n_words == 0)
l_first_word++;
+ if (c >= 'a' && c <= 'z' && (last == ' ' || last == '\t' || last == '\n'))
+ word_has_uppercase = 0;
+ if (c >= 'A' && c <= 'Z')
+ word_has_uppercase = 1;
+ if (c == '!' || c == '.' || c == ',' || c == ';' || c == '?' || c == '(' || c == ')')
+ last_punct = c;
}
+ l_line++;
}
/* push point if we have one */
@@ -115,30 +182,99 @@ int split(int size) {
push(points, hd, &tl, size, current, offset + pos);
}
+ assert(pos <= size);
/* cut if we have to */
- if (pos == size || points[hd].confidence > MIN_CONFIDENCE) {
- assert(hd != tl);
- printf("-- %d %d %f\n",
- nchunk, size + best - pos % size, points[hd].confidence);
- for (i = offset; i < offset + best ; i++)
- putchar(chunk[offset + i]);
+ if (c == EOF || pos == size || (hd != tl && points[hd].confidence > MIN_CONFIDENCE)) {
+ // pop old entries
+ while (hd != tl && points[hd].position <= offset)
+ hd = (hd + 1) % size;
+ if (hd == tl) {
+ // we have no possibility to cut, we must add one
+ push(points, hd, &tl, size, 0, offset + pos);
+ assert(points[hd].position > offset);
+ }
+ assert(points[hd].position > offset);
+ if (c == EOF) {
+ // special EOF cut
+ push(points, hd, &tl, size, EOF_CONFIDENCE, offset + pos - 1);
+ }
+ // TODO wtf '%' ?
+ //printf("== %d %d\n", offset, pos);
+ printf("-- %d %ld %f\n",
+ nchunk, points[hd].position - offset, points[hd].confidence);
+ for (i = offset; i < points[hd].position ; i++)
+ putchar(chunk[i % size]);
+ putchar('\n');
+ pos = (offset + pos) - points[hd].position;
+ assert(pos < size);
offset = points[hd].position;
+ //printf("== %d %d\n", offset, pos);
nchunk++;
- hd = hd + 1 % size;
+ hd = (hd + 1) % size;
}
last = c;
+ if (c == EOF) break;
}
+ //printf("==done %d %d\n", offset, pos);
+ return 0;
+}
+void print_points(point *points, int hd, int tl, int size) {
+ int i;
+ printf("size %d hd %d tl %d\n", size, hd, tl);
+ for (i=0; i<(tl-hd+size)%size; i++)
+ printf("> item %d pos %d; %f %ld\n", i, (hd+i)%size, points[(hd+i)%size].confidence, points[(hd+i)%size].position);
+}
-
-
+int tests() {
+ float f;
+ const int size = 3;
+ int pos;
+ point p[size];
+ int hd = 0, tl = 0;
+ f = get(p, hd, tl, size, 2);
+ print_points(p, hd, tl, size);
+ push2(p, hd, tl, &tl, size, 1, 1);
+ print_points(p, hd, tl, size);
+ f = get(p, hd, tl, size, 1);
+ printf("%f\n", f);
+ push(p, hd, &tl, size, 1, 1);
+ print_points(p, hd, tl, size);
+ push(p, hd, &tl, size, 1, 1);
+ print_points(p, hd, tl, size);
+ f = get(p, hd, tl, size, 2);
+ printf("%f\n", f);
+ push(p, hd, &tl, size, 2, 4);
+ print_points(p, hd, tl, size);
+ push(p, hd, &tl, size, 3, 2);
+ print_points(p, hd, tl, size);
+ push(p, hd, &tl, size, 4, 1);
+ print_points(p, hd, tl, size);
+ printf("pop!\n");
+ hd = (hd + 1) % size;
+ push(p, hd, &tl, size, 4, 2);
+ print_points(p, hd, tl, size);
+ pos = push(p, hd, &tl, size, 1, 4242);
+ printf("inserted at %d\n", pos);
+ print_points(p, hd, tl, size);
+ printf("pop!\n");
+ hd = (hd + 1) % size;
+ push(p, hd, &tl, size, 4, 4);
+ print_points(p, hd, tl, size);
+ printf("pop!\n");
+ hd = (hd + 1) % size;
+ return 0;
+}
int main(int argc, char **argv) {
+ int size;
+
if (argc != 2) usage(argv);
size = atoi(argv[1]);
if (size <= 0) usage(argv);
+ //return tests();
return split(size);
}