commit a17b7c76a1711b656324f9839b757d9c5db030d8
parent 42f3053e824f20b198a2bd4e0485f347b4f60949
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Wed, 7 Sep 2011 23:47:58 +0200
cleanup, comment
Diffstat:
nlsplit.c | | | 303 | ++++++++++++++++++++++++++++++++++++++++--------------------------------------- |
1 file changed, 153 insertions(+), 150 deletions(-)
diff --git a/nlsplit.c b/nlsplit.c
@@ -1,135 +1,188 @@
+/* nlsplit by a3nm (2011) */
+
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
-/* Split latin alphabet natural language text in fragments below a
- * maximal size, with simple sensible heuristics. */
+const char help[] =
+ "Split natural language text from stdin in pieces less than SIZE bytes.\n"
+ "If MINCONFIDENCE is set, always cut above this confidence threshold.\n";
-/* Above this confidence value, we always split */
+/* above this confidence value, we always split */
float min_confidence = -1;
+/* maximal size of pieces */
+int size;
+
+/* for tab-space equivalence in indentation counts */
#define TAB_LEN 2
-#define EOF_SCORE 999999.
-#define MULTIPLE_LINE_SCORE 1000.
+/* score of the last split */
+#define EOF_SCORE 999999
+/* score per \n, for multiple \n's */
+#define MULTIPLE_LINE_SCORE 1000
+/* if file contains lines longer than this, then one \n is a safer split */
#define LONG_LINES_THRESHOLD 100
+/* if first word of line fits on previous line with this margin, safer split */
#define DELTA_THRESHOLD 10
+/* if indentation changes, safer split */
#define INDENT_CHANGE_SCORE 18
-#define MAX_PUNCT_SITUATION_SCORE 3
+/* punctuations where split is safe */
#define FINAL_PUNCT_SCORE 20
+/* punctuations where split is less safe */
#define SEMIFINAL_PUNCT_SCORE 18
-#define OTHER_PUNCT_SCORE 15
+/* punctuations where split isn't really safe */
+#define OTHER_PUNCT_SCORE 10
+/* whitespace and case configurations can help punctuation splits up to this limit */
+#define MAX_PUNCT_SITUATION_SCORE 3
+/* one \n with next line starting by non-lowercase is a safer split */
#define NEWLINE_WITH_NON_LOWERCASE_SCORE 3
+/* \n is a last-resort split */
#define NEWLINE_SCORE 2
+/* whitespace is a last-resort split */
#define WHITESPACE_SCORE 1
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define MID(l, r, s) ((((l) + ((r) + ((l) > (r) ? (s) : 0)))/2) % (s))
+/* a possible split point */
typedef struct point {
float confidence;
long position;
} point;
void usage(char** argv) {
- fprintf(stderr, "Usage: %s BYTES\n", argv[0]);
+ /* show usage and exit */
+ fprintf(stderr, "Usage: %s SIZE [MINCONFIDENCE]\n", argv[0]);
+ fprintf(stderr, help);
exit(1);
}
-// TODO: keep last insertion to insert new entry with combined score
-int push2(point *points, int hd, int tl, int *otl, int size, float confidence, long position) {
- //printf(". %d %d\n", hd, tl);
- assert(hd < size);
- assert(tl < size);
- if (hd == tl) {
- points[tl].position = position;
- if (tl == *otl) {
- points[tl].confidence = confidence;
- //tl = tl + 1 % size;
- } else {
- points[tl].confidence = confidence;
- }
- *otl = (tl + 1) % size;
- assert(*otl < size);
- return tl;
- } else {
- int m = ((hd + (tl + (hd > tl ? size : 0)))/2) % size;
- if (points[m].confidence <= confidence)
- return push2(points, hd, m, otl, size, confidence, position);
- else
- return push2(points, (m+1) % size, tl, otl, size, confidence, position);
- }
+float punct_score(char c) {
+ /* get score of a punctuation */
+ if (c == '?' || c == '!' || c == '.') return FINAL_PUNCT_SCORE;
+ if (c == ',' || c == ';' || c == ':') return SEMIFINAL_PUNCT_SCORE;
+ return OTHER_PUNCT_SCORE;
}
-float get(point *points, int hd, int tl, int size, long position) {
+float get(point *points, int hd, int tl, long position) {
+ /* binary search of position in circular buffer (points, hd, tl) */
+ /* return the confidence, or 0 if none is found */
+
if (hd == tl) return 0;
- int m = ((hd + tl + (hd > tl ? size : 0))/2) % size;
+ int m = MID(hd, tl, size);
if (points[m].position == position)
return points[m].confidence;
if (points[m].position > position)
- return get(points, hd, m, size, position);
+ return get(points, hd, m, position);
else
- return get(points, (m+1) % size, tl, size, position);
+ return get(points, (m+1) % size, tl, position);
}
+int push2(point *points, int hd, int tl, float confidence, long position) {
+ /* insert (confidence, position) in circular buffer (points, hd, tl) */
+ /* buffer is kept sorted, existing elements below confidence are erased) */
+ /* insertion position is found by binary search on confidence */
+ /* return new value for tl */
-int push(point *points, int hd, int *tl, int size, float confidence, long position) {
assert(hd < size);
- assert(*tl < size);
- //printf("push %f %ld (%d %d)!\n", confidence, position, hd, *tl);
- float old = get(points, hd, *tl, size, position);
- return push2(points, hd, *tl, tl, size, confidence + old, position);
+ assert(tl < size);
+ if (hd == tl) {
+ /* insert at tl */
+ points[tl].position = position;
+ points[tl].confidence = confidence;
+ /* increment otl */
+ return (tl + 1) % size;
+ }
+ int m = MID(hd, tl, size);
+ /* in case of tie, drop elements */
+ if (points[m].confidence <= confidence)
+ return push2(points, hd, m, confidence, position);
+ else
+ return push2(points, (m+1) % size, tl, confidence, position);
}
-float punct_score(char c) {
- /* final punctuation */
- if (c == '?' || c == '!' || c == '.') return FINAL_PUNCT_SCORE;
- if (c == ',' || c == ';' || c == ':') return SEMIFINAL_PUNCT_SCORE;
- return OTHER_PUNCT_SCORE;
+int push(point *points, int hd, int *tl, float confidence, long position, long offset) {
+ /* insert (confidence, position) in circular buffer (points, hd, tl) */
+ /* if position is already in buffer, insert sum of confidence values */
+ /* perform update of tl, return new value of tl */
+
+ //printf("push %f %ld (%d %d)!\n", confidence, position, hd, *tl);
+ assert(hd < size);
+ assert(*tl < size);
+ // TODO check order
+ /* refuse insert of positions before offset */
+ if (position <= offset) return *tl;
+ float old = get(points, hd, *tl, position);
+ return (*tl = push2(points, hd, *tl, confidence + old, position));
}
-int split(int size) {
- char *chunk;
+int split() {
+ /* perform the split */
+
+ /* counter of pieces */
+ int npiece = 0;
+
+ /* circular buffer for the current piece */
+ char *piece;
+ /* offset is the absolute offset relative to BOF */
+ /* pos is the offset within the piece */
+ int offset = 0, pos = 0;
+
+ /* circular sorted buffer for the current split points */
+ /* this is kept sorted by decreasing confidence and increasing position */
point *points;
- int hd = 0, tl = 0; // position of head and tail in the circular list points
+ /* head and tail of the buffer */
+ int hd = 0, tl = 0;
+
+ /* current and last character */
char c, last = 0;
- int pos = 0, offset = 0;
+
+ /* confidence for split at current position */
float current = 0;
- int nchunk = 0;
- int indent = 0, last_indent = 0, n_newlines = 0, l_line = 0, l_last_line = 0,
- max_l_line = 0, l_first_word = 0;
- int n_words = 0; // will be 0 or 1
- int word_has_uppercase;
+
+ /* number of successive newlines, indentation of current and last line */
+ int n_newlines = 0, indent = 0, last_indent = 0;
+
+ /* length of current and last line, maximal line length */
+ int l_line = 0, l_last_line = 0, max_l_line = 0;
+
+ /* length of first word, number of words on line, uppercase in current word */
+ int l_first_word = 0, n_words = 0, word_has_uppercase = 0;
+
+ /* last punctuation seen, number of whitespace characters after punctuation */
int last_punct = 0, whitespace_after_punct = 0;
+
int i;
- chunk = malloc(size * sizeof(char));
+ /* allocate memory */
+ piece = malloc(size * sizeof(char));
points = malloc(size * sizeof(point));
- if (!chunk || !points) {
+ if (!piece || !points) {
perror("malloc");
return 1;
}
+ /* read file, break after splitting at EOF */
while (1) {
- c = getchar();
- chunk[(offset + (pos++)) % size] = c;
+ piece[(offset + (pos++)) % size] = (c = getchar());
+
current = 0;
+ if (c == '\n' || c == ' ' || c == '\t')
+ if (last_punct) whitespace_after_punct++;
+
if (c == '\n') {
n_words = 0;
l_first_word = 0;
last_indent = indent;
indent = 0;
- max_l_line = MAX(max_l_line, l_line);
l_last_line = l_line;
l_line = 0;
- if (last == '\n')
- n_newlines++;
- else
- n_newlines = 1;
- // TODO more value to single lines when max_line_len is >> 80
- //current += LINE_VALUE * n_newlines;
+ max_l_line = MAX(max_l_line, l_line);
+ n_newlines++;
if (n_newlines > 1) {
current += MULTIPLE_LINE_SCORE * n_newlines;
} else {
@@ -137,55 +190,54 @@ int split(int size) {
current += (max_l_line - LONG_LINES_THRESHOLD);
current += NEWLINE_SCORE;
}
- if (last_punct) whitespace_after_punct++;
} else {
+ n_newlines = 0;
if (c == ' ' || c == '\t') {
current += WHITESPACE_SCORE;
if (l_first_word == 0) {
+ /* we are still in indentation */
indent += c == '\t' ? 2 : 1;
} else {
if (!n_words) {
+ /* we have just read the first word */
int delta = max_l_line - l_last_line - l_first_word - 1;
//printf("maxlline %d llastline %d lfirstword %d lline %d offset %d pos %d delta %d\n",
// max_l_line, l_last_line, l_first_word, l_line, offset, pos, delta);
if (delta > DELTA_THRESHOLD) {
- // first word of current line would fit on previous line
- if (offset + pos - l_line - 1 > offset)
- push(points, hd, &tl, size, delta, offset + pos - l_line - 1);
+ /* first word of current line would fit on previous line */
+ push(points, hd, &tl, delta, offset + pos - l_line - 1, offset);
}
}
- n_words = 1;
+ n_words += 1;
}
- if (last_punct) whitespace_after_punct++;
} else {
if (last_punct) {
- // the best situation for a punctuation is to be followed by
- // much whitespace and then an uppercase letter, and have a
- // preceding word without uppercase ("Mr. Brown")
- int situation_score = MIN(whitespace_after_punct + ((c >= 'A' && c <= 'Z') ? 1 : 0) + (word_has_uppercase ? 0 : 1), MAX_PUNCT_SITUATION_SCORE);
- if (offset + pos - whitespace_after_punct > offset) {
- push(points, hd, &tl, size, situation_score + punct_score(last_punct), offset + pos - whitespace_after_punct);
- }
+ /* punctuation followed by much whitespace and an uppercase letter */
+ /* and having a preceding word without uppercase is safer */
+ int situation_score = MIN(MAX_PUNCT_SITUATION_SCORE,
+ whitespace_after_punct +
+ ((c >= 'A' && c <= 'Z') ? 1 : 0) + (word_has_uppercase ? 0 : 1));
+ push(points, hd, &tl, situation_score + punct_score(last_punct),
+ offset + pos - whitespace_after_punct, offset);
whitespace_after_punct = 0;
last_punct = 0;
}
- // TODO if first word fits
if (indent != last_indent)
- if (offset + pos - l_line > offset)
- push(points, hd, &tl, size, INDENT_CHANGE_SCORE + abs(indent - last_indent), offset + pos - l_line);
- n_newlines = 0;
+ push(points, hd, &tl, INDENT_CHANGE_SCORE + abs(indent -
+ last_indent), offset + pos - l_line, offset);
if (!n_words && !l_first_word)
- // first char of the line
+ /* first char of the line */
if (!(c >= 'a' && c <= 'z'))
- if (offset + pos - l_line > offset)
- push(points, hd, &tl, size, NEWLINE_WITH_NON_LOWERCASE_SCORE - NEWLINE_SCORE , offset + pos - l_line);
+ push(points, hd, &tl, NEWLINE_WITH_NON_LOWERCASE_SCORE -
+ NEWLINE_SCORE , offset + pos - l_line, offset);
if (n_words == 0)
l_first_word++;
if (c >= 'a' && c <= 'z' && (last == ' ' || last == '\t' || last == '\n'))
word_has_uppercase = 0;
if (c >= 'A' && c <= 'Z')
word_has_uppercase = 1;
+ // TODO more punct
if (c == '!' || c == '.' || c == ',' || c == ';' || c == '?' || c == '(' || c == ')')
last_punct = c;
}
@@ -194,108 +246,59 @@ int split(int size) {
/* push point if we have one */
if (current > 0) {
- push(points, hd, &tl, size, current, offset + pos);
+ push(points, hd, &tl, current, offset + pos, offset);
}
assert(pos <= size);
/* cut if we have to */
if (c == EOF || pos == size ||
- (hd != tl && min_confidence > 0 && points[hd].confidence > min_confidence)) {
- // pop old entries
+ (hd != tl && min_confidence > 0 && points[hd].confidence >= min_confidence)) {
+ /* pop old entries */
while (hd != tl && points[hd].position <= offset)
hd = (hd + 1) % size;
if (hd == tl) {
- // we have no possibility to cut, we must add one
- push(points, hd, &tl, size, 0, offset + pos);
+ /* we have no points, we must create one */
+ push(points, hd, &tl, 0, offset + pos, offset);
assert(points[hd].position > offset);
}
assert(points[hd].position > offset);
if (c == EOF) {
- // special EOF cut
+ /* special EOF cut */
points[hd].confidence = EOF_SCORE;
points[hd].position = offset + pos - 1;
}
//printf("== %d %d\n", offset, pos);
- printf("-- %d %ld %f\n",
- nchunk, points[hd].position - offset, points[hd].confidence);
+ printf("-- piece %d length %ld confidence %f\n",
+ npiece, points[hd].position - offset, points[hd].confidence);
+ /* output the data */
for (i = offset; i < points[hd].position ; i++)
- putchar(chunk[i % size]);
+ putchar(piece[i % size]);
putchar('\n');
+ /* update offset and pos */
pos = (offset + pos) - points[hd].position;
assert(pos < size);
offset = points[hd].position;
//printf("== %d %d\n", offset, pos);
- nchunk++;
+ /* pop the point */
hd = (hd + 1) % size;
+ /* increment piece counter */
+ npiece++;
}
last = c;
if (c == EOF) break;
}
- //printf("==done %d %d\n", offset, pos);
-
- return 0;
-}
-
-void print_points(point *points, int hd, int tl, int size) {
- int i;
- printf("size %d hd %d tl %d\n", size, hd, tl);
- for (i=0; i<(tl-hd+size)%size; i++)
- printf("> item %d pos %d; %f %ld\n", i, (hd+i)%size, points[(hd+i)%size].confidence, points[(hd+i)%size].position);
-}
-int tests() {
- float f;
- const int size = 3;
- int pos;
- point p[size];
- int hd = 0, tl = 0;
- f = get(p, hd, tl, size, 2);
- print_points(p, hd, tl, size);
- push2(p, hd, tl, &tl, size, 1, 1);
- print_points(p, hd, tl, size);
- f = get(p, hd, tl, size, 1);
- printf("%f\n", f);
- push(p, hd, &tl, size, 1, 1);
- print_points(p, hd, tl, size);
- push(p, hd, &tl, size, 1, 1);
- print_points(p, hd, tl, size);
- f = get(p, hd, tl, size, 2);
- printf("%f\n", f);
- push(p, hd, &tl, size, 2, 4);
- print_points(p, hd, tl, size);
- push(p, hd, &tl, size, 3, 2);
- print_points(p, hd, tl, size);
- push(p, hd, &tl, size, 4, 1);
- print_points(p, hd, tl, size);
- printf("pop!\n");
- hd = (hd + 1) % size;
- push(p, hd, &tl, size, 4, 2);
- print_points(p, hd, tl, size);
- pos = push(p, hd, &tl, size, 1, 4242);
- printf("inserted at %d\n", pos);
- print_points(p, hd, tl, size);
- printf("pop!\n");
- hd = (hd + 1) % size;
- push(p, hd, &tl, size, 4, 4);
- print_points(p, hd, tl, size);
- printf("pop!\n");
- hd = (hd + 1) % size;
return 0;
}
int main(int argc, char **argv) {
- int size;
if (argc < 2 || argc > 3) usage(argv);
size = atoi(argv[1]);
- if (argc == 3) {
- min_confidence = atoi(argv[2]);
- if (!min_confidence) usage(argv);
- }
-
if (size <= 0) usage(argv);
+ if (argc == 3) min_confidence = atoi(argv[2]);
+ if (!min_confidence) usage(argv);
- //return tests();
return split(size);
}