nlsplit

Split natural-language text into chunks at reasonable language boundaries
git clone https://a3nm.net/git/nlsplit/
Log | Files | Refs | README

commit 20444efabe6dfe60dd0e961c9d84c271e3ff6446
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed,  7 Sep 2011 00:17:50 +0200

initial commit

Diffstat:
nlsplit.c | 144+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 144 insertions(+), 0 deletions(-)

/* nlsplit.c */
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Split latin alphabet natural language text in fragments below a
 * maximal size, with simple sensible heuristics.
 *
 * The input is read from stdin. Each fragment is written to stdout as a
 * header line "-- INDEX LENGTH CONFIDENCE\n" immediately followed by
 * exactly LENGTH bytes of text (no separator is added after the text, so
 * readers must rely on LENGTH).
 *
 * Candidate split points are kept in a circular array used as a deque:
 * the live entries are the slots from `hd` (inclusive) to `tl`
 * (exclusive), wrapping around the capacity; hd == tl means "empty".
 * From head to tail the entries have strictly decreasing confidence and
 * strictly increasing position, so the head is always the best candidate
 * and removing it leaves only later candidates. */

/* Above this confidence value, we always split */
/* TODO make it a CLI arg */
#define MIN_CONFIDENCE 2

/* A candidate split point. */
typedef struct point {
  float confidence; /* quality of the boundary; higher is better */
  long position;    /* input offset of the first byte after the boundary */
} point;

/* Print the command-line synopsis on stderr and exit with status 1. */
void usage(char **argv) {
  fprintf(stderr, "Usage: %s BYTES\n", argv[0]);
  exit(1);
}

/* Return the confidence stored for `position` among the live entries
 * [hd, tl) of the circular array `points` (capacity `size`), or 0 if no
 * entry has that position. Relies on positions increasing from head to
 * tail (binary search). */
float get(point *points, int hd, int tl, int size, long position) {
  int len = tl - hd;
  if (len < 0)
    len += size; /* the live range wraps around the end of the array */

  int lo = 0, hi = len; /* logical offsets from hd */
  while (lo < hi) {
    int mid = lo + (hi - lo) / 2;
    const point *p = &points[(hd + mid) % size];
    if (p->position == position)
      return p->confidence;
    if (p->position > position)
      hi = mid;
    else
      lo = mid + 1;
  }
  return 0;
}

/* Insert a candidate with the given confidence and position into the
 * live entries [hd, tl) of the circular array `points` (capacity `size`).
 *
 * The new entry is stored in the first slot whose confidence is lower
 * than or equal to `confidence`, and every entry from that slot onwards
 * is dropped: an older candidate that is not strictly better than the
 * new one would never be preferred to it. The new tail index is written
 * to *otl and returned.
 *
 * The caller must push positions in non-decreasing order and must make
 * sure the array has room for one more entry than it can ever hold. */
int push2(point *points, int hd, int tl, int *otl, int size, float confidence, long position) {
  int len = tl - hd;
  if (len < 0)
    len += size;

  /* Confidence decreases from head to tail, so the predicate
   * "entry <= confidence" is false then true: find the first true. */
  int lo = 0, hi = len;
  while (lo < hi) {
    int mid = lo + (hi - lo) / 2;
    if (points[(hd + mid) % size].confidence <= confidence)
      hi = mid;
    else
      lo = mid + 1;
  }

  int slot = (hd + lo) % size;
  points[slot].confidence = confidence;
  points[slot].position = position;

  int new_tail = (slot + 1) % size;
  *otl = new_tail;
  return new_tail;
}

/* Record a candidate split point. If `position` already has an entry,
 * its confidence is added to `confidence` and the entry is replaced by
 * the combined one. *tl is updated to the new tail, which is also
 * returned. `confidence` is expected to be non-negative. */
int push(point *points, int hd, int *tl, int size, float confidence, long position) {
  float total = confidence + get(points, hd, *tl, size, position);
  return push2(points, hd, *tl, tl, size, total, position);
}

/* Write fragment number `nchunk`, made of the buffered bytes at input
 * offsets [from, to), preceded by its header line. The byte at input
 * offset p lives in chunk[p % size]. */
static void emit_chunk(const unsigned char *chunk, int size, int nchunk,
                       long from, long to, float confidence) {
  printf("-- %d %d %f\n", nchunk, (int)(to - from), confidence);
  for (long i = from; i < to; i++)
    putchar(chunk[i % size]);
}

/* Read stdin and write it to stdout as fragments of at most `size`
 * bytes.
 *
 * A newline is a candidate split point (the fragment ends after it)
 * whose confidence is the number of consecutive newlines seen so far.
 * A fragment is cut at the best candidate as soon as that candidate's
 * confidence exceeds MIN_CONFIDENCE, or when `size` bytes are buffered.
 * If the buffer is full and holds no candidate, it is cut at the size
 * limit. Cuts at the size limit and the final fragment at end of input
 * are reported with confidence 0.
 *
 * Note that in a run of more than MIN_CONFIDENCE + 1 newlines, every
 * extra newline is above the threshold and becomes a fragment of its own.
 *
 * Returns 0 on success, 1 on invalid size, allocation or I/O failure. */
int split(int size) {
  if (size <= 0 || size == INT_MAX)
    return 1;

  /* One spare slot, so that a list holding `size` candidates is not
   * mistaken for an empty one (hd == tl). */
  int cap = size + 1;

  unsigned char *chunk = malloc((size_t)size);
  point *points = calloc((size_t)cap, sizeof *points);
  if (!chunk || !points) {
    perror("malloc");
    free(chunk);
    free(points);
    return 1;
  }

  long start = 0;     /* input offset of the first byte not yet written */
  long pos = 0;       /* input offset of the next byte to read */
  int hd = 0, tl = 0; /* head and tail of the candidate list */
  int nchunk = 0;
  int n_newlines = 0; /* length of the current run of newlines */
  int c;

  while ((c = getchar()) != EOF) {
    chunk[pos % size] = (unsigned char)c;
    pos++;

    float current = 0; /* confidence that we can split after this byte */
    if (c == '\n') {
      n_newlines++;
      current += n_newlines;
    } else {
      n_newlines = 0;
    }

    /* push point if we have one */
    if (current > 0)
      push(points, hd, &tl, cap, current, pos);

    /* cut if we have to */
    int full = (pos - start == size);
    int sure = (hd != tl && points[hd].confidence > MIN_CONFIDENCE);
    if (full || sure) {
      long cut = pos; /* no candidate: cut at the size limit */
      float confidence = 0;
      if (hd != tl) {
        cut = points[hd].position;
        confidence = points[hd].confidence;
        hd = (hd + 1) % cap;
      }
      emit_chunk(chunk, size, nchunk, start, cut, confidence);
      nchunk++;
      start = cut;
    }
  }

  /* Whatever is still buffered at end of input is the last fragment. */
  if (pos > start)
    emit_chunk(chunk, size, nchunk, start, pos, 0);

  int status = 0;
  if (ferror(stdin)) {
    perror("getchar");
    status = 1;
  }
  if (fflush(stdout) == EOF) {
    perror("fflush");
    status = 1;
  }

  free(chunk);
  free(points);
  return status;
}

/* Usage: nlsplit BYTES -- BYTES is the maximal fragment size, a positive
 * decimal integer smaller than INT_MAX. */
int main(int argc, char **argv) {
  if (argc != 2)
    usage(argv);

  errno = 0;
  char *end;
  long size = strtol(argv[1], &end, 10);
  if (end == argv[1] || *end != '\0' || errno == ERANGE ||
      size <= 0 || size >= INT_MAX)
    usage(argv);

  return split((int)size);
}