nlsplit

split natural language text into chunks at reasonable language boundaries
git clone https://a3nm.net/git/nlsplit/
Log | Files | Refs | README

commit 42f3053e824f20b198a2bd4e0485f347b4f60949
parent 337d2eec0e332936ebd4e8317dd8495f8fb175d4
Author: Antoine Amarilli <a3nm@a3nm.net>
Date:   Wed,  7 Sep 2011 22:37:29 +0200

refactored, seems to work

Diffstat:
nlsplit.c | 47++++++++++++++++++++++++++++++++++-------------
1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/nlsplit.c b/nlsplit.c @@ -6,17 +6,22 @@ * maximal size, with simple sensible heuristics. */ /* Above this confidence value, we always split */ -/* TODO make it a CLI arg */ -#define MIN_CONFIDENCE 99999 +float min_confidence = -1; #define TAB_LEN 2 -#define EOF_CONFIDENCE 999 + +#define EOF_SCORE 999999. +#define MULTIPLE_LINE_SCORE 1000. +#define LONG_LINES_THRESHOLD 100 +#define DELTA_THRESHOLD 10 +#define INDENT_CHANGE_SCORE 18 #define MAX_PUNCT_SITUATION_SCORE 3 -#define LINE_VALUE 100 -#define WHITESPACE_SCORE 1 #define FINAL_PUNCT_SCORE 20 #define SEMIFINAL_PUNCT_SCORE 18 #define OTHER_PUNCT_SCORE 15 +#define NEWLINE_WITH_NON_LOWERCASE_SCORE 3 +#define NEWLINE_SCORE 2 +#define WHITESPACE_SCORE 1 #define MAX(a, b) (((a) > (b)) ? (a) : (b)) #define MIN(a, b) (((a) < (b)) ? (a) : (b)) @@ -125,8 +130,13 @@ int split(int size) { n_newlines = 1; // TODO more value to single lines when max_line_len is >> 80 //current += LINE_VALUE * n_newlines; - if (n_newlines > 1) - current += LINE_VALUE * n_newlines; + if (n_newlines > 1) { + current += MULTIPLE_LINE_SCORE * n_newlines; + } else { + if (max_l_line > LONG_LINES_THRESHOLD) + current += (max_l_line - LONG_LINES_THRESHOLD); + current += NEWLINE_SCORE; + } if (last_punct) whitespace_after_punct++; } else { if (c == ' ' || c == '\t') { @@ -138,7 +148,7 @@ int split(int size) { int delta = max_l_line - l_last_line - l_first_word - 1; //printf("maxlline %d llastline %d lfirstword %d lline %d offset %d pos %d delta %d\n", // max_l_line, l_last_line, l_first_word, l_line, offset, pos, delta); - if (delta > 0) { + if (delta > DELTA_THRESHOLD) { // first word of current line would fit on previous line if (offset + pos - l_line - 1 > offset) push(points, hd, &tl, size, delta, offset + pos - l_line - 1); @@ -163,8 +173,13 @@ int split(int size) { // TODO if first word fits if (indent != last_indent) if (offset + pos - l_line > offset) - push(points, hd, &tl, size, abs(indent - last_indent), offset + pos - l_line); 
+ push(points, hd, &tl, size, INDENT_CHANGE_SCORE + abs(indent - last_indent), offset + pos - l_line); n_newlines = 0; + if (!n_words && !l_first_word) + // first char of the line + if (!(c >= 'a' && c <= 'z')) + if (offset + pos - l_line > offset) + push(points, hd, &tl, size, NEWLINE_WITH_NON_LOWERCASE_SCORE - NEWLINE_SCORE , offset + pos - l_line); if (n_words == 0) l_first_word++; if (c >= 'a' && c <= 'z' && (last == ' ' || last == '\t' || last == '\n')) @@ -184,7 +199,8 @@ int split(int size) { assert(pos <= size); /* cut if we have to */ - if (c == EOF || pos == size || (hd != tl && points[hd].confidence > MIN_CONFIDENCE)) { + if (c == EOF || pos == size || + (hd != tl && min_confidence > 0 && points[hd].confidence > min_confidence)) { // pop old entries while (hd != tl && points[hd].position <= offset) hd = (hd + 1) % size; @@ -196,9 +212,9 @@ int split(int size) { assert(points[hd].position > offset); if (c == EOF) { // special EOF cut - push(points, hd, &tl, size, EOF_CONFIDENCE, offset + pos - 1); + points[hd].confidence = EOF_SCORE; + points[hd].position = offset + pos - 1; } - // TODO wtf '%' ? //printf("== %d %d\n", offset, pos); printf("-- %d %ld %f\n", nchunk, points[hd].position - offset, points[hd].confidence); @@ -270,8 +286,13 @@ int tests() { int main(int argc, char **argv) { int size; - if (argc != 2) usage(argv); + if (argc < 2 || argc > 3) usage(argv); size = atoi(argv[1]); + if (argc == 3) { + min_confidence = atoi(argv[2]); + if (!min_confidence) usage(argv); + } + if (size <= 0) usage(argv); //return tests();