commit 42f3053e824f20b198a2bd4e0485f347b4f60949
parent 337d2eec0e332936ebd4e8317dd8495f8fb175d4
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Wed, 7 Sep 2011 22:37:29 +0200
refactored, seems to work
Diffstat:
 nlsplit.c | 47 ++++++++++++++++++++++++++++++++++-------------
1 file changed, 34 insertions(+), 13 deletions(-)
diff --git a/nlsplit.c b/nlsplit.c
@@ -6,17 +6,22 @@
* maximal size, with simple sensible heuristics. */
/* Above this confidence value, we always split */
-/* TODO make it a CLI arg */
-#define MIN_CONFIDENCE 99999
+float min_confidence = -1;
#define TAB_LEN 2
-#define EOF_CONFIDENCE 999
+
+#define EOF_SCORE 999999.
+#define MULTIPLE_LINE_SCORE 1000.
+#define LONG_LINES_THRESHOLD 100
+#define DELTA_THRESHOLD 10
+#define INDENT_CHANGE_SCORE 18
#define MAX_PUNCT_SITUATION_SCORE 3
-#define LINE_VALUE 100
-#define WHITESPACE_SCORE 1
#define FINAL_PUNCT_SCORE 20
#define SEMIFINAL_PUNCT_SCORE 18
#define OTHER_PUNCT_SCORE 15
+#define NEWLINE_WITH_NON_LOWERCASE_SCORE 3
+#define NEWLINE_SCORE 2
+#define WHITESPACE_SCORE 1
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
@@ -125,8 +130,13 @@ int split(int size) {
n_newlines = 1;
// TODO more value to single lines when max_line_len is >> 80
//current += LINE_VALUE * n_newlines;
- if (n_newlines > 1)
- current += LINE_VALUE * n_newlines;
+ if (n_newlines > 1) {
+ current += MULTIPLE_LINE_SCORE * n_newlines;
+ } else {
+ if (max_l_line > LONG_LINES_THRESHOLD)
+ current += (max_l_line - LONG_LINES_THRESHOLD);
+ current += NEWLINE_SCORE;
+ }
if (last_punct) whitespace_after_punct++;
} else {
if (c == ' ' || c == '\t') {
@@ -138,7 +148,7 @@ int split(int size) {
int delta = max_l_line - l_last_line - l_first_word - 1;
//printf("maxlline %d llastline %d lfirstword %d lline %d offset %d pos %d delta %d\n",
// max_l_line, l_last_line, l_first_word, l_line, offset, pos, delta);
- if (delta > 0) {
+ if (delta > DELTA_THRESHOLD) {
// first word of current line would fit on previous line
if (offset + pos - l_line - 1 > offset)
push(points, hd, &tl, size, delta, offset + pos - l_line - 1);
@@ -163,8 +173,13 @@ int split(int size) {
// TODO if first word fits
if (indent != last_indent)
if (offset + pos - l_line > offset)
- push(points, hd, &tl, size, abs(indent - last_indent), offset + pos - l_line);
+ push(points, hd, &tl, size, INDENT_CHANGE_SCORE + abs(indent - last_indent), offset + pos - l_line);
n_newlines = 0;
+ if (!n_words && !l_first_word)
+ // first char of the line
+ if (!(c >= 'a' && c <= 'z'))
+ if (offset + pos - l_line > offset)
+ push(points, hd, &tl, size, NEWLINE_WITH_NON_LOWERCASE_SCORE - NEWLINE_SCORE , offset + pos - l_line);
if (n_words == 0)
l_first_word++;
if (c >= 'a' && c <= 'z' && (last == ' ' || last == '\t' || last == '\n'))
@@ -184,7 +199,8 @@ int split(int size) {
assert(pos <= size);
/* cut if we have to */
- if (c == EOF || pos == size || (hd != tl && points[hd].confidence > MIN_CONFIDENCE)) {
+ if (c == EOF || pos == size ||
+ (hd != tl && min_confidence > 0 && points[hd].confidence > min_confidence)) {
// pop old entries
while (hd != tl && points[hd].position <= offset)
hd = (hd + 1) % size;
@@ -196,9 +212,9 @@ int split(int size) {
assert(points[hd].position > offset);
if (c == EOF) {
// special EOF cut
- push(points, hd, &tl, size, EOF_CONFIDENCE, offset + pos - 1);
+ points[hd].confidence = EOF_SCORE;
+ points[hd].position = offset + pos - 1;
}
- // TODO wtf '%' ?
//printf("== %d %d\n", offset, pos);
printf("-- %d %ld %f\n",
nchunk, points[hd].position - offset, points[hd].confidence);
@@ -270,8 +286,13 @@ int tests() {
int main(int argc, char **argv) {
int size;
- if (argc != 2) usage(argv);
+ if (argc < 2 || argc > 3) usage(argv);
size = atoi(argv[1]);
+ if (argc == 3) {
+ min_confidence = atoi(argv[2]);
+ if (!min_confidence) usage(argv);
+ }
+
if (size <= 0) usage(argv);
//return tests();