commit bc6dbd4791c68e9460556665759b210a944c9401
parent a17b7c76a1711b656324f9839b757d9c5db030d8
Author: Antoine Amarilli <a3nm@a3nm.net>
Date: Wed, 7 Sep 2011 23:59:13 +0200
add NON_ALPHA_SCORE
Diffstat:
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/nlsplit.c b/nlsplit.c
@@ -41,6 +41,8 @@ int size;
#define NEWLINE_SCORE 2
/* whitespace is a last-resort split */
#define WHITESPACE_SCORE 1
+/* any non-alphabetic character is a split candidate scored below whitespace */
+#define NON_ALPHA_SCORE 0.5
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
@@ -118,7 +120,6 @@ int push(point *points, int hd, int *tl, float confidence, long position, long o
return (*tl = push2(points, hd, *tl, confidence + old, position));
}
-
int split() {
/* perform the split */
@@ -237,6 +238,8 @@ int split() {
word_has_uppercase = 0;
if (c >= 'A' && c <= 'Z')
word_has_uppercase = 1;
+ if (!(c >= 'A' && c <= 'Z') && !(c >= 'a' && c <= 'z'))
+ current += NON_ALPHA_SCORE;
// TODO more punct
if (c == '!' || c == '.' || c == ',' || c == ';' || c == '?' || c == '(' || c == ')')
last_punct = c;